Spaces:

iamrobotbear
/

blip-vqa-gradio

Paused

File size: 3,453 Bytes

6ded388
5053a56
f69bea2
 
3bc78d3
fcd98ee
 
 
54ddd45
bd67535
24a0d89
 
 
 
 
 
 
 
 
 
 
 
 
6fa10d1
f69bea2
 
 
cadcb55
 
f69bea2
 
 
 
6ded388
f69bea2
f260439
f69bea2
 
 
 
 
f260439
f69bea2
 
 
 
 
f260439
f69bea2
 
 
 
6ded388
87b83c0
 
 
 
 
 
 
 
 
 
 
6ded388
f69bea2
 
 
864ff4a
 
f69bea2
 
 
 
 
864ff4a

import gradio as gr
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch
from PIL import Image

# Checkpoint shared by the processor and the model so the two cannot drift apart.
MODEL_ID = "Salesforce/blip2-opt-2.7b"

# Pick the inference device once; inputs are moved to this device before generation.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the BLIP-2 processor (image preprocessing + tokenization)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# On a GPU, load the weights in 8-bit to reduce memory use; on CPU, fall back
# to a plain full-precision load.
if device.type == 'cuda':
    # The previous map `{0: 'cpu', 1: 'cpu'}` used integer keys (device_map keys
    # are module names) and pinned everything to the CPU, which contradicts the
    # 8-bit GPU load and the `.to(device)` applied to the inputs. "auto" lets
    # accelerate place the modules on the available GPU(s).
    # NOTE(review): newer transformers releases expose the offload flag as
    # BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True) -- confirm
    # against the version pinned for this Space.
    model = Blip2ForConditionalGeneration.from_pretrained(
        MODEL_ID,
        load_in_8bit=True,
        device_map="auto",
        load_in_8bit_fp32_cpu_offload=True,
    )
else:
    model = Blip2ForConditionalGeneration.from_pretrained(MODEL_ID)


def _generate_text(image, prompt=None, max_new_tokens=20):
    """Run one BLIP-2 generation pass and return the decoded, stripped text.

    Args:
        image: PIL image to condition the generation on.
        prompt: Optional text prompt; ``None`` means unprompted captioning.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first decoded sequence with special tokens removed and
        surrounding whitespace stripped.
    """
    encoded = processor(image, text=prompt, return_tensors="pt")
    # Move the inputs to the inference device and cast the floating-point
    # tensors (the pixel values) to the model's parameter dtype, so a
    # half-precision model is not fed float32 inputs.
    encoded = encoded.to(device, model.dtype)
    generated_ids = model.generate(**encoded, max_new_tokens=max_new_tokens)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()


def blip2_interface(image, prompted_caption_text, vqa_question, chat_context):
    """Run the four BLIP-2 tasks on one image for the Gradio UI.

    Args:
        image: Input image as a numpy array (as delivered by the image
            component configured with ``type="numpy"``).
        prompted_caption_text: Text prefix used for prompted captioning.
        vqa_question: Question inserted into a "Question: ... Answer:" prompt.
        chat_context: Free-form context; " Answer:" is appended to it.

    Returns:
        A 4-tuple ``(image_caption, prompted_caption, vqa_answer,
        chat_response)`` of strings.

    Raises:
        ValueError: If no image was provided.
    """
    if image is None:
        # Fail with a clear message instead of an opaque error from PIL.
        raise ValueError("An input image is required.")

    # Prepare image input
    image_input = Image.fromarray(image).convert('RGB')

    # Image Captioning (no text prompt)
    image_caption = _generate_text(image_input, max_new_tokens=20)

    # Prompted Image Captioning
    prompted_caption = _generate_text(
        image_input, prompt=prompted_caption_text, max_new_tokens=20
    )

    # Visual Question Answering (VQA)
    vqa_answer = _generate_text(
        image_input,
        prompt=f"Question: {vqa_question} Answer:",
        max_new_tokens=10,
    )

    # Chat-based Prompting
    chat_response = _generate_text(
        image_input, prompt=chat_context + " Answer:", max_new_tokens=10
    )

    return image_caption, prompted_caption, vqa_answer, chat_response

# Define Gradio input components.
# The top-level component classes replace the deprecated `gr.inputs.*` /
# `gr.outputs.*` namespaces.
image_input = gr.Image(type="numpy", label="Image Input")
prompted_caption_input = gr.Textbox(label="Prompted Caption Text")
vqa_question_input = gr.Textbox(label="VQA Question")
chat_context = gr.Textbox(label="Chat Context")

# Define Gradio output components with labels corresponding to the inputs
image_caption_result = gr.Textbox(label="Image Caption")
prompted_caption_result = gr.Textbox(label="Prompted Image Caption")
vqa_answer = gr.Textbox(label="VQA Answer")
chat_response = gr.Textbox(label="Chat Response")

# Assemble the web UI. The four inputs are passed to blip2_interface in this
# order, and its four return values fill the four outputs in the same order.
interface_inputs = [
    image_input,
    prompted_caption_input,
    vqa_question_input,
    chat_context,
]
interface_outputs = [
    image_caption_result,
    prompted_caption_result,
    vqa_answer,
    chat_response,
]
iface = gr.Interface(
    fn=blip2_interface,
    inputs=interface_inputs,
    outputs=interface_outputs,
    title="BLIP-2 Image Captioning and VQA",
    description="Interact with the BLIP-2 model for image captioning, prompted image captioning, visual question answering, and chat-based prompting.",
)

# Start the server only when executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()