import torch
from PIL import Image
import requests
import openai
from transformers import (Owlv2Processor, Owlv2ForObjectDetection,
                          AutoProcessor, AutoModelForMaskGeneration,
                          BlipProcessor, BlipForConditionalGeneration)
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import base64
import io
import numpy as np
import gradio as gr
import json
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
openai.api_key = OPENAI_API_KEY  # module-level key setup: this targets the legacy (pre-1.0) OpenAI SDK

def generate_image_caption(image):
    """Generate a short caption for the image with BLIP."""
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    processor = BlipProcessor.from_pretrained('Salesforce/blip-image-captioning-base')
    model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base').to(device)
    inputs = processor(image, return_tensors='pt').to(device)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption

def analyze_caption(caption):
    """Ask GPT-4 whether the caption describes a surprising image."""
    messages = [
        {
            "role": "user",
            "content": f"""Your task is to determine if the following image description is surprising or not surprising.
Description: "{caption}"
If the description is surprising, determine which element, figure, or object makes it surprising and write it in one sentence of no more than 6 words; otherwise, write 'NA'.
Also, rate how surprising the image is on a scale of 1-5, where 1 is not surprising at all and 5 is highly surprising.
Provide the response as JSON with the following structure:
{{
    "label": "[surprising OR not surprising]",
    "element": "[element]",
    "rating": [1-5]
}}
"""
        }
    ]
    # Note: this call uses the legacy (pre-1.0) OpenAI SDK interface.
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        max_tokens=100,
        temperature=0.1
    )
    return response.choices[0].message.content

# The rest of the helper functions (process_image_detection, show_mask, etc.)
# are elided in this excerpt; a hypothetical sketch follows.
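
# The sketch below is a minimal, hypothetical reconstruction of those elided
# helpers, not the Space's exact code: it localizes the surprising element
# with OWLv2, segments it with SAM, and renders the result with matplotlib.
# The checkpoint names ('google/owlv2-base-patch16-ensemble',
# 'facebook/sam-vit-base') and the 0.1 detection threshold are assumptions.

def show_mask(mask, ax, color=(30 / 255, 144 / 255, 255 / 255, 0.6)):
    """Overlay a binary mask on a matplotlib axis as a translucent color layer."""
    h, w = mask.shape[-2:]
    mask_image = mask.reshape(h, w, 1) * np.array(color).reshape(1, 1, -1)
    ax.imshow(mask_image)


def process_image_detection(image, element, rating):
    """Detect `element` in `image`, segment it, and return a PNG buffer."""
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Open-vocabulary detection: find boxes matching the surprising element.
    owl_processor = Owlv2Processor.from_pretrained('google/owlv2-base-patch16-ensemble')
    owl_model = Owlv2ForObjectDetection.from_pretrained('google/owlv2-base-patch16-ensemble').to(device)
    inputs = owl_processor(text=[[element]], images=image, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = owl_model(**inputs)
    target_sizes = torch.tensor([image.size[::-1]]).to(device)
    results = owl_processor.post_process_object_detection(
        outputs, threshold=0.1, target_sizes=target_sizes)[0]
    if len(results['boxes']) == 0:
        raise ValueError(f"Could not locate '{element}' in the image")
    best = torch.argmax(results['scores']).item()
    box = results['boxes'][best].tolist()  # [x0, y0, x1, y1]

    # Promptable segmentation: refine the box into a pixel mask with SAM.
    sam_processor = AutoProcessor.from_pretrained('facebook/sam-vit-base')
    sam_model = AutoModelForMaskGeneration.from_pretrained('facebook/sam-vit-base').to(device)
    sam_inputs = sam_processor(image, input_boxes=[[box]], return_tensors='pt').to(device)
    with torch.no_grad():
        sam_outputs = sam_model(**sam_inputs)
    masks = sam_processor.image_processor.post_process_masks(
        sam_outputs.pred_masks.cpu(),
        sam_inputs['original_sizes'].cpu(),
        sam_inputs['reshaped_input_sizes'].cpu())
    mask = masks[0][0][0].numpy()

    # Render box, mask, and rating, and return the figure as a PNG buffer.
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(image)
    show_mask(mask, ax)
    ax.add_patch(patches.Rectangle((box[0], box[1]), box[2] - box[0], box[3] - box[1],
                                   linewidth=2, edgecolor='red', facecolor='none'))
    ax.text(box[0], box[1] - 10, f"{element} (surprise: {rating}/5)", color='red')
    ax.axis('off')
    buf = io.BytesIO()
    fig.savefig(buf, format='png', bbox_inches='tight')
    plt.close(fig)
    buf.seek(0)
    return buf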

def process_and_analyze(image):
    if image is None:
        return None, "Please upload an image first."
    if OPENAI_API_KEY is None:
        return None, "OpenAI API key not found in environment variables."

    try:
        # Handle different input types
        if isinstance(image, tuple):
            image = image[0]  # Take the first element if it's a tuple
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        if not isinstance(image, Image.Image):
            raise ValueError("Invalid image format")

        # Generate caption
        caption = generate_image_caption(image)

        # Analyze caption
        gpt_response = analyze_caption(caption)
        response_data = json.loads(gpt_response)

        if response_data["label"].lower() == "surprising" and response_data["element"].lower() != "na":
            result_buf = process_image_detection(image, response_data["element"], response_data["rating"])
            result_image = Image.open(result_buf)
            analysis_text = (f"Label: {response_data['label']}\n"
                             f"Element: {response_data['element']}\n"
                             f"Rating: {response_data['rating']}/5")
            return result_image, analysis_text
        else:
            return image, "Not Surprising"
    except Exception as e:
        return None, f"Error processing image: {str(e)}"

# The Gradio interface builder is likewise elided; a hypothetical sketch follows.
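
# Minimal hypothetical reconstruction of create_interface(), assuming a simple
# Blocks layout; the original Space's labels and layout may differ.
def create_interface():
    with gr.Blocks() as demo:
        gr.Markdown("# Surprising Image Detector")
        with gr.Row():
            input_image = gr.Image(label="Upload an image")
            output_image = gr.Image(label="Detection result")
        analysis_text = gr.Textbox(label="Analysis")
        analyze_btn = gr.Button("Analyze")
        analyze_btn.click(fn=process_and_analyze,
                          inputs=input_image,
                          outputs=[output_image, analysis_text])
    return demo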

if __name__ == "__main__":
    demo = create_interface()
    demo.launch()