import gradio as gr
import spaces
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import numpy as np
import traceback
from typing import Any, Optional

import utils
from utils import BoundingBox
import blurnonymize
MODEL_NAME = "cborg/qwen2.5VL-3b-privacydetector"
MAX_NEW_TOKENS = 2048
TEMPERATURE = 1.0
MIN_P = 0.1
SYSTEM_PROMPT = """You are a helpful assistant for privacy analysis of images. Please always answer in English. Please obey the user's instructions and follow the provided format."""
DEFAULT_PROMPT = """
You are an expert at pixel-perfect image analysis and in privacy.
First write down your thoughts within a <think> block.
Please go through all objects in the image and consider whether they are private data or not.
End this with a </think> block.
After going through everything, output your findings in an <output></output> block as a JSON list with the following keys:
{"label": <|object_ref_start|>str<|object_ref_end|>, "description": str, "explanation": str, "bounding_box": <|box_start|>[x_min, y_min, x_max, y_max]<|box_end|>, "severity": int}
Some things to remember:
- private data is only data that's linked to a human person, common examples being a person's face, name, address, or license plate
- whenever something can be used to identify a unique human person, it is private data
- report sensitive data as well, such as a nude person
- Severity is a number between 0 and 10, with 0 being not private data and 10 being extremely sensitive private data.
- don't report items which don't contain private data in the final output; you may mention them in your thoughts
- animals and animal faces are not personal data, so a giraffe or a dog is not private data
- you can use whatever format you want within the <think> </think> blocks
- only output valid JSON in between the <output> </output> blocks, adhering to the schema provided
- output the bounding box always as an array of the form [x_min, y_min, x_max, y_max]
- private data has a severity greater than 0, so a human face would have severity 6
- go through the image step by step and report the private data; it's better to be a bit too sensitive than to miss anything
- put the bounding boxes around the human's face and not the entire person when reporting people as personal data
- Think step by step, take your time.
Here is the image to analyse, start your analysis directly after:
"""
def build_messages(image, history: Optional[list[dict[str, Any]]] = None, prompt: Optional[str] = None):
    if not prompt:
        prompt = DEFAULT_PROMPT
    if history:
        return [
            *history,
            {"role": "user", "content": [{"type": "text", "text": prompt}]},
        ]
    return [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": SYSTEM_PROMPT,
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image", "image": image},
            ],
        },
    ]
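# For reference, a fresh call such as build_messages(image, prompt=DEFAULT_PROMPT) yields a
# two-message list (system + user, with the user message carrying both the prompt text and the
# PIL image); this is the chat structure consumed by apply_chat_template and
# process_vision_info in run_model_inference below.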
# --- Model Loading ---
# Load the fine-tuned Qwen2.5-VL model and its processor in bfloat16 on the GPU.
try:
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    ).to("cuda").eval()
    tokenizer = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    print(traceback.format_exc())
    # Raise so the app does not launch without a working model.
    raise gr.Error(f"Failed to load model {MODEL_NAME}. Check logs. Error: {e}")
# --- Blurnonymizer Instance ---
try:
    blurnonymizer_instance = blurnonymize.ImageBlurnonymizer()
    print("Blurnonymizer initialized successfully.")
except Exception as e:
    print(f"Error initializing Blurnonymizer: {e}")
    print(traceback.format_exc())
    raise gr.Error(f"Failed to initialize Blurnonymizer. Check logs. Error: {e}")
# --- Core Processing Function ---
# Decorated with spaces.GPU so that the SAM segmentation runs on the GPU.
@spaces.GPU
def anonymise_image(input_image_np: np.ndarray, boxes: list[BoundingBox]):
    """Calls the blurnonymizer instance to censor the image."""
    if not blurnonymizer_instance:
        raise gr.Error("Blurnonymizer not initialized.")
    return blurnonymizer_instance.censor_image_blur_easy(
        input_image_np, boxes, method="segmentation", verbose=False  # Set verbose as needed
    )
def run_model_inference(input_image_pil: Image.Image, prompt_text: str):
    """
    Runs model inference on the input image and prompt.
    """
    print("Running model inference...")
    messages = build_messages(input_image_pil, prompt=prompt_text)
    input_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = tokenizer(
        text=[input_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")
    out_tokens = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        use_cache=True,
        temperature=TEMPERATURE,  # temperature/min_p only take effect when sampling is enabled in the generation config
        min_p=MIN_P,
    )
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, out_tokens)
    ]
    raw_model_output = tokenizer.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )[0]
    # The processor resizes images to multiples of the 14px patch size; these are the
    # dimensions the model actually saw.
    input_height = inputs["image_grid_thw"][0][1] * 14
    input_width = inputs["image_grid_thw"][0][2] * 14
    if input_height != input_image_pil.height:
        print("[!] tokenized image height differs from actual height:")
        print(f"Actual: {input_image_pil.height}, processed: {input_height}")
    if input_width != input_image_pil.width:
        print("[!] tokenized image width differs from actual width:")
        print(f"Actual: {input_image_pil.width}, processed: {input_width}")
    print("[+] Model inference completed.")
    print("[*] Raw output:")
    print(raw_model_output)
    return raw_model_output, input_height, input_width
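# Note (assumption, not part of the original pipeline): returning input_height/input_width above
# suggests the model's bounding boxes live in the coordinate space of the processor-resized image
# (image_grid_thw * 14), which can differ slightly from the uploaded image. If the downstream
# utils do not already rescale, a minimal sketch of the mapping back to original-image pixels
# could look like this (rescale_box is a hypothetical helper, not used by this app):
def rescale_box(box: list[float], processed_w: int, processed_h: int,
                original_w: int, original_h: int) -> list[float]:
    """Map [x_min, y_min, x_max, y_max] from processed-image pixels to original-image pixels."""
    sx = original_w / processed_w
    sy = original_h / processed_h
    x_min, y_min, x_max, y_max = box
    return [x_min * sx, y_min * sy, x_max * sx, y_max * sy]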
# Request GPU for this function, allow up to 120 seconds.
@spaces.GPU(duration=120)
def analyze_image(input_image_pil: Image.Image, prompt_text: str):
""" | |
Analyzes the input image using the VLM, visualizes findings, and anonymizes. | |
""" | |
if input_image_pil is None: | |
raise gr.Error("Please upload an image.") | |
if not prompt_text: | |
raise gr.Error("Please provide a prompt.") | |
original_image_np = np.array(input_image_pil) | |
# 1. Run Model Inference | |
try: | |
raw_model_output, image_height, image_width = run_model_inference(input_image_pil, prompt_text) | |
except Exception as e: | |
print(f"Error during model inference: {e}") | |
print(traceback.format_exc()) | |
raise gr.Error(f"Model inference failed: {e}") | |
# 2. Parse Findings | |
try: | |
print("Parsing findings...") | |
# Use the provided utility functions | |
parsed_findings = utils.parse_into_models( | |
utils.parse_json_response(raw_model_output) | |
) | |
print(f"[+] Parsed {len(parsed_findings)} findings.") | |
if not parsed_findings: | |
print("[*] No findings were parsed from the model output.") | |
except Exception as e: | |
print(f"Error parsing model output: {e}") | |
print(traceback.format_exc()) | |
# Don't raise error here, allow visualization/anonymization steps to proceed if possible | |
# or return early with only original image if parsing is critical | |
gr.Warning( | |
f"Could not parse findings from model output: {e}. Visualization and anonymization might be incomplete." | |
) | |
# Fallback: visualize/anonymize based on empty findings list if needed | |
parsed_findings = [] # Ensure it's an empty list for downstream steps | |
# Initialize boxes_for_viz before the try block | |
boxes_for_viz = [] | |
try: | |
# 3. Visualize Findings | |
print("Visualizing findings...") | |
if parsed_findings: | |
# Convert Findings to BoundingBox for visualization function | |
boxes_for_viz = [BoundingBox.from_finding(f) for f in parsed_findings] | |
# Ensure image is in the correct format (np array) for visualize_boxes_annotated | |
visualized_image_np = utils.visualize_boxes_annotated( | |
original_image_np, boxes_for_viz | |
) | |
print("Visualization generated.") | |
else: | |
print("No findings to visualize, using original image.") | |
visualized_image_np = ( | |
original_image_np.copy() | |
) # Show original if no findings | |
except Exception as e: | |
print(f"Error during visualization: {e}") | |
print(traceback.format_exc()) | |
gr.Warning(f"Failed to visualize findings: {e}") | |
visualized_image_np = original_image_np.copy() # Fallback to original | |
try: | |
# 4. Anonymize Image | |
print("Anonymizing image...") | |
# Use the blurnonymize function with the raw output (as it might contain info needed by the func) | |
# Ensure image is numpy array | |
# Check if boxes_for_viz is populated before calling anonymise_image | |
if boxes_for_viz: | |
anonymized_image_np = anonymise_image(original_image_np, boxes_for_viz) | |
print("Anonymization generated.") | |
else: | |
print("No boxes found for anonymization, using original image.") | |
anonymized_image_np = original_image_np.copy() | |
except Exception as e: | |
print(f"Error during anonymization: {e}") | |
print(traceback.format_exc()) | |
gr.Warning(f"Failed to anonymize image: {e}") | |
anonymized_image_np = original_image_np.copy() # Fallback to original | |
# Convert numpy arrays back to PIL Images for Gradio output if needed, or let Gradio handle numpy | |
# Gradio's gr.Image output can handle numpy arrays directly | |
# Return the three images | |
return raw_model_output, visualized_image_np, anonymized_image_np | |
# --- Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown("# Private Data Detection & Anonymization UI")
    gr.Markdown(f"Using model: `{MODEL_NAME}` on ZeroGPU.")

    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(type="pil", label="Upload Image")
            prompt_textbox = gr.Textbox(
                label="Analysis Prompt", value=DEFAULT_PROMPT, lines=4
            )
            analyze_button = gr.Button("Analyze Image")
        with gr.Column(scale=2):
            with gr.Column():
                raw_output = gr.Textbox(
                    label="Raw Model Output", interactive=False
                )
                output_visualized = gr.Image(
                    label="Detected Privacy Findings", type="numpy", interactive=False
                )
                output_anonymized = gr.Image(
                    label="Anonymized", type="numpy", interactive=False
                )

    analyze_button.click(
        fn=analyze_image,
        inputs=[input_image, prompt_textbox],
        outputs=[raw_output, output_visualized, output_anonymized],
    )
# --- Launch App ---
if __name__ == "__main__":
    demo.queue().launch(
        debug=True
    )  # Enable queue for handling multiple requests, debug mode for logs