Spaces: Running on Zero
hugohabicht01 committed
Commit 335bcd6
Parent(s): c8cd915
init
Files changed:
- app.py +297 -0
- blurnonymize.py +300 -0
- requirements.txt +12 -0
- utils.py +351 -0
app.py
ADDED
@@ -0,0 +1,297 @@
import gradio as gr
import spaces
from unsloth import FastVisionModel
import torch
from PIL import Image
import numpy as np
import traceback
from typing import Any, Optional

# Import user-provided modules
import utils
from utils import Finding, BoundingBox  # Explicitly import needed classes
import blurnonymize

# --- Constants ---
MODEL_NAME = "cborg/qwen2.5VL-3b-privacydetector"
MAX_NEW_TOKENS = 2048
TEMPERATURE = 1.0
MIN_P = 0.1
SYSTEM_PROMPT = """You are a helpful assistant for privacy analysis of images. Please always answer in English. Please obey the users instructions and follow the provided format."""
DEFAULT_PROMPT = """
You are an expert at pixel perfect image analysis and in privacy.
First write down your thoughts within a <think> block.
Please go through all objects in the image and consider whether they are private data or not.
End this with a </think> block.

After going through everything, output your findings in an <output></output> block as a json list with the following keys:
{"label": <|object_ref_start|>str<|object_ref_end|>, "description": str, "explanation": str, "bounding_box": <|box_start|>[x_min, y_min, x_max, y_max]<|box_end|>, "severity": int}

Some things to remember:

- private data is only data thats linked to a human person, common examples being a persons face, name, address, license plate
- whenever something can be used to identify a unique human person, it is private data
- report sensitive data as well, such as a nude person
- Severity is a number between 0 and 10, with 0 being not private data and 10 being extremely sensitive private data.
- don't report items which dont contain private data in the final output, you may mention them in your thoughts
- animals and animal faces are not personal data, so a giraffe or a dog is not private data
- you can use whatever format you want within the <think> </think> blocks
- only output valid JSON in between the <output> </output> blocks, adhering to the schema provided
- output the bounding box always as an array of form [x_min, y_min, x_max, y_max]
- private data have a severity greater than 0, so a human face would have severity 6
- go through the image step by step and report the private data, its better to be a bit too sensitive than to miss anything
- put the bounding boxes around the human's face and not the entire person when reporting people as personal data
- Think step by step, take your time.

Here is the image to analyse, start your analysis directly after:
"""


def build_messages(image, history: Optional[list[dict[str, Any]]] = None, prompt: Optional[str] = None):
    if not prompt:
        prompt = DEFAULT_PROMPT

    if history:
        return [
            *history,
            {"role": "user", "content": [{"type": "text", "text": prompt}]},
        ]

    return [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": SYSTEM_PROMPT,
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image", "image": image},
            ],
        },
    ]


# --- Model Loading ---
# Load the model with unsloth, using 4-bit quantization
try:
    model, tokenizer = FastVisionModel.from_pretrained(
        model_name=MODEL_NAME,
        load_in_4bit=True,
    )
    FastVisionModel.for_inference(model)
    model.to("cuda").eval()  # Ensure the model is on GPU and in eval mode
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    print(traceback.format_exc())
    # Raise to prevent the app from launching if the model fails to load
    raise gr.Error(f"Failed to load model {MODEL_NAME}. Check logs. Error: {e}")


# --- Blurnonymizer Instance ---
try:
    blurnonymizer_instance = blurnonymize.ImageBlurnonymizer()
    print("Blurnonymizer initialized successfully.")
except Exception as e:
    print(f"Error initializing Blurnonymizer: {e}")
    print(traceback.format_exc())
    raise gr.Error(f"Failed to initialize Blurnonymizer. Check logs. Error: {e}")


# --- Core Processing Function ---
@spaces.GPU(duration=20)  # needed so that the SAM segmentation runs on the GPU
def anonymise_image(input_image_np: np.ndarray, boxes: list[BoundingBox]):
    """Calls the blurnonymizer instance to censor the image."""
    if not blurnonymizer_instance:
        raise gr.Error("Blurnonymizer not initialized.")
    return blurnonymizer_instance.censor_image_blur_easy(
        input_image_np, boxes, method="segmentation", verbose=False  # Set verbose as needed
    )


def run_model_inference(input_image_pil: Image.Image, prompt_text: str):
    """
    Runs model inference on the input image and prompt.
    """
    print("Running model inference...")
    messages = build_messages(
        input_image_pil,
        prompt=prompt_text)
    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

    # Prepare inputs for the model
    inputs = tokenizer(
        input_image_pil,
        input_text,
        return_tensors="pt",
    ).to("cuda")

    out_tokens = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        use_cache=True,
        temperature=TEMPERATURE,
        min_p=MIN_P,
    )
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, out_tokens)
    ]
    raw_model_output = tokenizer.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )[0]

    # The vision tokenizer works on a 14-pixel patch grid; recover the processed resolution
    input_height = inputs['image_grid_thw'][0][1] * 14
    input_width = inputs['image_grid_thw'][0][2] * 14

    if input_height != input_image_pil.height:
        print("[!] tokenized image height differs from actual height:")
        print(f"Actual: {input_image_pil.height}, processed: {input_height}")

    if input_width != input_image_pil.width:
        print("[!] tokenized image width differs from actual width:")
        print(f"Actual: {input_image_pil.width}, processed: {input_width}")

    print("[+] Model inference completed.")
    print("[*] Raw output:")
    print(raw_model_output)

    return raw_model_output, input_height, input_width


@spaces.GPU(duration=90)  # Request a GPU for this function for up to 90 seconds
def analyze_image(input_image_pil: Image.Image, prompt_text: str):
    """
    Analyzes the input image using the VLM, visualizes findings, and anonymizes.
    """
    if input_image_pil is None:
        raise gr.Error("Please upload an image.")
    if not prompt_text:
        raise gr.Error("Please provide a prompt.")

    original_image_np = np.array(input_image_pil)

    # 1. Run Model Inference
    try:
        raw_model_output, image_height, image_width = run_model_inference(input_image_pil, prompt_text)
    except Exception as e:
        print(f"Error during model inference: {e}")
        print(traceback.format_exc())
        raise gr.Error(f"Model inference failed: {e}")

    # 2. Parse Findings
    try:
        print("Parsing findings...")
        # Use the provided utility functions
        parsed_findings = utils.parse_into_models(
            utils.parse_json_response(raw_model_output)
        )
        print(f"[+] Parsed {len(parsed_findings)} findings.")
        if not parsed_findings:
            print("[*] No findings were parsed from the model output.")

    except Exception as e:
        print(f"Error parsing model output: {e}")
        print(traceback.format_exc())
        # Don't raise here; allow the visualization/anonymization steps to proceed if possible
        gr.Warning(
            f"Could not parse findings from model output: {e}. Visualization and anonymization might be incomplete."
        )
        # Fallback: continue with an empty findings list for the downstream steps
        parsed_findings = []

    # Initialize boxes_for_viz before the try block
    boxes_for_viz = []
    try:
        # 3. Visualize Findings
        print("Visualizing findings...")
        if parsed_findings:
            # Convert Findings to BoundingBox objects for the visualization function
            boxes_for_viz = [BoundingBox.from_finding(f) for f in parsed_findings]
            # visualize_boxes_annotated expects a numpy array
            visualized_image_np = utils.visualize_boxes_annotated(
                original_image_np, boxes_for_viz
            )
            print("Visualization generated.")
        else:
            print("No findings to visualize, using original image.")
            visualized_image_np = (
                original_image_np.copy()
            )  # Show original if no findings

    except Exception as e:
        print(f"Error during visualization: {e}")
        print(traceback.format_exc())
        gr.Warning(f"Failed to visualize findings: {e}")
        visualized_image_np = original_image_np.copy()  # Fallback to original

    try:
        # 4. Anonymize Image
        print("Anonymizing image...")
        # Only call anonymise_image when boxes_for_viz is populated
        if boxes_for_viz:
            anonymized_image_np = anonymise_image(original_image_np, boxes_for_viz)
            print("Anonymization generated.")
        else:
            print("No boxes found for anonymization, using original image.")
            anonymized_image_np = original_image_np.copy()

    except Exception as e:
        print(f"Error during anonymization: {e}")
        print(traceback.format_exc())
        gr.Warning(f"Failed to anonymize image: {e}")
        anonymized_image_np = original_image_np.copy()  # Fallback to original

    # Gradio's gr.Image output handles numpy arrays directly, so no conversion back to PIL is needed.

    # Return the raw output and the two result images
    return raw_model_output, visualized_image_np, anonymized_image_np


# --- Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown("# Private Data Detection & Anonymization UI")
    gr.Markdown(f"Using model: `{MODEL_NAME}` on ZeroGPU.")

    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(type="pil", label="Upload Image")
            prompt_textbox = gr.Textbox(
                label="Analysis Prompt", value=DEFAULT_PROMPT, lines=4
            )
            analyze_button = gr.Button("Analyze Image")
        with gr.Column(scale=2):
            with gr.Column():
                raw_output = gr.Textbox(
                    label="Raw Model Output", interactive=False
                )
                output_visualized = gr.Image(
                    label="Detected Privacy Findings", type="numpy", interactive=False
                )
                output_anonymized = gr.Image(
                    label="Anonymized", type="numpy", interactive=False
                )

    analyze_button.click(
        fn=analyze_image,
        inputs=[input_image, prompt_textbox],
        outputs=[raw_output, output_visualized, output_anonymized],
    )

# --- Launch App ---
if __name__ == "__main__":
    demo.queue().launch(
        debug=True
    )  # Enable the queue for handling multiple requests, debug mode for logs
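
Note on the expected reply format: the parsing step in analyze_image relies on the model following DEFAULT_PROMPT, i.e. free-form reasoning inside <think>...</think> followed by a JSON list inside <output>...</output> that utils.parse_json_response and utils.parse_into_models can turn into Finding models. A minimal illustrative reply (the values below are made up for illustration, not real model output) would look like:

<think>
One person is visible; their face is identifiable, the rest of the scene is not personal data.
</think>
<output>
[{"label": "face", "description": "Face of the person in the foreground", "explanation": "A face can identify a unique person", "bounding_box": [120, 80, 210, 190], "severity": 6}]
</output>
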
blurnonymize.py
ADDED
@@ -0,0 +1,300 @@
import json
import traceback
from typing import Literal, Optional

import cv2
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import torch
from pydantic import BaseModel
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor
from utils import *


# --- Utility Functions (kept outside the class) ---

def blur_image(img: np.ndarray):
    """Applies Gaussian blur to an image."""
    return cv2.GaussianBlur(img, (35, 35), 50)


def plot_polygon_mask(image: np.ndarray, polygons: list[list[tuple[int, int]]]):
    """
    Plots polygon-based segmentation masks on top of an image.
    """
    plt.imshow(image)
    for polygon in polygons:
        if not polygon:
            continue  # Skip empty polygons
        polygon_array = np.array(polygon).reshape(-1, 2)
        x, y = zip(*polygon_array)
        x = list(x) + [x[0]]
        y = list(y) + [y[0]]
        plt.plot(x, y, '-r', linewidth=2)
    plt.axis('off')
    plt.tight_layout()
    plt.show()


def visualize_boxes(image, findings):
    """Visualizes bounding boxes on an image."""
    fig, ax = plt.subplots(1)
    ax.imshow(image)
    colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
    for i, finding in enumerate(findings):
        [x_min, y_min, x_max, y_max] = finding.bounding_box
        color = colors[i % len(colors)]
        rect = patches.Rectangle((x_min, y_min), x_max - x_min, y_max - y_min, linewidth=2, edgecolor=color,
                                 facecolor='none')
        ax.add_patch(rect)
        print(f"Finding {i + 1} (Color: {color}):")
    if not findings:
        print("No findings")
    plt.xticks(np.arange(0, image.shape[1], 50))
    plt.yticks(np.arange(0, image.shape[0], 50))
    plt.show()


# --- SAM Visualization Helpers (kept outside the class) ---

def show_mask(mask, ax, random_color=False, borders=True):
    """Displays a single mask on a matplotlib axis."""
    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
    else:
        color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
    h, w = mask.shape[-2:]
    mask = mask.astype(np.uint8)
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    if borders:
        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
        # contours = [cv2.approxPolyDP(contour, epsilon=0.01, closed=True) for contour in contours]  # Optional smoothing
        mask_image = cv2.drawContours(mask_image, contours, -1, (1, 1, 1, 0.5), thickness=2)
    ax.imshow(mask_image)


def show_points(coords, labels, ax, marker_size=375):
    """Displays points (positive/negative) on a matplotlib axis."""
    pos_points = coords[labels == 1]
    neg_points = coords[labels == 0]
    ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
    ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)


def show_box(box, ax):
    """Displays a bounding box on a matplotlib axis."""
    x0, y0 = box[0], box[1]
    w, h = box[2] - box[0], box[3] - box[1]
    ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0, 0, 0, 0), lw=2))


def show_masks(image, masks, scores, point_coords=None, box_coords=None, input_labels=None, borders=True):
    """Displays multiple masks resulting from SAM prediction."""
    for i, (mask, score) in enumerate(zip(masks, scores)):
        plt.figure(figsize=(10, 10))
        plt.imshow(image)
        show_mask(mask, plt.gca(), borders=borders)
        if point_coords is not None:
            assert input_labels is not None
            show_points(point_coords, input_labels, plt.gca())
        if box_coords is not None:
            show_box(box_coords, plt.gca())
        if len(scores) > 1:
            plt.title(f"Mask {i + 1}, Score: {score:.3f}", fontsize=18)
        plt.axis('off')
        plt.show()


# --- ImageBlurnonymizer Class ---

class ImageBlurnonymizer:
    def __init__(self, checkpoint="./sam2.1_hiera_large.pt", model_cfg="configs/sam2.1/sam2.1_hiera_l.yaml"):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.predictor = SAM2ImagePredictor(build_sam2(model_cfg, checkpoint, device=self.device))

    @staticmethod
    def _smoothen_mask(mask: np.ndarray):
        """Applies morphological closing to smoothen mask boundaries."""
        kernel = np.ones((20, 20), np.uint8)
        return cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)

    @staticmethod
    def _mask_from_bbox(image_shape, bbox: tuple[int, int, int, int]):
        """Creates a simple rectangular mask from a bounding box."""
        height, width, *_ = image_shape  # Allow for 2D or 3D shape tuple
        xmin, ymin, xmax, ymax = bbox
        mask = np.zeros((height, width), dtype=np.uint8)
        mask[ymin:ymax, xmin:xmax] = 1
        return mask

    @staticmethod
    def _apply_blur_mask(image: np.ndarray, mask: np.ndarray):
        """Applies a blur to an image based on a mask."""
        if mask.ndim == 2:  # Ensure the mask is 3-channel for broadcasting
            mask = np.stack((mask,) * image.shape[2], axis=-1)
        blurred = blur_image(image)  # Use the utility function
        return np.where(mask, blurred, image)

    @staticmethod
    def _binary_mask_to_polygon(binary_mask: np.ndarray, epsilon=2.0):
        """Converts a binary segmentation mask to polygon contours."""
        try:
            converted = (binary_mask * 255).astype(np.uint8)
            # Use RETR_EXTERNAL for outer contours only, CHAIN_APPROX_SIMPLE for efficiency
            contours, _ = cv2.findContours(converted, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            polygons = []
            for contour in contours:
                approx_contour = cv2.approxPolyDP(contour, epsilon, True)
                # Ensure points are converted correctly
                polygon = [(int(point[0][0]), int(point[0][1])) for point in approx_contour]
                polygons.append(polygon)
            return polygons
        except Exception as e:
            print(f"An error occurred during polygon conversion: {e}")
            print(traceback.format_exc())
            return None  # Return None on error

    def get_segmentation_mask(self, image: np.ndarray, bbox: tuple[int, int, int, int]):
        """
        Generates a segmentation mask for a region defined by a bounding box using SAM.

        Adds points within the bounding box to guide SAM towards the intended object (e.g., face)
        and away from surrounding elements (e.g., hair).
        """
        x_min, y_min, x_max, y_max = bbox
        x_width = x_max - x_min
        y_height = y_max - y_min

        # Handle cases where box dimensions are too small for the third-based point spread
        x_third = x_width // 3 if x_width >= 3 else 0
        y_third = y_height // 3 if y_height >= 3 else 0

        center_point = [(x_min + x_max) // 2, (y_min + y_max) // 2]

        # Define points ensuring they stay within the image boundaries
        points = [center_point]
        if y_third > 0:
            points.append([center_point[0], center_point[1] - y_third])
            points.append([center_point[0], center_point[1] + y_third])
        if x_third > 0:
            points.append([center_point[0] + x_third, center_point[1]])
            points.append([center_point[0] - x_third, center_point[1]])

        # Ensure points are valid coordinates (e.g., non-negative)
        points = [[max(0, p[0]), max(0, p[1])] for p in points]

        self.predictor.set_image(image)
        masks, scores, _ = self.predictor.predict(
            box=np.array(bbox),  # Predictor might expect numpy array
            point_coords=np.array(points),
            point_labels=np.ones(len(points)),  # Label 1 for inclusion
            multimask_output=True,
        )

        # Sort masks by score and select the best one
        sorted_ind = np.argsort(scores)[::-1]
        best_mask = masks[sorted_ind[0]]
        best_score = scores[sorted_ind[0]]

        return self._smoothen_mask(best_mask), best_score

    def censor_image_blur(self, image: np.ndarray, raw_out: str,
                          method: Optional[Literal['segmentation', 'bbox']] = 'segmentation', verbose=False):
        """
        Censors an image by blurring regions identified in the raw_out string (LLM output).
        """
        json_output = parse_json_response(raw_out)
        # Ensure json_output is a list before passing it to parse_into_models
        if isinstance(json_output, dict):
            findings_list = [json_output]
        elif isinstance(json_output, list):
            findings_list = json_output
        else:
            # Handle unexpected type or raise an error
            print(f"Warning: Unexpected output type from parse_json_response: {type(json_output)}")
            findings_list = []

        parsed = parse_into_models(findings_list)
        # Filter findings based on severity
        filtered = [entry for entry in parsed if entry.severity > 0]

        if verbose:
            visualize_boxes(image, filtered)  # Use external visualization

        masks = []
        for finding in filtered:
            bbox = finding.bounding_box
            if method == 'segmentation':
                mask, _ = self.get_segmentation_mask(image, bbox)
                if verbose:
                    polygons = self._binary_mask_to_polygon(mask)
                    if polygons:  # Check if polygon conversion was successful
                        plot_polygon_mask(image, polygons)
            elif method == 'bbox':
                mask = self._mask_from_bbox(image.shape, bbox)
            else:
                print(f"Warning: Unknown method '{method}'. Defaulting to no mask for this finding.")
                continue  # Skip if the method is invalid

            masks.append(mask)

        if masks:  # Check if any masks were generated
            # Combine masks: logical OR ensures any pixel in any mask is included
            combined_mask = np.zeros_like(masks[0], dtype=np.uint8)
            for mask in masks:
                # Ensure masks are boolean or uint8 for logical_or
                combined_mask = np.logical_or(combined_mask, mask.astype(bool)).astype(np.uint8)

            return self._apply_blur_mask(image, combined_mask)
        return image  # Return the original image if no masks

    def censor_image_blur_easy(self, image: np.ndarray, boxes: list[BoundingBox],
                               method: Optional[Literal['segmentation', 'bbox']] = 'segmentation', verbose=False):
        """
        Censors an image by blurring regions defined by a list of BoundingBox objects.
        """
        masks = []
        for box in boxes:
            bbox_tuple = box.to_tuple()  # Convert the BoundingBox object to a tuple
            if method == 'segmentation':
                mask, _ = self.get_segmentation_mask(image, bbox_tuple)
                if verbose:
                    polygons = self._binary_mask_to_polygon(mask)
                    if polygons:
                        plot_polygon_mask(image, polygons)
            elif method == 'bbox':
                mask = self._mask_from_bbox(image.shape, bbox_tuple)
            else:
                print(f"Warning: Unknown method '{method}'. Defaulting to no mask for this box.")
                continue

            masks.append(mask)

        if masks:
            combined_mask = np.zeros_like(masks[0], dtype=np.uint8)
            for mask in masks:
                combined_mask = np.logical_or(combined_mask, mask.astype(bool)).astype(np.uint8)

            return self._apply_blur_mask(image, combined_mask)
        return image


# Example Usage (Optional - keep outside class):
# if __name__ == '__main__':
#     # Load an image
#     # img = cv2.imread('path/to/your/image.jpg')
#     # img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert to RGB for matplotlib
#
#     # Create an instance of the blurnonymizer
#     # blurnonymizer = ImageBlurnonymizer()
#
#     # Define bounding boxes or get raw LLM output
#     # example_boxes = [BoundingBox(label="face", x_min=100, y_min=100, x_max=200, y_max=200)]
#     # llm_output = '...'  # Your raw LLM output string
#
#     # Censor the image
#     # censored_img_easy = blurnonymizer.censor_image_blur_easy(img, example_boxes, method='segmentation', verbose=True)
#     # censored_img_llm = blurnonymizer.censor_image_blur(img, llm_output, method='segmentation', verbose=True)
#
#     # Display or save the result
#     # plt.imshow(censored_img_easy)
#     # plt.show()
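
As a quick sketch of how the class above can be driven outside the Gradio app: the image path and box coordinates below are placeholders, and the SAM 2 checkpoint and config referenced by the default constructor arguments are assumed to be present locally.

import cv2
from utils import BoundingBox
from blurnonymize import ImageBlurnonymizer

# Placeholder input; any RGB uint8 numpy array works.
img = cv2.cvtColor(cv2.imread("example.jpg"), cv2.COLOR_BGR2RGB)

# Hypothetical region to censor, e.g. a face reported by the detector.
boxes = [BoundingBox(label="face", x_min=100, y_min=100, x_max=200, y_max=200)]

# Assumes ./sam2.1_hiera_large.pt and the sam2.1_hiera_l.yaml config are available.
blurnonymizer = ImageBlurnonymizer()

# method="bbox" blurs the raw rectangles; method="segmentation" runs SAM 2 for tighter masks.
censored = blurnonymizer.censor_image_blur_easy(img, boxes, method="bbox")
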
requirements.txt
ADDED
@@ -0,0 +1,12 @@
gradio
unsloth
transformers
torch
pydantic
numpy
pandas
Pillow
opencv-python
spaces
matplotlib
sam2
utils.py
ADDED
@@ -0,0 +1,351 @@
from pydantic import BaseModel, field_validator
import numpy as np
import json
import matplotlib.patches as patches
import matplotlib.pyplot as plt
from PIL import Image
import base64
from io import BytesIO
import io


def encode_image(image: np.ndarray) -> str:
    """Encodes a NumPy array image into a base64 JPEG string.

    Args:
        image: A NumPy array representing the image.

    Returns:
        A base64 encoded string prefixed with 'data:image/jpeg;base64,'.
    """
    pil_image = Image.fromarray(image)
    buffer = BytesIO()
    pil_image.save(buffer, format='jpeg')
    return f"data:image/jpeg;base64,{base64.b64encode(buffer.getvalue()).decode('utf-8')}"


def decode_image(base64_str: str) -> np.ndarray:
    """Decodes a base64 encoded image string into a NumPy array.

    Assumes the base64 string represents a valid image format (e.g., JPEG, PNG).

    Args:
        base64_str: The base64 encoded image string (may include prefix).

    Returns:
        A NumPy array representing the decoded image.
    """
    # Remove the prefix if it exists
    if ',' in base64_str:
        base64_str = base64_str.split(',', 1)[1]

    # Decode the base64 string
    image_data = base64.b64decode(base64_str)

    # Convert the image data to a PIL Image
    image = Image.open(io.BytesIO(image_data))

    # Convert the PIL Image to a NumPy array
    numpy_image = np.array(image)

    return numpy_image


class Finding(BaseModel):
    """Represents a detected finding in an image, including its label,
    description, explanation, bounding box coordinates, and severity level.
    """
    label: str
    description: str
    explanation: str
    bounding_box: tuple[int, int, int, int]
    severity: int

    @field_validator("bounding_box")
    @classmethod
    def validate_bounding_box(cls, value: tuple[int, int, int, int]):
        """Validates that the bounding box coordinates are logically consistent."""
        if len(value) != 4:
            raise ValueError("Bounding box must be a tuple of 4 integers")
        if value[0] >= value[2]:
            raise ValueError("Bounding box x_min (index 0) must be less than x_max (index 2)")
        if value[1] >= value[3]:
            raise ValueError("Bounding box y_min (index 1) must be less than y_max (index 3)")
        return value


class BoundingBox(BaseModel):
    """Represents a bounding box with a label and explicit min/max coordinates. Assumes that the top left corner is the origin."""
    label: str
    x_min: int
    y_min: int
    x_max: int
    y_max: int

    @staticmethod
    def from_finding(finding: Finding) -> 'BoundingBox':
        """Creates a BoundingBox instance from a Finding instance."""
        return BoundingBox(label=finding.label, x_min=finding.bounding_box[0], y_min=finding.bounding_box[1], x_max=finding.bounding_box[2], y_max=finding.bounding_box[3])

    @staticmethod
    def from_array(label: str, box: list[int]) -> 'BoundingBox':
        """Creates a BoundingBox instance from a label and a list of coordinates."""
        return BoundingBox(label=label, x_min=box[0], y_min=box[1], x_max=box[2], y_max=box[3])

    def to_tuple(self) -> tuple[int, int, int, int]:
        """Returns the box as an (x_min, y_min, x_max, y_max) tuple, as expected by blurnonymize.censor_image_blur_easy."""
        return (self.x_min, self.y_min, self.x_max, self.y_max)


def parse_json_response(out: str) -> list[dict]:
    """Extracts and parses JSON content from a string.

    Handles responses potentially wrapped in <output> tags or markdown code blocks (```json).

    Args:
        out: The input string potentially containing JSON.

    Returns:
        The parsed JSON object (list or dictionary).

    Raises:
        ValueError: If no valid JSON content is found.
    """
    start_prefix = "<output>"
    end_postfix = "</output>"
    start_index = out.find(start_prefix)
    end_index = out.rfind(end_postfix)

    if start_index == -1:
        # Try to load by finding ```json ``` markers instead
        start_index = out.rfind("```json")
        end_index = out.rfind("```")
        if start_index == -1 or end_index == -1:
            raise ValueError("No JSON found in response")
        start_index += len("```json")
        fixed = out[start_index:end_index]
        print(f"fixed: {fixed}")
        return json.loads(fixed)

    start_index += len(start_prefix)
    fixed = out[start_index:end_index]
    fixed = fixed.strip()
    if fixed.startswith("```json"):
        start_index = fixed.find("[")
        end_index = fixed.rfind("]")

        fixed = fixed[start_index:end_index + 1]
    return json.loads(fixed)


def parse_into_models(findings: list[dict]) -> list[Finding]:
    """Parses and validates a list of dictionaries into a list of Finding models.

    Args:
        findings: A list of dictionaries, each representing a finding.

    Returns:
        A list of validated Finding model instances.
    """
    parsed = []
    for box in findings:
        model_finding = Finding.model_validate(box)
        parsed.append(model_finding)
    return parsed


def parse_all_safe(out: str) -> list[Finding] | None:
    """Safely parses a string potentially containing JSON findings into Finding models.

    Combines `parse_json_response` and `parse_into_models`, returning None on any parsing error.

    Args:
        out: The input string.

    Returns:
        A list of Finding models if parsing is successful, otherwise None.
    """
    try:
        return parse_into_models(parse_json_response(out))
    except Exception:
        return None


def clamp(num: int | float, min_num: int | float = 0, max_num: int | float = 255) -> int | float:
    """Clamps a number within a specified range [min_num, max_num]."""
    return max(min_num, min(num, max_num))


def enlarge_boxes(image_shape: tuple[int, int], findings: list[Finding], factor: float = 1.1) -> list[Finding]:
    """Enlarges the bounding boxes of findings by a given factor, clamping to image boundaries.

    Args:
        image_shape: A tuple (height, width) representing the image dimensions.
        findings: A list of Finding objects.
        factor: The factor by which to enlarge the boxes (e.g., 1.1 for 10% larger).

    Returns:
        A new list of Finding objects with adjusted bounding boxes.
    """
    adjusted = []
    img_height, img_width = image_shape
    for box in findings:
        x_min_orig, y_min_orig, x_max_orig, y_max_orig = box.bounding_box
        x_width = x_max_orig - x_min_orig
        y_width = y_max_orig - y_min_orig

        # Calculate the amount to adjust on each side
        x_adjust = (x_width * (factor - 1)) / 2
        y_adjust = (y_width * (factor - 1)) / 2

        # Calculate new coordinates and clamp them
        x_min = clamp(x_min_orig - x_adjust, 0, img_width)
        y_min = clamp(y_min_orig - y_adjust, 0, img_height)
        x_max = clamp(x_max_orig + x_adjust, 0, img_width)
        y_max = clamp(y_max_orig + y_adjust, 0, img_height)

        # Ensure coordinates remain valid integers if they were originally
        adjusted_bbox = (int(round(x_min)), int(round(y_min)), int(round(x_max)), int(round(y_max)))

        # Validate the adjusted box before creating a new Finding
        try:
            Finding.validate_bounding_box(adjusted_bbox)
            adjusted.append(box.model_copy(update={'bounding_box': adjusted_bbox}))
        except ValueError:
            # If enlarging makes the box invalid (e.g., min >= max), keep the original
            adjusted.append(box)  # Or handle the error differently if needed

    return adjusted


def change_box_format(shape: tuple[int, int, int], box: tuple[int, int, int, int]) -> tuple[float, float, float, float]:
    """Normalizes bounding box coordinates from a 1000x1000 grid to the image dimensions.

    This is only needed for Gemini-based models, as they return coordinates normalized to a
    0-1000 grid (presumably related to how their image embeddings work); Qwen-based models
    don't need this.

    Args:
        shape: The shape of the target image (height, width, channels).
        box: The bounding box tuple (x_min, y_min, x_max, y_max) in 1000x1000 coordinates.

    Returns:
        A tuple of normalized bounding box coordinates (x_min, y_min, x_max, y_max)
        relative to the image dimensions.
    """
    y_height, x_width, _ = shape
    # Normalize coordinates from the 1000x1000 grid to the image dimensions
    x_min = (box[0] / 1000.0) * x_width
    y_min = (box[1] / 1000.0) * y_height
    x_max = (box[2] / 1000.0) * x_width
    y_max = (box[3] / 1000.0) * y_height

    return (x_min, y_min, x_max, y_max)


def normalize_findings_boxes(shape: tuple[int, int, int], findings: list[Finding]) -> list[Finding]:
    """Normalizes the bounding boxes of all findings in a list.

    This is only needed for Gemini-based models, as they return coordinates normalized to
    a 0-1000 grid; Qwen-based models don't need this.

    Modifies the findings list in-place.

    Args:
        shape: The shape of the target image (height, width, channels).
        findings: A list of Finding objects whose bounding boxes need normalization.

    Returns:
        The list of Finding objects with normalized bounding boxes (modified in-place).
    """
    for finding in findings:
        # Ensure the bounding box is a tuple before passing
        current_box = tuple(finding.bounding_box)
        finding.bounding_box = change_box_format(shape, current_box)
    return findings


def visualize_boxes(image, findings):
    # Create a figure and axis
    fig, ax = plt.subplots(1)
    ax.imshow(image)

    # Define a list of colors for the boxes
    colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']

    for i, finding in enumerate(findings):
        [x_min, y_min, x_max, y_max] = finding.bounding_box

        # Select a color for the current box
        color = colors[i % len(colors)]

        rect = patches.Rectangle((x_min, y_min),
                                 x_max - x_min,
                                 y_max - y_min,
                                 linewidth=2, edgecolor=color, facecolor='none')

        ax.add_patch(rect)

        # Print the finding index and the color of its box
        print(f"Finding {i+1} (Color: {color}):")
    if len(findings) == 0:
        print("No findings")

    # Optionally set axis ticks every 50 pixels
    # plt.xticks(np.arange(0, image.shape[1], 50))  # Start, Stop, Step
    # plt.yticks(np.arange(0, image.shape[0], 50))  # Start, Stop, Step

    plt.show()


def visualize_boxes_annotated(image: np.ndarray | Image.Image, boxes: list[BoundingBox]) -> np.ndarray:
    """Draws bounding boxes with labels on an image and returns the annotated image as a NumPy array.

    Args:
        image: The input image (NumPy array or PIL Image).
        boxes: A list of BoundingBox objects with coordinates relative to the image.

    Returns:
        A NumPy array representing the image with annotated bounding boxes.
    """
    if not isinstance(image, np.ndarray):
        image = np.array(image)
    # Create a figure and axis with high DPI
    fig = plt.figure(dpi=300)
    ax = plt.subplot(111)
    ax.imshow(image)
    ax.set_axis_off()
    plt.subplots_adjust(left=0, right=1, top=1, bottom=0, wspace=0, hspace=0)

    # Define a list of colors for the boxes
    colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']

    for i, box in enumerate(boxes):
        x_min = box.x_min
        y_min = box.y_min
        x_max = box.x_max
        y_max = box.y_max
        label = box.label

        # Select a color for the current box
        color = colors[i % len(colors)]

        rect = patches.Rectangle((x_min, y_min),
                                 x_max - x_min,
                                 y_max - y_min,
                                 linewidth=1, edgecolor=color, facecolor='none')

        ax.add_patch(rect)

        # Add label text above the box
        ax.text(x_min, y_min - 5, label, color=color, fontsize=10,
                bbox=dict(facecolor='white', alpha=0.7, edgecolor='none'))

    # Instead of displaying, render the figure into a numpy array
    fig.canvas.draw()
    data = np.frombuffer(fig.canvas.buffer_rgba(), dtype=np.uint8)
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (4,))
    # Convert RGBA to RGB
    data = data[:, :, :3]
    plt.close()
    return data
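
To show how these helpers chain together, here is a short self-contained sketch; the reply string is fabricated for illustration (not real model output) and the blank image stands in for a real photo:

import numpy as np
from utils import parse_all_safe, BoundingBox, visualize_boxes_annotated

# Fabricated example of a model reply in the expected <output> format.
reply = ('<output>[{"label": "face", "description": "A face", '
         '"explanation": "Identifies a person", '
         '"bounding_box": [120, 80, 210, 190], "severity": 6}]</output>')

findings = parse_all_safe(reply)  # list[Finding], or None if parsing fails
if findings:
    boxes = [BoundingBox.from_finding(f) for f in findings]
    image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image
    annotated = visualize_boxes_annotated(image, boxes)  # RGB numpy array with labelled boxes
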