annading committed
Commit ca1863b · 1 Parent(s): e365924

first commit

Files changed (6)
  1. .gitignore +6 -0
  2. README.md +1 -1
  3. app_batch.py +134 -0
  4. owl_batch.py +210 -0
  5. requirements.txt +9 -0
  6. utils.py +103 -0
.gitignore ADDED
@@ -0,0 +1,6 @@
+ *.pyc
+ */__pycache__/**
+ *.mp4
+ *.png
+ *.csv
+ /.gradio/
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: purple
  colorTo: gray
  sdk: gradio
  sdk_version: 5.24.0
- app_file: app.py
+ app_file: app_batch.py
  pinned: false
  license: apache-2.0
  ---
app_batch.py ADDED
@@ -0,0 +1,134 @@
+ BATCH_SIZE = 8  # change this to your desired batch size
+ CUDA_PATH = "/usr/local/cuda-12.3/"  # change this to your CUDA path
+
+ from datetime import datetime
+ import os
+ import time
+
+ # set CUDA_HOME before any CUDA-dependent imports
+ os.environ["CUDA_HOME"] = CUDA_PATH
+
+ import gradio as gr
+
+ from owl_batch import owl_batch_video
+
+
+ def run_owl_batch(
+     input_vids: list[str] | str,
+     target_prompt: str,
+     species_prompt: str,
+     conf_threshold: float,
+     fps_processed: int,
+     scaling_factor: float
+ ) -> str:
+     """
+     args:
+         input_vids: list of video paths (or a single path)
+         target_prompt: comma-separated phrases to search for
+         species_prompt: comma-separated phrases describing all species in the dataset
+         conf_threshold: confidence threshold for detection
+         fps_processed: run detection on every Nth frame
+         scaling_factor: factor by which to downsample the frames
+     returns:
+         zip_path: path to a zip containing the results CSV and the
+             positive/negative videos with their per-video results
+     """
+     start_time = time.time()
+     if isinstance(input_vids, str):
+         input_vids = [input_vids]
+
+     # make sure there are no spaces in the file names
+     renamed_vids = []
+     for vid in input_vids:
+         new_input_vid = vid.replace(" ", "_")
+         if new_input_vid != vid:
+             os.rename(vid, new_input_vid)
+         renamed_vids.append(new_input_vid)
+     input_vids = renamed_vids
+
+     # species prompt has to contain the target prompt, otherwise add it
+     if target_prompt not in species_prompt:
+         species_prompt = f"{species_prompt}, {target_prompt}"
+
+     # turn the target prompt into a list
+     target_prompt = target_prompt.split(", ")
+
+     timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
+
+     zip_path = owl_batch_video(
+         input_vids,
+         target_prompt,
+         species_prompt,
+         conf_threshold,
+         fps_processed=fps_processed,
+         scaling_factor=1 / scaling_factor,
+         batch_size=BATCH_SIZE,
+         save_dir=f"temp_{timestamp}")
+
+     end_time = time.time()
+     print(f"Processing time: {end_time - start_time:.1f} seconds")
+     return zip_path
+
+
+ with gr.Blocks() as demo:
+     gr.HTML(
+         """
+         <h1 align="center" style="font-size:xxx-large">🦍 Primate Detection</h1>
+         """
+     )
+
+     with gr.Row():
+         with gr.Column():
+             input = gr.File(label="Upload Videos", file_types=['.mp4', '.mov'], file_count="multiple")
+             target_prompt = gr.Textbox(label="What do you want to detect? (Multiple species should be separated by commas)")
+             species_prompt = gr.Textbox(label="Which species are in your dataset? (Multiple species should be separated by commas)")
+             with gr.Accordion("Advanced Options", open=False):
+                 conf_threshold = gr.Slider(
+                     label="Confidence Threshold",
+                     info="Adjust the threshold to change the sensitivity of the model; lower thresholds are more sensitive.",
+                     minimum=0.0,
+                     maximum=1.0,
+                     value=0.3,
+                     step=0.05
+                 )
+                 fps_processed = gr.Slider(
+                     label="Frame Detection Rate",
+                     info="Run detection on every Nth frame: a value of 120 runs detection every 120 frames, a value of 1 runs it on every frame. Note: the lower the number, the slower the processing.",
+                     minimum=1,
+                     maximum=120,
+                     value=10,
+                     step=1)
+                 scaling_factor = gr.Slider(
+                     label="Downsample Factor",
+                     info="Adjust the downsample factor. Note: the higher the number, the faster the processing but the lower the accuracy.",
+                     minimum=1,
+                     maximum=10,
+                     value=4,
+                     step=1
+                 )
+             with gr.Row():
+                 clear_btn = gr.ClearButton(components=[input, target_prompt, species_prompt])
+                 run_btn = gr.Button(value="Run Detection", variant='primary')
+         with gr.Column():
+             download_file = gr.Files(label="CSV, Video Output", interactive=False)
+
+     run_btn.click(fn=run_owl_batch, inputs=[input, target_prompt, species_prompt, conf_threshold, fps_processed, scaling_factor], outputs=[download_file])
+
+     gr.DuplicateButton()
+
+     gr.Markdown(
+         """
+         ## Frequently Asked Questions
+
+         ##### How can I run the interface on my own computer?
+         Click the three dots in the top-right corner of the interface to clone the repository or run it with a Docker image on your local machine. \
+         For local setup instructions, please check the README file.
+         ##### The video is very slow to process, how can I speed it up?
+         Increase the Frame Detection Rate in the Advanced Options so detection runs on fewer frames, or increase the Downsample Factor. \
+         You can also duplicate the space using the Duplicate Button and choose a faster GPU.
+         """
+     )
+
+ demo.launch(share=True)
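
For scripted use outside the Gradio UI, the same pipeline can be driven directly through `owl_batch_video`. A minimal sketch, not part of this commit, assuming a CUDA-capable machine and some hypothetical local clips under `clips/`:

```python
from glob import glob

from owl_batch import owl_batch_video

videos = sorted(glob("clips/*.mp4"))        # hypothetical input videos
zip_path = owl_batch_video(
    videos,
    ["gorilla"],                            # target_prompt: phrases that count as a detection
    "gorilla, chimpanzee, human",           # species_prompt: everything the model may see
    0.3,                                    # confidence threshold
    fps_processed=10,                       # run detection on every 10th extracted frame
    scaling_factor=1 / 4,                   # downsample frames to a quarter of their size
    batch_size=8,
    save_dir="temp_headless")
print(zip_path)                             # -> temp_headless/results.zip
```

This mirrors what `run_owl_batch` does after it normalises its inputs: `target_prompt` is already a list of phrases, and `scaling_factor` is the reciprocal of the UI's downsample factor.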
owl_batch.py ADDED
@@ -0,0 +1,210 @@
+ import os
+ import shutil
+ import math
+ import zipfile
+
+ import cv2
+ import pandas as pd
+ import torch
+ from PIL import Image
+ from tqdm import tqdm
+ from transformers import Owlv2Processor, Owlv2ForObjectDetection
+
+ from utils import plot_predictions, mp4_to_png, vid_stitcher
+
+
+ def owl_batch_video(
+     input_vids: list[str],
+     target_prompt: list[str],
+     species_prompt: str,
+     threshold: float,
+     fps_processed: int = 1,
+     scaling_factor: float = 0.5,
+     batch_size: int = 8,
+     save_dir: str = "temp/"
+ ):
+     """Runs owl_video_detection on every video, saves a summary CSV, and zips the results."""
+     df = pd.DataFrame(columns=["video path", "detection?"])
+
+     for vid in input_vids:
+         detection = owl_video_detection(vid,
+                                         target_prompt,
+                                         species_prompt,
+                                         threshold,
+                                         fps_processed=fps_processed,
+                                         scaling_factor=scaling_factor,
+                                         batch_size=batch_size,
+                                         save_dir=save_dir)
+
+         row = pd.DataFrame({"video path": [vid], "detection?": [str(detection)]})
+         df = pd.concat([df, row], ignore_index=True)
+
+     # save the per-video summary
+     df.to_csv(f"{save_dir}/detection_results.csv", index=False)
+
+     # zip the save_dir
+     zip_file = f"{save_dir}/results.zip"
+     zip_directory(save_dir, zip_file)
+
+     return zip_file
+
+
+ def zip_directory(folder_path, output_zip_path):
+     with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+         for root, dirs, files in os.walk(folder_path):
+             for file in files:
+                 file_path = os.path.join(root, file)
+                 # skip the archive itself if it lives inside folder_path
+                 if os.path.abspath(file_path) == os.path.abspath(output_zip_path):
+                     continue
+                 # write the file with a relative path to preserve folder structure
+                 arcname = os.path.relpath(file_path, start=folder_path)
+                 zipf.write(file_path, arcname)
+
+
+ def preprocess_text(text_prompt: str, num_prompts: int = 1):
+     """
+     Takes a string of comma-separated text prompts and returns one list of prompts per image,
+     e.g. text_prompt = "a, b, c" with num_prompts = 2 -> [["a", "b", "c"], ["a", "b", "c"]]
+     """
+     text_prompt = [s.strip() for s in text_prompt.split(",")]
+     text_queries = [text_prompt] * num_prompts
+     return text_queries
+
+
+ def owl_batch_prediction(
+     images: list[Image.Image],
+     text_queries: list[list[str]],  # every image is queried with the same text prompt
+     threshold: float,
+     processor,
+     model,
+     device: str = 'cuda'
+ ):
+     inputs = processor(text=text_queries, images=images, return_tensors="pt").to(device)
+     with torch.no_grad():
+         outputs = model(**inputs)
+
+     # target image sizes (height, width) to rescale box predictions [batch_size, 2]
+     target_sizes = torch.Tensor([img.size[::-1] for img in images]).to(device)
+     # convert outputs (bounding boxes and class logits) to COCO API, rescale to the original image size and filter by threshold
+     results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=threshold)
+
+     return results
+
+
+ def count_pos(phrases: list[str], text_targets: list[str]) -> int:
+     """
+     Counts how many phrases in the list match any of the target phrases.
+
+     Args:
+         phrases: A list of strings to evaluate.
+         text_targets: A list of target strings to match against.
+
+     Returns:
+         The number of phrases that match any of the targets.
+     """
+     if len(phrases) == 0 or len(text_targets) == 0:
+         return 0
+     target_set = set(text_targets)
+     return sum(1 for phrase in phrases if phrase in target_set)
+
+
+ def owl_video_detection(
+     vid_path: str,
+     text_target: list[str],
+     text_prompt: str,
+     threshold: float,
+     fps_processed: int = 1,
+     scaling_factor: float = 0.5,
+     # note: these defaults are evaluated once, when the module is imported
+     processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble"),
+     model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble").to('cuda'),
+     device: str = 'cuda',
+     batch_size: int = 8,
+     save_dir: str = "temp/",
+ ):
+     """
+     Runs OWLv2 on a video and saves the per-frame results to a dataframe.
+     Returns True (and stops early) as soon as one batch has more than 2/3 positive frames,
+     False otherwise.
+     """
+     os.makedirs(save_dir, exist_ok=True)
+     os.makedirs(f"{save_dir}/positives", exist_ok=True)
+     os.makedirs(f"{save_dir}/negatives", exist_ok=True)
+
+     # set up df for results
+     df = pd.DataFrame(columns=["frame", "boxes", "scores", "labels", "count"])
+
+     # create new dirs and paths for results
+     filename = os.path.splitext(os.path.basename(vid_path))[0]
+     frames_dir = f"{save_dir}/{filename}_frames"
+     os.makedirs(frames_dir, exist_ok=True)
+
+     # process the video into a directory of frames
+     fps = mp4_to_png(vid_path, frames_dir, scaling_factor)
+
+     # get all frame filenames, in frame order
+     frame_filenames = sorted(os.listdir(frames_dir))
+
+     # keep every fps_processed-th frame
+     frame_paths = []
+     for i, frame in enumerate(frame_filenames):
+         if i % fps_processed == 0:
+             frame_paths.append(os.path.join(frames_dir, frame))
+
+     # run owl in batches
+     for i in tqdm(range(0, len(frame_paths), batch_size), desc="Running batches"):
+         batch_paths = frame_paths[i:i+batch_size]  # paths for this batch
+         images = [Image.open(image_path) for image_path in batch_paths]
+
+         # run owl on this batch of frames
+         text_queries = preprocess_text(text_prompt, len(batch_paths))
+         results = owl_batch_prediction(images, text_queries, threshold, processor, model, device)
+
+         # get the label ids for this batch; None if a frame has no detections
+         label_ids = []
+         for entry in results:
+             if entry['labels'].numel() > 0:
+                 label_ids.append(entry['labels'].tolist())
+             else:
+                 label_ids.append(None)
+
+         text = text_queries[0]  # all images share the same query list
+         labels = []
+         # convert label ids to phrases; if there are no detections, append an empty list
+         for ids in label_ids:
+             if ids is not None:
+                 labels.append([text[label_id] for label_id in ids])
+             else:
+                 labels.append([])
+
+         batch_pos = 0
+         for j, image in enumerate(batch_paths):
+             boxes = results[j]['boxes'].cpu().numpy()
+             scores = results[j]['scores'].cpu().numpy()
+             count = count_pos(labels[j], text_target)
+             row = pd.DataFrame({"frame": [image], "boxes": [boxes], "scores": [scores], "labels": [labels[j]], "count": [count]})
+             df = pd.concat([df, row], ignore_index=True)
+
+             # if there are detections, overwrite the frame with its annotated version
+             if count > 0:
+                 annotated_frame = plot_predictions(image, labels[j], scores, boxes)
+                 cv2.imwrite(image, annotated_frame)
+                 batch_pos += 1
+
+         # if more than 2/3 of the batch frames are positive, stitch the video and return True
+         if batch_pos > math.ceil(2/3 * batch_size):
+             vid_stitcher(frames_dir, f"{save_dir}/positives/{filename}_{threshold}.mp4", fps)
+             shutil.rmtree(frames_dir)  # delete the frames to save space
+             df.to_csv(f"{save_dir}/positives/{filename}_{threshold}.csv", index=False)
+             return True
+
+     shutil.rmtree(frames_dir)  # delete the frames to save space
+     df.to_csv(f"{save_dir}/negatives/{filename}_{threshold}.csv", index=False)
+     return False
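
The prompt handling above is easier to follow with a small example. A sketch, not part of this commit, using the module's own `preprocess_text` and `count_pos` with made-up species names:

```python
from owl_batch import preprocess_text, count_pos

species_prompt = "gorilla, chimpanzee, human"
queries = preprocess_text(species_prompt, num_prompts=2)
# -> [["gorilla", "chimpanzee", "human"], ["gorilla", "chimpanzee", "human"]]

# OWLv2 label ids index into the per-image query list, so id 0 here means "gorilla"
labels_for_one_frame = [queries[0][i] for i in (0, 2)]   # ["gorilla", "human"]
print(count_pos(labels_for_one_frame, ["gorilla"]))      # 1
```

With the defaults, a batch is declared positive only when `batch_pos > math.ceil(2/3 * batch_size)`; for `batch_size=8` that means `batch_pos > 6`, i.e. at least 7 of the 8 sampled frames must contain a target phrase before the early exit triggers.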
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio==5.9.1
+ numpy
+ opencv-python
+ pandas
+ Pillow
+ supervision
+ torch
+ tqdm
+ transformers
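
Note that ffmpeg is also required but is not a pip package, so it must be available on the system PATH (utils.py shells out to it). A quick environment sanity check, not part of this commit, that the pinned dependencies can load the OWLv2 checkpoint referenced in owl_batch.py:

```python
import torch
from transformers import Owlv2Processor, Owlv2ForObjectDetection

# confirm the GPU is visible; owl_batch.py moves the model to 'cuda' at import time
print("CUDA available:", torch.cuda.is_available())

# download/load the checkpoint used by owl_batch.py
processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
print("Loaded model type:", model.config.model_type)
```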
utils.py ADDED
@@ -0,0 +1,103 @@
+ import math
+ import os
+ import subprocess
+ from concurrent.futures import ThreadPoolExecutor
+ from glob import glob
+
+ import cv2
+ import numpy as np
+ import supervision as sv
+ from tqdm import tqdm
+
+
+ def plot_predictions(
+     image: str,
+     labels: list[str],
+     scores: list[float],
+     boxes: np.ndarray,
+     opacity: float = 1.0
+ ) -> np.ndarray:
+     """Draws the boxes and '<label> <score>' captions onto the image at `image` (a file path)."""
+     image_source = cv2.imread(image)
+     image_source_rgb = cv2.cvtColor(image_source, cv2.COLOR_BGR2RGB)
+
+     detections = sv.Detections(xyxy=boxes)
+
+     captions = [
+         f"{phrase} {score:.2f}"
+         for phrase, score
+         in zip(labels, scores)
+     ]
+
+     # scale line thickness and text size with the image width
+     height, width, _ = image_source_rgb.shape
+     thickness = math.ceil(width / 200)
+     text_scale = width / 1500
+     text_thickness = math.ceil(text_scale * 1.5)
+
+     bbox_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX, thickness=thickness)
+     label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX, text_scale=text_scale, text_thickness=text_thickness)
+
+     # create a semi-transparent overlay
+     overlay = image_source_rgb.copy()
+
+     # apply bounding box and label annotations to the overlay
+     overlay = bbox_annotator.annotate(scene=overlay, detections=detections)
+     overlay = label_annotator.annotate(scene=overlay, detections=detections, labels=captions)
+
+     # blend the overlay with the original image using the specified opacity
+     annotated_frame = cv2.addWeighted(overlay, opacity, image_source_rgb, 1 - opacity, 0)
+
+     annotated_frame_bgr = cv2.cvtColor(annotated_frame, cv2.COLOR_RGB2BGR)
+
+     return annotated_frame_bgr
+
+
+ def mp4_to_png(input_path: str, save_path: str, scale_factor: float) -> int:
+     """Converts an mp4 into one png per frame.
+     Args: input_path is the path to the mp4 file, save_path is the directory in which to save the frames.
+     Returns: fps, the number of frames per second of the source video.
+     """
+     # get frames per second
+     cap = cv2.VideoCapture(input_path)
+     fps = int(cap.get(cv2.CAP_PROP_FPS))
+     cap.release()
+     # run ffmpeg to convert the mp4 to pngs
+     subprocess.run(
+         ["ffmpeg", "-i", input_path,
+          "-vf", f"fps={fps},scale=iw*{scale_factor}:ih*{scale_factor}",
+          f"{save_path}/frame%08d.png"],
+         check=True)
+     return fps
+
+
+ def vid_stitcher(frames_dir: str, output_path: str, fps: int = 30) -> str:
+     """
+     Reads the frame pngs in frames_dir and writes them to a video file at output_path.
+     """
+     # get the sorted list of frame paths
+     frame_list = sorted(glob(os.path.join(frames_dir, 'frame*.png')))
+
+     # prepare the VideoWriter using the first frame's size
+     frame = cv2.imread(frame_list[0])
+     height, width, _ = frame.shape
+     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+     out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+
+     # use multithreading to read frames faster
+     with ThreadPoolExecutor() as executor:
+         frames = list(executor.map(cv2.imread, frame_list))
+
+     # write frames to the video
+     with tqdm(total=len(frame_list), desc='Stitching frames') as pbar:
+         for frame in frames:
+             out.write(frame)
+             pbar.update(1)
+
+     out.release()  # finalise the file so the mp4 is playable
+     return output_path
+
+
+ def count_pos(phrases, text_target):
+     """
+     Takes a list of lists of phrases and counts how many sublists contain at least one phrase equal to the target phrase.
+     """
+     num_pos = 0
+     for sublist in phrases:
+         if sublist is None:
+             continue
+         for phrase in sublist:
+             if phrase == text_target:
+                 num_pos += 1
+                 break
+     return num_pos
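
Taken together, these helpers implement the frame round trip used by `owl_video_detection`: extract frames, annotate the positive ones in place, then stitch the directory back into a video. A sketch, not part of this commit, on a hypothetical local `clip.mp4`:

```python
import os
import cv2
import numpy as np
from utils import mp4_to_png, plot_predictions, vid_stitcher

os.makedirs("frames", exist_ok=True)
fps = mp4_to_png("clip.mp4", "frames", scale_factor=0.5)      # extract downscaled frames

frame_path = "frames/frame00000001.png"
boxes = np.array([[10.0, 10.0, 100.0, 120.0]])                # one xyxy box, made-up detection
annotated = plot_predictions(frame_path, ["gorilla"], [0.91], boxes)
cv2.imwrite(frame_path, annotated)                            # overwrite the frame in place

vid_stitcher("frames", "clip_annotated.mp4", fps)             # rebuild the (annotated) video
```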