import os
import time
from pathlib import Path

import cv2
import numpy as np
import torch
import gradio as gr

# Create cache directory for models
os.makedirs("models", exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Use YOLOv5n (nano) for higher FPS
model_path = Path("models/yolov5n.pt")
if model_path.exists():
    print(f"Loading model from cache: {model_path}")
    # Reload the cached checkpoint through the ultralytics "custom" entrypoint
    # (the hub entrypoints do not accept a path= argument for named models)
    model = torch.hub.load("ultralytics/yolov5", "custom", path=str(model_path)).to(device)
else:
    print("Downloading YOLOv5n model and caching...")
    model = torch.hub.load("ultralytics/yolov5", "yolov5n", pretrained=True).to(device)
    # The hub call typically leaves yolov5n.pt in the working directory; move the
    # full checkpoint into the cache (a bare state_dict cannot be reloaded via "custom")
    if Path("yolov5n.pt").exists():
        Path("yolov5n.pt").rename(model_path)

# Model configuration for better performance
model.conf = 0.5      # Confidence threshold
model.iou = 0.45      # IOU threshold
model.classes = None  # Detect all classes
model.max_det = 20    # Limit detections for speed

if device.type == "cuda":
    model.half()  # Half precision for CUDA
else:
    torch.set_num_threads(os.cpu_count())

model.eval()

# Precompute colors for bounding boxes
np.random.seed(42)
colors = np.random.uniform(0, 255, size=(len(model.names), 3))

# Performance tracking
total_inference_time = 0
inference_count = 0
last_fps_values = []  # Store recent FPS values


def detect_objects(image):
    """Process a single image for object detection."""
    global total_inference_time, inference_count

    if image is None:
        return None

    start_time = time.time()
    output_image = image.copy()
    input_size = 640  # Inference input size

    # Run inference without gradient tracking
    with torch.no_grad():
        results = model(image, size=input_size)

    inference_time = time.time() - start_time
    total_inference_time += inference_time
    inference_count += 1
    avg_inference_time = total_inference_time / inference_count

    detections = results.pred[0].cpu().numpy()

    # Draw detections
    for *xyxy, conf, cls in detections:
        x1, y1, x2, y2 = map(int, xyxy)
        class_id = int(cls)
        color = colors[class_id].tolist()

        # Bounding box
        cv2.rectangle(output_image, (x1, y1), (x2, y2), color, 3, lineType=cv2.LINE_AA)

        # Label with class name and confidence
        label = f"{model.names[class_id]} {conf:.2f}"
        font_scale, font_thickness = 0.9, 2
        (w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness)
        cv2.rectangle(output_image, (x1, y1 - h - 10), (x1 + w + 10, y1), color, -1)
        cv2.putText(output_image, label, (x1 + 5, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX,
                    font_scale, (255, 255, 255), font_thickness, lineType=cv2.LINE_AA)

    fps = 1 / max(inference_time, 1e-6)  # Guard against division by zero

    # Semi-transparent FPS overlay
    overlay = output_image.copy()
    cv2.rectangle(overlay, (10, 10), (300, 80), (0, 0, 0), -1)
    output_image = cv2.addWeighted(overlay, 0.6, output_image, 0.4, 0)
    cv2.putText(output_image, f"FPS: {fps:.2f}", (20, 40), cv2.FONT_HERSHEY_SIMPLEX,
                1, (0, 255, 0), 2, lineType=cv2.LINE_AA)
    cv2.putText(output_image, f"Avg FPS: {1 / avg_inference_time:.2f}", (20, 70),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, lineType=cv2.LINE_AA)

    return output_image


def process_webcam_frame(frame):
    """Process a single frame from the webcam."""
    global last_fps_values

    if frame is None:
        return None

    start_time = time.time()

    # Use a smaller input size for real-time processing
    input_size = 384

    # Process the frame
    with torch.no_grad():
        results = model(frame, size=input_size)

    # Calculate FPS (30 is a nominal fallback if the timer resolution is too coarse)
    inference_time = time.time() - start_time
    current_fps = 1 / inference_time if inference_time > 0 else 30

    # Update FPS history (keep the last 30 values)
    last_fps_values.append(current_fps)
    if len(last_fps_values) > 30:
        last_fps_values.pop(0)
    avg_fps = sum(last_fps_values) / len(last_fps_values)

    # Create output image
    output = frame.copy()

    # Draw detections
    detections = results.pred[0].cpu().numpy()
    for *xyxy, conf, cls in detections:
        x1, y1, x2, y2 = map(int, xyxy)
        class_id = int(cls)
        color = colors[class_id].tolist()

        # Draw rectangle and label
        cv2.rectangle(output, (x1, y1), (x2, y2), color, 2, lineType=cv2.LINE_AA)
        label = f"{model.names[class_id]} {conf:.2f}"
        font_scale, font_thickness = 0.6, 1
        (w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness)
        cv2.rectangle(output, (x1, y1 - h - 5), (x1 + w + 5, y1), color, -1)
        cv2.putText(output, label, (x1 + 3, y1 - 3), cv2.FONT_HERSHEY_SIMPLEX,
                    font_scale, (255, 255, 255), font_thickness, lineType=cv2.LINE_AA)

    # Add FPS counter
    cv2.rectangle(output, (10, 10), (210, 80), (0, 0, 0), -1)
    cv2.putText(output, f"FPS: {current_fps:.1f}", (20, 40), cv2.FONT_HERSHEY_SIMPLEX,
                0.8, (0, 255, 0), 2, lineType=cv2.LINE_AA)
    cv2.putText(output, f"Avg FPS: {avg_fps:.1f}", (20, 70), cv2.FONT_HERSHEY_SIMPLEX,
                0.8, (0, 255, 0), 2, lineType=cv2.LINE_AA)

    return output


def process_uploaded_image(image):
    """Process an uploaded image."""
    return detect_objects(image)


# Example images live in examples/; keep only the ones that exist so that
# cache_examples does not fail on a missing file
os.makedirs("examples", exist_ok=True)
example_images = [
    p for p in ("examples/spring_street_after.jpg", "examples/pexels-hikaique-109919.jpg")
    if os.path.exists(p)
]

# Gradio interface with proper webcam handling
with gr.Blocks(title="YOLOv5 Object Detection - Real-time & Image Upload") as demo:
    gr.Markdown("""
    # YOLOv5 Object Detection
    ## Real-time webcam detection and image upload processing
    """)

    with gr.Tabs():
        with gr.TabItem("Real-time Detection"):
            gr.Markdown("""
            ### Real-time Object Detection
            Capture a frame from your webcam and click **Detect Objects** to run the model on it.
            """)

            # Webcam-sourced image input (Gradio 3.x syntax; Gradio 4 uses sources=["webcam"])
            webcam = gr.Image(source="webcam", type="numpy", label="Webcam Input")
            webcam_output = gr.Image(label="Real-time Detection")
            detect_button = gr.Button("Detect Objects")

            # Connect the webcam capture to the processor
            detect_button.click(
                fn=process_webcam_frame,
                inputs=webcam,
                outputs=webcam_output,
            )

        with gr.TabItem("Image Upload"):
            gr.Markdown("""
            ### Image Upload Detection
            Upload an image to detect objects.
            """)

            with gr.Row():
                with gr.Column(scale=1):
                    input_image = gr.Image(label="Input Image", type="numpy")
                    submit_button = gr.Button("Submit", variant="primary")
                    clear_button = gr.Button("Clear")
                with gr.Column(scale=1):
                    output_image = gr.Image(label="Detected Objects", type="numpy")

            if example_images:
                gr.Examples(
                    examples=example_images,
                    inputs=input_image,
                    outputs=output_image,
                    fn=process_uploaded_image,
                    cache_examples=True,
                )

            # Set up event handlers
            submit_button.click(fn=process_uploaded_image, inputs=input_image, outputs=output_image)
            clear_button.click(lambda: (None, None), None, [input_image, output_image])

demo.launch(share=False)
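
# --- Optional: continuous streaming variant (a sketch, assuming Gradio 3.x) ---
# The button-driven flow above runs detection once per click, so it is not truly
# continuous. For live per-frame detection, Gradio's streaming image input can
# re-invoke process_webcam_frame on every incoming frame. Untested sketch; the
# component names mirror the ones defined above:
#
#   webcam = gr.Image(source="webcam", streaming=True, type="numpy", label="Webcam Input")
#   webcam.stream(fn=process_webcam_frame, inputs=webcam, outputs=webcam_output)
#
# Streaming throughput is bounded by model latency, so the FPS overlay drawn by
# process_webcam_frame reflects inference speed, not the browser's capture rate.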