import torch
import numpy as np
import gradio as gr
import cv2
import time
import os
from pathlib import Path

# Local cache location for the model weights. The cache directory and the
# download target are both derived from this single path so they cannot drift.
model_path = Path("models/yolov5n.pt")
model_path.parent.mkdir(parents=True, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Use the small YOLOv5n checkpoint instead of x-large; download it only once.
if not model_path.exists():
    print("Downloading and caching YOLOv5n...")
    torch.hub.download_url_to_file(
        "https://github.com/ultralytics/yolov5/releases/download/v6.2/yolov5n.pt",
        str(model_path),
    )

# Load the raw detection model (autoshape=False: no built-in pre/post-processing
# wrapper, so detect_objects is responsible for resizing, NMS and box scaling).
model = torch.hub.load("ultralytics/yolov5", "custom", path=str(model_path), autoshape=False).to(device)

# Detection settings. These are plain attributes stored on the model object;
# detect_objects reads model.conf and model.iou when filtering predictions.
model.conf = 0.5  # Confidence threshold (stricter than YOLOv5's usual 0.25: fewer boxes to post-process)
model.iou = 0.45  # IoU threshold used by NMS (YOLOv5's usual default)
model.classes = None  # No class filter: detect all classes

# Precision settings
if device.type == "cuda":
    model.half()  # FP16 inference
    torch.backends.cudnn.benchmark = True  # Let cuDNN pick the fastest kernels for the fixed input size
else:
    model.float()
    torch.set_num_threads(2)  # Limit CPU threads for better resource management

model.eval()

# One random colour per class, used when drawing boxes
colors = np.random.rand(len(model.names), 3) * 255

# Running totals updated by detect_objects on every processed frame
total_time = 0
frame_count = 0

def _box_iou_one_to_many(box, others):
    """Return the IoU between one xyxy box and every row of *others*."""
    top_left = torch.maximum(box[:2], others[:, :2])
    bottom_right = torch.minimum(box[2:], others[:, 2:])
    inter = (bottom_right - top_left).clamp(min=0).prod(dim=1)
    box_area = (box[2:] - box[:2]).prod()
    other_areas = (others[:, 2:] - others[:, :2]).prod(dim=1)
    return inter / (box_area + other_areas - inter + 1e-7)


def _select_detections(raw, conf_thres, iou_thres, max_candidates=1000, max_det=300):
    """Reduce raw YOLOv5 head output to final detections.

    Expects rows laid out as (cx, cy, w, h, objectness, class scores...),
    optionally with a leading batch dimension of which only the first image
    is used. Applies the confidence threshold, keeps the best class per box
    and runs class-aware greedy NMS.

    Returns an (n, 6) float32 tensor of (x1, y1, x2, y2, confidence, class)
    in the coordinate system of the network input.
    """
    raw = raw.float()  # post-process in FP32 even when inference ran in FP16
    if raw.dim() == 3:
        raw = raw[0]
    empty = raw.new_zeros((0, 6))

    # Cheap objectness pre-filter before computing per-class confidences.
    raw = raw[raw[:, 4] > conf_thres]
    if raw.shape[0] == 0:
        return empty

    scores, class_ids = (raw[:, 5:] * raw[:, 4:5]).max(dim=1)
    order = scores.argsort(descending=True)
    order = order[scores[order] > conf_thres][:max_candidates]
    if order.numel() == 0:
        return empty

    centers = raw[order, :2]
    half_sizes = raw[order, 2:4] / 2
    boxes = torch.cat((centers - half_sizes, centers + half_sizes), dim=1)
    scores = scores[order]
    class_ids = class_ids[order]

    # Greedy NMS: candidates are sorted by confidence, so the first remaining
    # one is always kept and suppresses same-class boxes overlapping it.
    keep = []
    remaining = torch.arange(boxes.shape[0], device=boxes.device)
    while remaining.numel() > 0 and len(keep) < max_det:
        best, rest = remaining[0], remaining[1:]
        keep.append(best)
        overlaps = _box_iou_one_to_many(boxes[best], boxes[rest]) > iou_thres
        same_class = class_ids[rest] == class_ids[best]
        remaining = rest[~(overlaps & same_class)]

    keep = torch.stack(keep)
    return torch.cat(
        (boxes[keep], scores[keep].unsqueeze(1), class_ids[keep].unsqueeze(1).float()),
        dim=1,
    )


def detect_objects(image):
    """Run YOLOv5 on one frame and return a copy with boxes and FPS drawn on it.

    Args:
        image: H x W x 3 uint8 RGB array as delivered by the Gradio image
            component, or None when the component is empty.

    Returns:
        An annotated copy of *image*, or None if *image* is None.

    Side effects:
        Adds the processing time to the module-level ``total_time`` and
        increments ``frame_count``.
    """
    global total_time, frame_count

    if image is None:
        return None

    start = time.perf_counter()

    # Reduced input size (320 instead of 640) for speed. Gradio already
    # supplies RGB, which is what the model expects, so no channel swap.
    input_size = 320
    resized = cv2.resize(image, (input_size, input_size))

    with torch.no_grad():
        batch = torch.from_numpy(resized).to(device).permute(2, 0, 1).unsqueeze(0)
        batch = (batch.half() if device.type == "cuda" else batch.float()) / 255
        raw = model(batch, augment=False)[0]
        detections = _select_detections(raw, model.conf, model.iou)

    output = image.copy()
    height, width = output.shape[:2]

    # The frame was stretched (not letterboxed) to input_size x input_size,
    # so mapping boxes back is an independent scale per axis.
    scale = detections.new_tensor([width, height, width, height]) / input_size
    limits = detections.new_tensor([width, height, width, height])
    boxes = torch.min((detections[:, :4] * scale).round().clamp(min=0), limits)

    for (x1, y1, x2, y2), cls in zip(boxes.int().tolist(), detections[:, 5].int().tolist()):
        cv2.rectangle(output, (x1, y1), (x2, y2), colors[cls].tolist(), 2)

    # FPS bookkeeping
    dt = time.perf_counter() - start
    total_time += dt
    frame_count += 1
    fps = 1 / dt if dt > 0 else float("inf")

    cv2.putText(output, f"FPS: {fps:.1f}", (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)

    return output

# Small example images expected to sit next to this script
example_images = ["pexels-hikaique-109919.jpg", "spring_street_after.jpg"]

with gr.Blocks(title="Optimized YOLOv5") as demo:
    gr.Markdown("# Real-Time YOLOv5 Object Detection")
    with gr.Row():
        # Webcam input when running on a Hugging Face Space (SPACE_ID is set),
        # otherwise the regular upload widget. `source` must be a valid source
        # name, so fall back to "upload" rather than None.
        input_img = gr.Image(label="Input", source="webcam" if os.getenv('SPACE_ID') else "upload")
        output_img = gr.Image(label="Output")
    # Only offer examples whose files are actually present, so a missing
    # image does not prevent the app from starting.
    available_examples = [path for path in example_images if os.path.exists(path)]
    if available_examples:
        gr.Examples(examples=available_examples, inputs=input_img, outputs=output_img, fn=detect_objects)
    # Re-run detection whenever the input image changes
    input_img.change(fn=detect_objects, inputs=input_img, outputs=output_img)

demo.launch()