import gradio as gr
import torch
import cv2
import numpy as np
from PIL import Image, ImageEnhance
from ultralytics import YOLO

model_path = "best.pt"
model = YOLO(model_path)

# Minimum detection confidence passed to the YOLO model.
CONFIDENCE_THRESHOLD = 0.85
# Document sides the pipeline looks for in every image.
EXPECTED_CLASSES = ("front", "back")
# Colour used for the drawn boxes and labels.
BOX_COLOR = (0, 255, 0)


def preprocessing(image, target_width=800):
    """Enhance the image and resize it to a fixed width.

    Sharpness and contrast are boosted, brightness is reduced, and the
    image is resized to ``target_width`` pixels wide while keeping its
    aspect ratio.

    Args:
        image: A PIL image or an array that ``np.array`` can convert.
        target_width: Width of the returned image in pixels.

    Returns:
        The enhanced, resized PIL image.
    """
    image = Image.fromarray(np.array(image))
    image = ImageEnhance.Sharpness(image).enhance(2.0)
    image = ImageEnhance.Contrast(image).enhance(1.5)
    image = ImageEnhance.Brightness(image).enhance(0.8)

    aspect_ratio = image.height / image.width
    # Guard against a zero height for extremely wide inputs, which
    # ``Image.resize`` would reject.
    height = max(1, int(target_width * aspect_ratio))
    return image.resize((target_width, height))


def imageRotation(image):
    """Dummy function for image rotation; returns the image unchanged."""
    return image


def detect_document(image):
    """Detect the front and back of the document using YOLO.

    Args:
        image: A PIL image or an array convertible to one.

    Returns:
        A tuple ``(annotated_image, labels, bounding_boxes)`` where
        ``annotated_image`` is a PIL image with boxes and labels drawn,
        ``labels`` is a list of ``"<class> <confidence>"`` strings (plus a
        ``"Missing: ..."`` entry when an expected class was not found), and
        ``bounding_boxes`` is a list of
        ``(x1, y1, x2, y2, class_name, confidence)`` tuples.
    """
    if isinstance(image, Image.Image):
        pil_image = image
    else:
        pil_image = Image.fromarray(np.asarray(image))
    pil_image = pil_image.convert("RGB")

    # Hand the model a PIL image rather than a NumPy array: Ultralytics
    # treats ndarray input as BGR, so an RGB array would be run with its
    # red and blue channels swapped.
    results = model(pil_image, conf=CONFIDENCE_THRESHOLD)

    # Writable RGB copy used only for drawing the annotations.
    canvas = np.array(pil_image)

    detected_classes = set()
    labels = []
    bounding_boxes = []

    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            # Plain float so downstream code never has to handle tensors.
            conf = float(box.conf[0])
            class_name = model.names[int(box.cls[0])]

            detected_classes.add(class_name)
            label = f"{class_name} {conf:.2f}"
            labels.append(label)
            bounding_boxes.append((x1, y1, x2, y2, class_name, conf))

            cv2.rectangle(canvas, (x1, y1), (x2, y2), BOX_COLOR, 2)
            # Keep the label baseline inside the image for boxes that
            # touch the top edge.
            text_y = max(y1 - 10, 15)
            cv2.putText(
                canvas,
                label,
                (x1, text_y),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                BOX_COLOR,
                2,
            )

    missing_classes = set(EXPECTED_CLASSES) - detected_classes
    if missing_classes:
        # Sorted so the message is the same on every run.
        labels.append(f"Missing: {', '.join(sorted(missing_classes))}")

    return Image.fromarray(canvas), labels, bounding_boxes


def crop_image(image, bounding_boxes):
    """Crop the detected bounding boxes from the image.

    Boxes are clamped to the image bounds and boxes with no area are
    skipped. When a class is detected more than once, the crop with the
    highest confidence is kept.

    Args:
        image: A PIL image or an array convertible with ``np.array``.
        bounding_boxes: Iterable of
            ``(x1, y1, x2, y2, class_name, confidence)`` tuples.

    Returns:
        A dict mapping class name to the cropped PIL image.
    """
    pixels = np.array(image)
    img_height, img_width = pixels.shape[:2]

    cropped_images = {}
    best_confidence = {}

    for x1, y1, x2, y2, class_name, conf in bounding_boxes:
        # A negative index would wrap around in a NumPy slice and yield
        # the wrong region, so clamp to the image first.
        left, right = max(0, x1), min(img_width, x2)
        top, bottom = max(0, y1), min(img_height, y2)
        if right <= left or bottom <= top:
            # Image.fromarray cannot build an image from an empty array.
            continue

        conf = float(conf)
        if class_name in best_confidence and conf <= best_confidence[class_name]:
            continue

        best_confidence[class_name] = conf
        cropped_images[class_name] = Image.fromarray(pixels[top:bottom, left:right])

    return cropped_images


def vision_ai_api(image, doc_type):
    """Dummy API call for Vision AI; returns a fake JSON-like response."""
    return {
        "document_type": doc_type,
        "extracted_text": "Dummy OCR result for " + doc_type,
        "confidence": 0.99,
    }


# ---------------- Prediction Function ---------------- #
def predict(image):
    """Pipeline: Preprocess -> Detect -> Crop -> Vision AI API.

    Args:
        image: The input image supplied by the Gradio interface.

    Returns:
        A tuple ``(annotated_image, label_text, api_results)`` matching the
        interface outputs: the annotated PIL image, the detection labels
        as newline-separated text, and a dict with the ``"front"`` and
        ``"back"`` API results (``None`` for a side that was not detected).
    """
    processed_image = preprocessing(image)
    rotated_image = imageRotation(processed_image)

    detected_image, labels, bounding_boxes = detect_document(rotated_image)
    cropped_images = crop_image(rotated_image, bounding_boxes)

    # Call Vision AI separately for each side that was detected.
    api_results = {
        side: (
            vision_ai_api(cropped_images[side], side)
            if side in cropped_images
            else None
        )
        for side in EXPECTED_CLASSES
    }

    # The second interface output is a text box; join the labels so the
    # user sees one label per line instead of a Python list repr.
    return detected_image, "\n".join(labels), api_results


iface = gr.Interface(
    fn=predict,
    inputs="image",
    outputs=["image", "text", "json"],
    title="License Field Detection (Front & Back Card)",
)

if __name__ == "__main__":
    # Launch only when run as a script so importing this module does not
    # start a web server.
    iface.launch()