import os

import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from ultralytics import YOLO

from backPrompt import main as main_b
from frontPrompt import main as main_f
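# Custom YOLO weights ("best.pt") used to detect the "front" and "back" classes of the card.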
model_path = "best.pt"
modelY = YOLO(model_path)

os.environ["TRANSFORMERS_CACHE"] = "./.cache"
cache_folder = "./.cache"
path = "OpenGVLab/InternVL2_5-2B"
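# Load InternVL2_5-2B once at startup; weights are cached under ./.cache and the same
# model/tokenizer pair is reused by both the frontPrompt and backPrompt pipelines.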
model = AutoModel.from_pretrained(
    path,
    cache_dir=cache_folder,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    low_cpu_mem_usage=True,
    use_flash_attn=torch.cuda.is_available(),  # flash attention is only usable on CUDA
    trust_remote_code=True
).eval().cpu()  # inference runs on CPU here
tokenizer = AutoTokenizer.from_pretrained(
    path,
    cache_dir=cache_folder,
    trust_remote_code=True,
    use_fast=False
)
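# The two helpers below mirror InternVL's dynamic tiling preprocessing: the input image is
# resized to the closest supported grid of image_size x image_size tiles.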
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Return the (columns, rows) tiling whose aspect ratio is closest to the input image."""
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            # On a tie, prefer the tiling with more blocks when the image is large enough to fill them.
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize the image to the closest tiling aspect ratio, split it into
    image_size x image_size tiles, and return the first tile."""
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # Enumerate every (columns, rows) grid with between min_num and max_num tiles.
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num
    )
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size
    )

    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        # Crop tile i from the resized image, scanning left to right, top to bottom.
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    # Only the first tile is passed on to the detection stage in predict().
    return processed_images[0]
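# Portrait uploads are rotated to landscape before detection so the card lies horizontally.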
def imageRotation(image):
    if image.height > image.width:
        return image.rotate(90, expand=True)
    return image
def detect_document(image):
    """Detects front and back of the document using YOLO."""
    image = np.array(image)
    results = modelY(image, conf=0.85)

    detected_classes = set()
    labels = []
    bounding_boxes = []

    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            conf = float(box.conf[0])
            cls = int(box.cls[0])
            class_name = modelY.names[cls]

            detected_classes.add(class_name)
            label = f"{class_name} {conf:.2f}"
            labels.append(label)
            bounding_boxes.append((x1, y1, x2, y2, class_name, conf))

            # Annotate the preview image with the box and label.
            cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(image, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # Report any side of the card that was not detected.
    possible_classes = {"front", "back"}
    missing_classes = possible_classes - detected_classes
    if missing_classes:
        labels.append(f"Missing: {', '.join(missing_classes)}")

    return Image.fromarray(image), labels, bounding_boxes
def crop_image(image, bounding_boxes):
    """Crops detected bounding boxes from the image."""
    cropped_images = {}
    image = np.array(image)

    for (x1, y1, x2, y2, class_name, conf) in bounding_boxes:
        # Keyed by class name, so only the last "front"/"back" detection is kept.
        cropped = image[y1:y2, x1:x2]
        cropped_images[class_name] = Image.fromarray(cropped)

    return cropped_images
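# frontPrompt.main and backPrompt.main wrap the InternVL calls; both reuse the model and
# tokenizer loaded above and return the extraction results for their side of the card.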
def vision_ai_api(image, doc_type):
    if doc_type == "front":
        return main_f(image, model, tokenizer)
    if doc_type == "back":
        return main_b(image, model, tokenizer)
    return None
def predict(image):
    """Pipeline: Preprocess -> Detect -> Crop -> Vision AI API."""
    processed_image = dynamic_preprocess(image)
    rotated_image = imageRotation(processed_image)
    detected_image, labels, bounding_boxes = detect_document(rotated_image)

    cropped_images = crop_image(rotated_image, bounding_boxes)

    # Run field extraction on whichever sides of the card were detected.
    front_result, back_result = None, None
    if "front" in cropped_images:
        front_result = vision_ai_api(cropped_images["front"], "front")
    if "back" in cropped_images:
        back_result = vision_ai_api(cropped_images["back"], "back")

    api_results = {
        "front": front_result,
        "back": back_result
    }
    # Prefer showing the front crop, then the back crop, then the annotated detection image.
    single_image = cropped_images.get("front") or cropped_images.get("back") or detected_image
    return single_image, labels, api_results
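# Gradio UI: upload a card image and get back the selected crop, the detection labels,
# and the extracted front/back fields as JSON.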
iface = gr.Interface(
    fn=predict,
    inputs="image",
    outputs=["image", "text", "json"],
    title="License Field Detection (Front & Back Card)"
)

iface.launch()