import os

import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from ultralytics import YOLO

from backPrompt import main as main_b
from frontPrompt import main as main_f
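# Custom YOLO weights ("best.pt") used to detect the "front" and "back" classes of the card.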
model_path = "best.pt"
modelY = YOLO(model_path)

os.environ["TRANSFORMERS_CACHE"] = "./.cache"
cache_folder = "./.cache"
path = "OpenGVLab/InternVL2_5-2B"
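# Load InternVL2_5-2B once at startup; weights are cached under ./.cache and the same
# model/tokenizer pair is reused by both the frontPrompt and backPrompt pipelines.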
model = AutoModel.from_pretrained(
    path,
    cache_dir=cache_folder,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    low_cpu_mem_usage=True,
    use_flash_attn=torch.cuda.is_available(),  # flash attention is only usable on CUDA
    trust_remote_code=True
).eval().cpu()  # inference runs on CPU here
tokenizer = AutoTokenizer.from_pretrained(
    path,
    cache_dir=cache_folder,
    trust_remote_code=True,
    use_fast=False
)
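# The two helpers below mirror InternVL's dynamic tiling preprocessing: the input image is
# resized to the closest supported grid of image_size x image_size tiles.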
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Return the (columns, rows) tiling whose aspect ratio is closest to the input image."""
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            # On a tie, prefer the tiling with more blocks when the image is large enough to fill them.
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize the image to the closest tiling aspect ratio, split it into
    image_size x image_size tiles, and return the first tile."""
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # Enumerate every (columns, rows) grid with between min_num and max_num tiles.
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num
    )
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size
    )

    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        # Crop tile i from the resized image, scanning left to right, top to bottom.
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    # Only the first tile is passed on to the detection stage in predict().
    return processed_images[0]
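# Portrait uploads are rotated to landscape before detection so the card lies horizontally.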
def imageRotation(image):
    if image.height > image.width:
        return image.rotate(90, expand=True)
    return image
def detect_document(image):
    """Detects front and back of the document using YOLO."""
    image = np.array(image)
    results = modelY(image, conf=0.85)

    detected_classes = set()
    labels = []
    bounding_boxes = []

    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            conf = float(box.conf[0])
            cls = int(box.cls[0])
            class_name = modelY.names[cls]

            detected_classes.add(class_name)
            label = f"{class_name} {conf:.2f}"
            labels.append(label)
            bounding_boxes.append((x1, y1, x2, y2, class_name, conf))

            # Annotate the preview image with the box and label.
            cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(image, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # Report any side of the card that was not detected.
    possible_classes = {"front", "back"}
    missing_classes = possible_classes - detected_classes
    if missing_classes:
        labels.append(f"Missing: {', '.join(missing_classes)}")

    return Image.fromarray(image), labels, bounding_boxes
def crop_image(image, bounding_boxes):
    """Crops detected bounding boxes from the image."""
    cropped_images = {}
    image = np.array(image)

    for (x1, y1, x2, y2, class_name, conf) in bounding_boxes:
        # Keyed by class name, so only the last "front"/"back" detection is kept.
        cropped = image[y1:y2, x1:x2]
        cropped_images[class_name] = Image.fromarray(cropped)

    return cropped_images
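# frontPrompt.main and backPrompt.main wrap the InternVL calls; both reuse the model and
# tokenizer loaded above and return the extraction results for their side of the card.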
def vision_ai_api(image, doc_type):
    if doc_type == "front":
        return main_f(image, model, tokenizer)
    if doc_type == "back":
        return main_b(image, model, tokenizer)
    return None
def predict(image):
    """Pipeline: Preprocess -> Detect -> Crop -> Vision AI API."""
    processed_image = dynamic_preprocess(image)
    rotated_image = imageRotation(processed_image)
    detected_image, labels, bounding_boxes = detect_document(rotated_image)

    cropped_images = crop_image(rotated_image, bounding_boxes)

    # Run field extraction on whichever sides of the card were detected.
    front_result, back_result = None, None
    if "front" in cropped_images:
        front_result = vision_ai_api(cropped_images["front"], "front")
    if "back" in cropped_images:
        back_result = vision_ai_api(cropped_images["back"], "back")

    api_results = {
        "front": front_result,
        "back": back_result
    }
    # Prefer showing the front crop, then the back crop, then the annotated detection image.
    single_image = cropped_images.get("front") or cropped_images.get("back") or detected_image
    return single_image, labels, api_results
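# Gradio UI: upload a card image and get back the selected crop, the detection labels,
# and the extracted front/back fields as JSON.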
iface = gr.Interface(
    fn=predict,
    inputs="image",
    outputs=["image", "text", "json"],
    title="License Field Detection (Front & Back Card)"
)

iface.launch()