syedfaisalabrar committed
Commit 0709139 · verified · 1 Parent(s): 2ad2276

Update app.py

Files changed (1): app.py (+96 −91)

app.py CHANGED
@@ -1,32 +1,35 @@
  import gradio as gr
  import torch
  import cv2
- import os
  import numpy as np
  from PIL import Image, ImageEnhance
  from ultralytics import YOLO
- from decord import VideoReader, cpu
  from torchvision.transforms.functional import InterpolationMode
  from transformers import AutoModel, AutoTokenizer
- from backPrompt import main as main_b
- from frontPrompt import main as main_f
- import sentencepiece as spm

- model_path = "best.pt"
- modelY = YOLO(model_path)
- os.environ["TRANSFORMERS_CACHE"] = "./.cache"
  cache_folder = "./.cache"
- path = "OpenGVLab/InternVL2_5-2B"
- # Load the Hugging Face model and tokenizer globally (downloaded only once)
  model = AutoModel.from_pretrained(
      path,
      cache_dir=cache_folder,
-     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
-     # load_in_8bit=True,
-     low_cpu_mem_usage=True,
-     use_flash_attn=True,
      trust_remote_code=True
- ).eval().cpu()

  tokenizer = AutoTokenizer.from_pretrained(
      path,
@@ -36,36 +39,36 @@ tokenizer = AutoTokenizer.from_pretrained(
  )


- def preprocessing(image):
-     """Apply three enhancement filters without resizing or cropping."""
-
-     # Ensure the image is a PIL Image
-     if not isinstance(image, Image.Image):
-         image = Image.fromarray(np.array(image))
-
-     # Apply enhancements
-     image = ImageEnhance.Sharpness(image).enhance(2.0)   # Increase sharpness
-     image = ImageEnhance.Contrast(image).enhance(1.5)    # Increase contrast
-     image = ImageEnhance.Brightness(image).enhance(0.8)  # Reduce brightness
-
-     # Convert to tensor without resizing
-     # image_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).float() / 255.0  # Shape: [C, H, W]

      return image

-
-
-
-
  def imageRotation(image):
-
      return image

-
  def detect_document(image):
-     """Detects front and back of the document using YOLO."""
-     image = ensure_numpy(image)  # Ensure valid format
-     results = modelY(image, conf=0.85)

      detected_classes = set()
      labels = []
@@ -83,86 +86,88 @@ def detect_document(image):
          labels.append(label)
          bounding_boxes.append((x1, y1, x2, y2, class_name, conf))

-         # Draw bounding box
-         cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
-         cv2.putText(image, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

      possible_classes = {"front", "back"}
      missing_classes = possible_classes - detected_classes
      if missing_classes:
          labels.append(f"Missing: {', '.join(missing_classes)}")

-     return Image.fromarray(image.astype(np.uint8)), labels, bounding_boxes
-

  def crop_image(image, bounding_boxes):
-     """Crops detected bounding boxes from the image safely."""
-     image = ensure_numpy(image)  # Ensure image is NumPy format
      cropped_images = {}
-
      for (x1, y1, x2, y2, class_name, conf) in bounding_boxes:
-         # Ensure the bounding box is within image bounds
-         x1, y1, x2, y2 = max(0, x1), max(0, y1), min(image.shape[1], x2), min(image.shape[0], y2)
-         cropped = image[y1:y2, x1:x2]

-         if cropped.size > 0:  # Check if valid
-             cropped_images[class_name] = Image.fromarray(cropped)

-     return cropped_images


- def vision_ai_api(image, doc_type):

-     if doc_type == "front":
-         results = main_f(image, model, tokenizer)
-     if doc_type == "back":
-         results = main_b(image, model, tokenizer)
-
-     return results
-
- def ensure_numpy(image):
-     """Ensure image is a valid NumPy array."""
-     if isinstance(image, torch.Tensor):
-         # Convert PyTorch tensor to NumPy array
-         image = image.permute(1, 2, 0).cpu().numpy()
-     elif isinstance(image, Image.Image):
-         # Convert PIL image to NumPy array
-         image = np.array(image)

-     if len(image.shape) == 2:
-         # Convert grayscale to 3-channel image
-         image = np.stack([image] * 3, axis=-1)

-     # return image
-     return image.astype(np.uint8)


  def predict(image):
-     """Pipeline: Preprocess -> Detect -> Crop -> Vision AI API."""
-     processed_image = preprocessing(image)         # Enhanced PIL image
-     rotated_image = ensure_numpy(processed_image)  # Convert to NumPy
      detected_image, labels, bounding_boxes = detect_document(rotated_image)
-
-     if not bounding_boxes:
-         return detected_image, labels, {"error": "No document detected!"}
-
      cropped_images = crop_image(rotated_image, bounding_boxes)

-     # Call Vision AI separately for front and back if detected
-     front_result = back_result = None
      if "front" in cropped_images:
          front_result = vision_ai_api(cropped_images["front"], "front")
      if "back" in cropped_images:
          back_result = vision_ai_api(cropped_images["back"], "back")

-     api_results = {
-         "front": front_result,
-         "back": back_result
-     }
-
-     return detected_image, labels, api_results
-
-

  iface = gr.Interface(
      fn=predict,
      inputs="image",
@@ -170,4 +175,4 @@ iface = gr.Interface(
      title="License Field Detection (Front & Back Card)"
  )

- iface.launch()

+ import os
+ # Set up caching for Hugging Face models
+ os.environ["TRANSFORMERS_CACHE"] = "./.cache"
+ os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Disable GPU usage
+
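+ # Note: these assignments must run before torch and transformers are imported
+ # below, since both read their environment configuration at import time.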
  import gradio as gr
  import torch
  import cv2
  import numpy as np
  from PIL import Image, ImageEnhance
  from ultralytics import YOLO
  from torchvision.transforms.functional import InterpolationMode
+ import torchvision.transforms as T
  from transformers import AutoModel, AutoTokenizer
+ import gc

+ # Import prompts from prompts.py
+ from prompts import front as front_prompt, back as back_prompt
+
+ # ---------------------------
+ # HUGGING FACE MODEL SETUP (CPU)
+ # ---------------------------
+ path = "OpenGVLab/InternVL2_5-1B"
  cache_folder = "./.cache"
+
+ # Load the Vision AI model and tokenizer globally.
  model = AutoModel.from_pretrained(
      path,
      cache_dir=cache_folder,
+     torch_dtype=torch.float32,
      trust_remote_code=True
+ ).eval().to("cpu")
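+ # Full float32 replaces the old bfloat16 GPU path; bfloat16 kernels are slow or
+ # unavailable on many CPUs, so full precision is the safer default here.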

  tokenizer = AutoTokenizer.from_pretrained(
      path,
  )

+ # ---------------------------
+ # YOLO MODEL INITIALIZATION
+ # ---------------------------
+ model_path = "best.pt"
+ modelY = YOLO(model_path)
+ modelY.to('cpu')  # Explicitly move model to CPU

+ def preprocessing(image):
+     """Apply enhancement filters and resize."""
+     image = Image.fromarray(np.array(image))
+     image = ImageEnhance.Sharpness(image).enhance(2.0)   # Increase sharpness
+     image = ImageEnhance.Contrast(image).enhance(1.5)    # Increase contrast
+     image = ImageEnhance.Brightness(image).enhance(0.8)  # Reduce brightness
+
+     width = 448
+     aspect_ratio = image.height / image.width
+     height = int(width * aspect_ratio)
+     image = image.resize((width, height))
      return image
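+ # (preprocessing keeps the aspect ratio; build_transform() below still resizes
+ # each crop to a square 448x448 before it reaches the model.)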

  def imageRotation(image):
+     """Rotate image if height exceeds width."""
+     if image.height > image.width:
+         return image.rotate(90, expand=True)
      return image
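+ # (PIL rotates counterclockwise for positive angles; expand=True enlarges the
+ # canvas so the rotated image is not clipped.)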

  def detect_document(image):
+     """Detect front/back of the document using YOLO."""
+     image_np = np.array(image)
+     results = modelY(image_np, conf=0.85, device='cpu')

      detected_classes = set()
      labels = []

          labels.append(label)
          bounding_boxes.append((x1, y1, x2, y2, class_name, conf))

+         cv2.rectangle(image_np, (x1, y1), (x2, y2), (0, 255, 0), 2)
+         cv2.putText(image_np, label, (x1, y1 - 10),
+                     cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

      possible_classes = {"front", "back"}
      missing_classes = possible_classes - detected_classes
      if missing_classes:
          labels.append(f"Missing: {', '.join(missing_classes)}")

+     return Image.fromarray(image_np), labels, bounding_boxes

  def crop_image(image, bounding_boxes):
+     """Crop detected bounding boxes from the image."""
      cropped_images = {}
+     image_np = np.array(image)
      for (x1, y1, x2, y2, class_name, conf) in bounding_boxes:
+         cropped = image_np[y1:y2, x1:x2]
+         cropped_images[class_name] = Image.fromarray(cropped)
+     return cropped_images
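+ # (NumPy clamps slice ends that run past the array bounds, but negative box
+ # coordinates would wrap around; boxes are assumed to lie inside the image.)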

+ # ---------------------------
+ # VISION AI API FUNCTIONS
+ # ---------------------------
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
+ IMAGENET_STD = (0.229, 0.224, 0.225)

+ def build_transform(input_size):
+     transform = T.Compose([
+         T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+         T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+         T.ToTensor(),
+         T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
+     ])
+     return transform

+ def load_image(image_file):
+     transform = build_transform(input_size=448)
+     pixel_values = transform(image_file).unsqueeze(0)  # Add batch dimension
+     return pixel_values
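+ # (Output shape: [1, 3, 448, 448], normalized with the ImageNet statistics above.)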

+ def vision_ai_api(image, doc_type):
+     """Run the model using a dynamic prompt based on detected doc type."""
+     pixel_values = load_image(image).to(torch.float32).to("cpu")
+     generation_config = dict(max_new_tokens=1024, do_sample=True)
+
+     question = front_prompt if doc_type == "front" else back_prompt if doc_type == "back" else "Please provide document details."

+     print("Before requesting model...")
+     response = model.chat(tokenizer, pixel_values, question, generation_config)
+     print("After requesting model...", response)

+     # Clear memory
+     del pixel_values
+     gc.collect()  # Force garbage collection
+     torch.cuda.empty_cache()
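+     # (With CUDA_VISIBLE_DEVICES="-1" set above, empty_cache() is a harmless no-op.)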

+     return f'User: {question}\nAssistant: {response}'
+
+ # ---------------------------
+ # PREDICTION PIPELINE
+ # ---------------------------
  def predict(image):
+     """Pipeline: Preprocess → Detect → Crop → Vision AI API call."""
+     processed_image = preprocessing(image)
+     rotated_image = imageRotation(processed_image)
      detected_image, labels, bounding_boxes = detect_document(rotated_image)
      cropped_images = crop_image(rotated_image, bounding_boxes)

+     front_result, back_result = None, None
      if "front" in cropped_images:
          front_result = vision_ai_api(cropped_images["front"], "front")
      if "back" in cropped_images:
          back_result = vision_ai_api(cropped_images["back"], "back")

+     api_results = {"front": front_result, "back": back_result}
+     single_image = cropped_images.get("front") or cropped_images.get("back") or detected_image
+     return single_image, labels, api_results
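+ # (The "or" chain falls back left to right: front crop, then back crop, then the
+ # annotated detection image.)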

+ # ---------------------------
+ # GRADIO INTERFACE LAUNCH
+ # ---------------------------
  iface = gr.Interface(
      fn=predict,
      inputs="image",

      title="License Field Detection (Front & Back Card)"
  )

+ iface.launch()
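
A quick way to sanity-check the new pipeline without the UI is to call predict()
directly. A minimal sketch, assuming best.pt and prompts.py sit next to app.py;
the image filename is hypothetical, and the snippet would run in place of
iface.launch(), which otherwise blocks on the web server:

    from PIL import Image

    img = Image.open("sample_license.jpg")    # hypothetical local test image
    preview, labels, results = predict(img)   # same entry point Gradio calls
    print(labels)
    print(results["front"])
    print(results["back"])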