syedfaisalabrar committed
Commit 0709139 · verified · 1 Parent(s): 2ad2276

Update app.py

Files changed (1): app.py (+96 −91)

app.py CHANGED
@@ -1,32 +1,35 @@
  import gradio as gr
  import torch
  import cv2
- import os
  import numpy as np
  from PIL import Image, ImageEnhance
  from ultralytics import YOLO
- from decord import VideoReader, cpu
  from torchvision.transforms.functional import InterpolationMode
  from transformers import AutoModel, AutoTokenizer
- from backPrompt import main as main_b
- from frontPrompt import main as main_f
- import sentencepiece as spm

- model_path = "best.pt"
- modelY = YOLO(model_path)
- os.environ["TRANSFORMERS_CACHE"] = "./.cache"
  cache_folder = "./.cache"
- path = "OpenGVLab/InternVL2_5-2B"
- # Load the Hugging Face model and tokenizer globally (downloaded only once)
  model = AutoModel.from_pretrained(
      path,
      cache_dir=cache_folder,
-     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
-     # load_in_8bit=True,
-     low_cpu_mem_usage=True,
-     use_flash_attn=True,
      trust_remote_code=True
- ).eval().cpu()

  tokenizer = AutoTokenizer.from_pretrained(
      path,
@@ -36,36 +39,36 @@ tokenizer = AutoTokenizer.from_pretrained(
  )


- def preprocessing(image):
-     """Apply three enhancement filters without resizing or cropping."""
-
-     # Ensure the image is a PIL Image
-     if not isinstance(image, Image.Image):
-         image = Image.fromarray(np.array(image))
-
-     # Apply enhancements
-     image = ImageEnhance.Sharpness(image).enhance(2.0)   # Increase sharpness
-     image = ImageEnhance.Contrast(image).enhance(1.5)    # Increase contrast
-     image = ImageEnhance.Brightness(image).enhance(0.8)  # Reduce brightness
-
-     # Convert to tensor without resizing
-     # image_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).float() / 255.0  # Shape: [C, H, W]

      return image

-
-
-
-
  def imageRotation(image):
-
      return image

-
  def detect_document(image):
-     """Detects front and back of the document using YOLO."""
-     image = ensure_numpy(image)  # Ensure valid format
-     results = modelY(image, conf=0.85)

      detected_classes = set()
      labels = []
@@ -83,86 +86,88 @@ def detect_document(image):
          labels.append(label)
          bounding_boxes.append((x1, y1, x2, y2, class_name, conf))

-         # Draw bounding box
-         cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
-         cv2.putText(image, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

      possible_classes = {"front", "back"}
      missing_classes = possible_classes - detected_classes
      if missing_classes:
          labels.append(f"Missing: {', '.join(missing_classes)}")

-     return Image.fromarray(image.astype(np.uint8)), labels, bounding_boxes
-

  def crop_image(image, bounding_boxes):
-     """Crops detected bounding boxes from the image safely."""
-     image = ensure_numpy(image)  # Ensure image is NumPy format
      cropped_images = {}
-
      for (x1, y1, x2, y2, class_name, conf) in bounding_boxes:
-         # Ensure the bounding box is within image bounds
-         x1, y1, x2, y2 = max(0, x1), max(0, y1), min(image.shape[1], x2), min(image.shape[0], y2)
-         cropped = image[y1:y2, x1:x2]

-         if cropped.size > 0:  # Check if valid
-             cropped_images[class_name] = Image.fromarray(cropped)

-     return cropped_images


- def vision_ai_api(image, doc_type):

-     if doc_type == "front":
-         results = main_f(image, model, tokenizer)
-     if doc_type == "back":
-         results = main_b(image, model, tokenizer)
-
-     return results
-
- def ensure_numpy(image):
-     """Ensure image is a valid NumPy array."""
-     if isinstance(image, torch.Tensor):
-         # Convert PyTorch tensor to NumPy array
-         image = image.permute(1, 2, 0).cpu().numpy()
-     elif isinstance(image, Image.Image):
-         # Convert PIL image to NumPy array
-         image = np.array(image)

-     if len(image.shape) == 2:
-         # Convert grayscale to 3-channel image
-         image = np.stack([image] * 3, axis=-1)

-     # return image
-     return image.astype(np.uint8)


  def predict(image):
-     """Pipeline: Preprocess -> Detect -> Crop -> Vision AI API."""
-     processed_image = preprocessing(image)         # Enhanced PIL image
-     rotated_image = ensure_numpy(processed_image)  # Convert to NumPy
      detected_image, labels, bounding_boxes = detect_document(rotated_image)
-
-     if not bounding_boxes:
-         return detected_image, labels, {"error": "No document detected!"}
-
      cropped_images = crop_image(rotated_image, bounding_boxes)

-     # Call Vision AI separately for front and back if detected
-     front_result = back_result = None
      if "front" in cropped_images:
          front_result = vision_ai_api(cropped_images["front"], "front")
      if "back" in cropped_images:
          back_result = vision_ai_api(cropped_images["back"], "back")

-     api_results = {
-         "front": front_result,
-         "back": back_result
-     }
-
-     return detected_image, labels, api_results
-
-

  iface = gr.Interface(
      fn=predict,
      inputs="image",
@@ -170,4 +175,4 @@ iface = gr.Interface(
      title="License Field Detection (Front & Back Card)"
  )

- iface.launch()

+ import os
+ # Set up caching for Hugging Face models
+ os.environ["TRANSFORMERS_CACHE"] = "./.cache"
+ os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Disable GPU usage
+
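+ # Note: these assignments must run before torch and transformers are imported
+ # below, since both read their environment configuration at import time.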
  import gradio as gr
  import torch
  import cv2
  import numpy as np
  from PIL import Image, ImageEnhance
  from ultralytics import YOLO
  from torchvision.transforms.functional import InterpolationMode
+ import torchvision.transforms as T
  from transformers import AutoModel, AutoTokenizer
+ import gc

+ # Import prompts from prompts.py
+ from prompts import front as front_prompt, back as back_prompt
+
+ # ---------------------------
+ # HUGGING FACE MODEL SETUP (CPU)
+ # ---------------------------
+ path = "OpenGVLab/InternVL2_5-1B"
  cache_folder = "./.cache"
+
+ # Load the Vision AI model and tokenizer globally.
  model = AutoModel.from_pretrained(
      path,
      cache_dir=cache_folder,
+     torch_dtype=torch.float32,
      trust_remote_code=True
+ ).eval().to("cpu")
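+ # Full float32 replaces the old bfloat16 GPU path; bfloat16 kernels are slow or
+ # unavailable on many CPUs, so full precision is the safer default here.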

  tokenizer = AutoTokenizer.from_pretrained(
      path,
  )

+ # ---------------------------
+ # YOLO MODEL INITIALIZATION
+ # ---------------------------
+ model_path = "best.pt"
+ modelY = YOLO(model_path)
+ modelY.to('cpu')  # Explicitly move model to CPU

+ def preprocessing(image):
+     """Apply enhancement filters and resize."""
+     image = Image.fromarray(np.array(image))
+     image = ImageEnhance.Sharpness(image).enhance(2.0)   # Increase sharpness
+     image = ImageEnhance.Contrast(image).enhance(1.5)    # Increase contrast
+     image = ImageEnhance.Brightness(image).enhance(0.8)  # Reduce brightness
+
+     width = 448
+     aspect_ratio = image.height / image.width
+     height = int(width * aspect_ratio)
+     image = image.resize((width, height))
      return image
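+ # (preprocessing keeps the aspect ratio; build_transform() below still resizes
+ # each crop to a square 448x448 before it reaches the model.)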

  def imageRotation(image):
+     """Rotate image if height exceeds width."""
+     if image.height > image.width:
+         return image.rotate(90, expand=True)
      return image
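+ # (PIL rotates counterclockwise for positive angles; expand=True enlarges the
+ # canvas so the rotated image is not clipped.)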

  def detect_document(image):
+     """Detect front/back of the document using YOLO."""
+     image_np = np.array(image)
+     results = modelY(image_np, conf=0.85, device='cpu')

      detected_classes = set()
      labels = []

          labels.append(label)
          bounding_boxes.append((x1, y1, x2, y2, class_name, conf))

+         cv2.rectangle(image_np, (x1, y1), (x2, y2), (0, 255, 0), 2)
+         cv2.putText(image_np, label, (x1, y1 - 10),
+                     cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

      possible_classes = {"front", "back"}
      missing_classes = possible_classes - detected_classes
      if missing_classes:
          labels.append(f"Missing: {', '.join(missing_classes)}")

+     return Image.fromarray(image_np), labels, bounding_boxes

  def crop_image(image, bounding_boxes):
+     """Crop detected bounding boxes from the image."""
      cropped_images = {}
+     image_np = np.array(image)
      for (x1, y1, x2, y2, class_name, conf) in bounding_boxes:
+         cropped = image_np[y1:y2, x1:x2]
+         cropped_images[class_name] = Image.fromarray(cropped)
+     return cropped_images
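+ # (NumPy clamps slice ends that run past the array bounds, but negative box
+ # coordinates would wrap around; boxes are assumed to lie inside the image.)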

+ # ---------------------------
+ # VISION AI API FUNCTIONS
+ # ---------------------------
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
+ IMAGENET_STD = (0.229, 0.224, 0.225)

+ def build_transform(input_size):
+     transform = T.Compose([
+         T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+         T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+         T.ToTensor(),
+         T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
+     ])
+     return transform

+ def load_image(image_file):
+     transform = build_transform(input_size=448)
+     pixel_values = transform(image_file).unsqueeze(0)  # Add batch dimension
+     return pixel_values
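+ # (Output shape: [1, 3, 448, 448], normalized with the ImageNet statistics above.)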

+ def vision_ai_api(image, doc_type):
+     """Run the model using a dynamic prompt based on detected doc type."""
+     pixel_values = load_image(image).to(torch.float32).to("cpu")
+     generation_config = dict(max_new_tokens=1024, do_sample=True)
+
+     question = front_prompt if doc_type == "front" else back_prompt if doc_type == "back" else "Please provide document details."

+     print("Before requesting model...")
+     response = model.chat(tokenizer, pixel_values, question, generation_config)
+     print("After requesting model...", response)

+     # Clear memory
+     del pixel_values
+     gc.collect()  # Force garbage collection
+     torch.cuda.empty_cache()
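+     # (With CUDA_VISIBLE_DEVICES="-1" set above, empty_cache() is a harmless no-op.)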

+     return f'User: {question}\nAssistant: {response}'
+
+ # ---------------------------
+ # PREDICTION PIPELINE
+ # ---------------------------
  def predict(image):
+     """Pipeline: Preprocess → Detect → Crop → Vision AI API call."""
+     processed_image = preprocessing(image)
+     rotated_image = imageRotation(processed_image)
      detected_image, labels, bounding_boxes = detect_document(rotated_image)
      cropped_images = crop_image(rotated_image, bounding_boxes)

+     front_result, back_result = None, None
      if "front" in cropped_images:
          front_result = vision_ai_api(cropped_images["front"], "front")
      if "back" in cropped_images:
          back_result = vision_ai_api(cropped_images["back"], "back")

+     api_results = {"front": front_result, "back": back_result}
+     single_image = cropped_images.get("front") or cropped_images.get("back") or detected_image
+     return single_image, labels, api_results
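+ # (The "or" chain falls back left to right: front crop, then back crop, then the
+ # annotated detection image.)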

+ # ---------------------------
+ # GRADIO INTERFACE LAUNCH
+ # ---------------------------
  iface = gr.Interface(
      fn=predict,
      inputs="image",

      title="License Field Detection (Front & Back Card)"
  )

+ iface.launch()
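
A quick way to sanity-check the new pipeline without the UI is to call predict()
directly. A minimal sketch, assuming best.pt and prompts.py sit next to app.py;
the image filename is hypothetical, and the snippet would run in place of
iface.launch(), which otherwise blocks on the web server:

    from PIL import Image

    img = Image.open("sample_license.jpg")    # hypothetical local test image
    preview, labels, results = predict(img)   # same entry point Gradio calls
    print(labels)
    print(results["front"])
    print(results["back"])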