alvarobartt (HF Staff) committed acd5c25 (verified) · 1 parent: 09fae83

Add `handler.py` and `requirements.txt`


This PR adds the `handler.py` and `requirements.txt` files required to run Microsoft OmniParser v2 on Inference Endpoints and on Azure ML. The code is adapted from both the [Microsoft OmniParser v2 Gradio demo](https://huggingface.co/spaces/microsoft/OmniParser-v2) and the code shared by @ThomasDh-C on [his fork of OmniParser v2](https://huggingface.co/ThomasDh-C/OmniParser-v2.0).

Contributions are welcome. Note that the code has been slightly reorganized and some variables renamed, but it is otherwise the same as in the sources above. It can also be modified at any point, so @ThomasDh-C feel free to jump in if you would like to change anything. Thanks in advance 🤗
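
For reference, here is a minimal sketch of how a deployed Inference Endpoint could be queried once this is merged. The payload keys follow the schema expected by `EndpointHandler.__call__` in `handler.py`; the endpoint URL, token, and sample image URL are placeholders.

```python
import base64
import requests

# Placeholders: replace with your Inference Endpoint URL and a Hugging Face token
ENDPOINT_URL = "https://<endpoint-name>.endpoints.huggingface.cloud"
HF_TOKEN = "hf_..."

payload = {
    "inputs": {
        # either a public image URL or a base64-encoded image, as accepted by `load_image`
        "image": "https://example.com/screenshot.png",
        "bbox_threshold": 0.05,
        "iou_threshold": 0.1,
    }
}

response = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"},
    json=payload,
)
response.raise_for_status()
output = response.json()

# `image` is a base64-encoded PNG with the detected bounding boxes drawn on top
with open("annotated.png", "wb") as f:
    f.write(base64.b64decode(output["image"]))

# `bboxes` is a list of dicts with `type`, `bbox` (normalized xyxy), `interactivity` and `content`
for bbox in output["bboxes"][:5]:
    print(bbox)
```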

Files changed (2)
  1. handler.py +655 -0
  2. requirements.txt +7 -0
handler.py ADDED
@@ -0,0 +1,655 @@
+import base64
+import io
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+
+import cv2
+import easyocr
+import numpy as np
+import torch
+from PIL import Image
+from PIL.Image import Image as ImageType
+from supervision.detection.core import Detections
+from supervision.draw.color import Color, ColorPalette
+from torchvision.ops import box_convert
+from torchvision.transforms import ToPILImage
+from transformers import AutoModelForCausalLM, AutoProcessor
+from transformers.image_utils import load_image
+from ultralytics import YOLO
+
+# NOTE: instantiated here so that the EasyOCR weights are downloaded beforehand and the endpoint is not stuck
+# listening whilst the required files are still being downloaded
+easyocr.Reader(["en"])
+
+
+class EndpointHandler:
+    def __init__(self, model_dir: str = "/repository") -> None:
+        self.device = (
+            torch.device("cuda")
+            if torch.cuda.is_available()
+            else torch.device("mps")
+            if torch.backends.mps.is_available()
+            else torch.device("cpu")
+        )
+
+        # bounding box detection model
+        self.yolo = YOLO(f"{model_dir}/icon_detect/model.pt")
+
+        # captioning model
+        self.processor = AutoProcessor.from_pretrained(
+            "microsoft/Florence-2-base", trust_remote_code=True
+        )
+        self.model = AutoModelForCausalLM.from_pretrained(
+            f"{model_dir}/icon_caption",
+            torch_dtype=torch.float16,
+            trust_remote_code=True,
+        ).to(self.device)
+
+        # ocr
+        self.ocr = easyocr.Reader(["en"])
+
+        # box annotator
+        self.annotator = BoxAnnotator()
+
+    def __call__(self, data: Dict[str, Any]) -> Any:
+        # inputs => {"inputs": data}
+        # data => {
+        #     "image": url/base64,
+        #     "image_size": tuple(int, int) / list(int),
+        #     "bbox_threshold": float,
+        #     "iou_threshold": float,
+        #     [disabled] "use_paddleocr": bool,
+        #     "draw_bboxes": bool,
+        # }
+        data = data.pop("inputs")
+
+        # read image from either url or base64 encoding
+        image = load_image(data["image"])
+
+        # box_overlay_ratio = image.size[0] / 3200
+        # bbox_config = {
+        #     "text_scale": 0.8 * box_overlay_ratio,
+        #     "text_thickness": max(int(2 * box_overlay_ratio), 1),
+        #     "text_padding": max(int(3 * box_overlay_ratio), 1),
+        #     "thickness": max(int(3 * box_overlay_ratio), 1),
+        # }
+
+        ocr_texts, ocr_bboxes = self.check_ocr_bboxes(
+            image,
+            out_format="xyxy",
+            ocr_kwargs={"paragraph": False, "text_threshold": 0.8},  # 0.9
+        )
+        annotated_image, filtered_bboxes_out = self.get_som_labeled_img(
+            image,
+            image_size=data.get("image_size", None),
+            ocr_texts=ocr_texts,
+            ocr_bboxes=ocr_bboxes,
+            bbox_threshold=data.get("bbox_threshold", 0.01),
+            iou_threshold=data.get("iou_threshold", None),
+        )
+        return {
+            "image": annotated_image,
+            "bboxes": filtered_bboxes_out,
+        }
+
+    def check_ocr_bboxes(
+        self,
+        image: ImageType,
+        out_format: Literal["xywh", "xyxy"] = "xywh",
+        ocr_kwargs: Optional[Dict[str, Any]] = {},
+    ) -> Tuple[List[str], List[List[int]]]:
+        if image.mode == "RGBA":
+            image = image.convert("RGB")
+
+        result = self.ocr.readtext(np.array(image), **ocr_kwargs)  # type: ignore
+        texts = [str(item[1]) for item in result]
+        bboxes = [
+            self.coordinates_to_bbox(item[0], format=out_format) for item in result
+        ]
+        return (texts, bboxes)
+
+    @staticmethod
+    def coordinates_to_bbox(
+        coordinates: np.ndarray, format: Literal["xywh", "xyxy"] = "xywh"
+    ) -> List[int]:
+        match format:
+            case "xywh":
+                return [
+                    int(coordinates[0][0]),
+                    int(coordinates[0][1]),
+                    int(coordinates[2][0] - coordinates[0][0]),
+                    int(coordinates[2][1] - coordinates[0][1]),
+                ]
+            case "xyxy":
+                return [
+                    int(coordinates[0][0]),
+                    int(coordinates[0][1]),
+                    int(coordinates[2][0]),
+                    int(coordinates[2][1]),
+                ]
+
+    @staticmethod
+    def bbox_area(bbox: List[int], w: int, h: int) -> int:
+        bbox = [bbox[0] * w, bbox[1] * h, bbox[2] * w, bbox[3] * h]
+        return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
+
+    @staticmethod
+    def remove_bbox_overlap(
+        xyxy_bboxes: List[Dict[str, Any]],
+        ocr_bboxes: Optional[List[Dict[str, Any]]] = None,
+        iou_threshold: Optional[float] = 0.7,
+    ) -> List[Dict[str, Any]]:
+        filtered_bboxes = []
+        if ocr_bboxes is not None:
+            filtered_bboxes.extend(ocr_bboxes)
+
+        for i, bbox_outer in enumerate(xyxy_bboxes):
+            bbox_left = bbox_outer["bbox"]
+            valid_bbox = True
+
+            for j, bbox_inner in enumerate(xyxy_bboxes):
+                if i == j:
+                    continue
+
+                bbox_right = bbox_inner["bbox"]
+                if (
+                    intersection_over_union(
+                        bbox_left,
+                        bbox_right,
+                    )
+                    > iou_threshold  # type: ignore
+                ) and (area(bbox_left) > area(bbox_right)):
+                    valid_bbox = False
+                    break
+
+            if valid_bbox is False:
+                continue
+
+            if ocr_bboxes is None:
+                filtered_bboxes.append(bbox_outer)
+                continue
+
+            box_added = False
+            ocr_labels = []
+            for ocr_bbox in ocr_bboxes:
+                if not box_added:
+                    bbox_right = ocr_bbox["bbox"]
+                    if overlap(bbox_right, bbox_left):
+                        try:
+                            ocr_labels.append(ocr_bbox["content"])
+                            filtered_bboxes.remove(ocr_bbox)
+                        except Exception:
+                            continue
+                    elif overlap(bbox_left, bbox_right):
+                        box_added = True
+                        break
+
+            if not box_added:
+                filtered_bboxes.append(
+                    {
+                        "type": "icon",
+                        "bbox": bbox_outer["bbox"],
+                        "interactivity": True,
+                        "content": " ".join(ocr_labels) if ocr_labels else None,
+                    }
+                )
+
+        return filtered_bboxes
+
+    def get_som_labeled_img(
+        self,
+        image: ImageType,
+        image_size: Optional[List[int]] = None,
+        ocr_texts: Optional[List[str]] = None,
+        ocr_bboxes: Optional[List[List[int]]] = None,
+        bbox_threshold: float = 0.01,
+        iou_threshold: Optional[float] = None,
+        caption_prompt: Optional[str] = None,
+        caption_batch_size: int = 64,  # ~2GiB of GPU VRAM (can be increased to 128 which is ~4GiB of GPU VRAM)
+    ) -> Tuple[str, List[Dict[str, Any]]]:
+        if image.mode == "RGBA":
+            image = image.convert("RGB")
+
+        w, h = image.size
+        if image_size is None:
+            image_size = [h, w]
+
+        out = self.yolo.predict(
+            image,
+            imgsz=image_size,
+            conf=bbox_threshold,
+            iou=iou_threshold or 0.7,
+            verbose=False,
+        )[0]
+        if out.boxes is None:
+            raise RuntimeError(
+                "YOLO prediction failed to produce the bounding boxes..."
+            )
+
+        xyxy_bboxes = out.boxes.xyxy
+        xyxy_bboxes = xyxy_bboxes / torch.Tensor([w, h, w, h]).to(xyxy_bboxes.device)
+        image_np = np.asarray(image)  # type: ignore
+
+        if ocr_bboxes:
+            ocr_bboxes = torch.tensor(ocr_bboxes) / torch.Tensor([w, h, w, h])  # type: ignore
+            ocr_bboxes = ocr_bboxes.tolist()  # type: ignore
+
+        ocr_bboxes = [
+            {
+                "type": "text",
+                "bbox": bbox,
+                "interactivity": False,
+                "content": text,
+                "source": "box_ocr_content_ocr",
+            }
+            for bbox, text in zip(ocr_bboxes, ocr_texts)  # type: ignore
+            if self.bbox_area(bbox, w, h) > 0
+        ]
+        xyxy_bboxes = [
+            {
+                "type": "icon",
+                "bbox": bbox,
+                "interactivity": True,
+                "content": None,
+                "source": "box_yolo_content_yolo",
+            }
+            for bbox in xyxy_bboxes.tolist()
+            if self.bbox_area(bbox, w, h) > 0
+        ]
+
+        filtered_bboxes = self.remove_bbox_overlap(
+            xyxy_bboxes=xyxy_bboxes,
+            ocr_bboxes=ocr_bboxes,  # type: ignore
+            iou_threshold=iou_threshold or 0.7,
+        )
+
+        filtered_bboxes_out = sorted(
+            filtered_bboxes, key=lambda x: x["content"] is None
+        )
+        starting_idx = next(
+            (
+                idx
+                for idx, bbox in enumerate(filtered_bboxes_out)
+                if bbox["content"] is None
+            ),
+            -1,
+        )
+
+        filtered_bboxes = torch.tensor([box["bbox"] for box in filtered_bboxes_out])
+        non_ocr_bboxes = filtered_bboxes[starting_idx:]
+
+        bbox_images = []
+        for _, coordinates in enumerate(non_ocr_bboxes):
+            try:
+                xmin, xmax = (
+                    int(coordinates[0] * image_np.shape[1]),
+                    int(coordinates[2] * image_np.shape[1]),
+                )
+                ymin, ymax = (
+                    int(coordinates[1] * image_np.shape[0]),
+                    int(coordinates[3] * image_np.shape[0]),
+                )
+                cropped_image = image_np[ymin:ymax, xmin:xmax, :]
+                cropped_image = cv2.resize(cropped_image, (64, 64))
+                bbox_images.append(ToPILImage()(cropped_image))
+            except Exception:
+                continue
+
+        if caption_prompt is None:
+            caption_prompt = "<CAPTION>"
+
+        captions = []
+        for idx in range(0, len(bbox_images), caption_batch_size):  # type: ignore
+            batch = bbox_images[idx : idx + caption_batch_size]  # type: ignore
+            inputs = self.processor(
+                images=batch,
+                text=[caption_prompt] * len(batch),
+                return_tensors="pt",
+                do_resize=False,
+            )
+            if self.device.type in {"cuda", "mps"}:
+                inputs = inputs.to(device=self.device, dtype=torch.float16)
+
+            with torch.inference_mode():
+                generated_ids = self.model.generate(
+                    input_ids=inputs["input_ids"],
+                    pixel_values=inputs["pixel_values"],
+                    max_new_tokens=20,
+                    num_beams=1,
+                    do_sample=False,
+                    early_stopping=False,
+                )
+
+            generated_texts = self.processor.batch_decode(
+                generated_ids, skip_special_tokens=True
+            )
+            captions.extend([text.strip() for text in generated_texts])
+
+        ocr_texts = [f"Text Box ID {idx}: {text}" for idx, text in enumerate(ocr_texts)]  # type: ignore
+        for _, bbox in enumerate(filtered_bboxes_out):
+            if bbox["content"] is None:
+                bbox["content"] = captions.pop(0)
+
+        filtered_bboxes = box_convert(
+            boxes=filtered_bboxes, in_fmt="xyxy", out_fmt="cxcywh"
+        )
+
+        annotated_image = image_np.copy()
+        bboxes_annotate = filtered_bboxes * torch.Tensor([w, h, w, h])
+        xyxy_annotate = box_convert(
+            bboxes_annotate, in_fmt="cxcywh", out_fmt="xyxy"
+        ).numpy()
+        detections = Detections(xyxy=xyxy_annotate)
+        labels = [str(idx) for idx in range(bboxes_annotate.shape[0])]
+
+        annotated_image = self.annotator.annotate(
+            scene=annotated_image,
+            detections=detections,
+            labels=labels,
+            image_size=(w, h),
+        )
+        assert w == annotated_image.shape[1] and h == annotated_image.shape[0]
+
+        out_image = Image.fromarray(annotated_image)
+        out_buffer = io.BytesIO()
+        out_image.save(out_buffer, format="PNG")
+        encoded_image = base64.b64encode(out_buffer.getvalue()).decode("ascii")
+
+        return encoded_image, filtered_bboxes_out
+
+
+def area(bbox: List[int]) -> int:
+    return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
+
+
+def intersection_area(bbox_left: List[int], bbox_right: List[int]) -> int:
+    return max(
+        0, min(bbox_left[2], bbox_right[2]) - max(bbox_left[0], bbox_right[0])
+    ) * max(0, min(bbox_left[3], bbox_right[3]) - max(bbox_left[1], bbox_right[1]))
+
+
+def intersection_over_union(bbox_left: List[int], bbox_right: List[int]) -> float:
+    intersection = intersection_area(bbox_left, bbox_right)
+    bbox_left_area = area(bbox_left)
+    bbox_right_area = area(bbox_right)
+    union = bbox_left_area + bbox_right_area - intersection + 1e-6
+
+    ratio_left, ratio_right = 0, 0
+    if bbox_left_area > 0 and bbox_right_area > 0:
+        ratio_left = intersection / bbox_left_area
+        ratio_right = intersection / bbox_right_area
+    return max(intersection / union, ratio_left, ratio_right)
+
+
+def overlap(bbox_left: List[int], bbox_right: List[int]) -> bool:
+    intersection = intersection_area(bbox_left, bbox_right)
+    ratio_left = intersection / area(bbox_left)
+    return ratio_left > 0.80
+
+
+class BoxAnnotator:
+    def __init__(
+        self,
+        color: Union[Color, ColorPalette] = ColorPalette.DEFAULT,  # type: ignore
+        thickness: int = 3,
+        text_color: Color = Color.BLACK,  # type: ignore
+        text_scale: float = 0.5,
+        text_thickness: int = 2,
+        text_padding: int = 10,
+        avoid_overlap: bool = True,
+    ):
+        self.color: Union[Color, ColorPalette] = color
+        self.thickness: int = thickness
+        self.text_color: Color = text_color
+        self.text_scale: float = text_scale
+        self.text_thickness: int = text_thickness
+        self.text_padding: int = text_padding
+        self.avoid_overlap: bool = avoid_overlap
+
+    def annotate(
+        self,
+        scene: np.ndarray,
+        detections: Detections,
+        labels: Optional[List[str]] = None,
+        skip_label: bool = False,
+        image_size: Optional[Tuple[int, int]] = None,
+    ) -> np.ndarray:
+        font = cv2.FONT_HERSHEY_SIMPLEX
+        for i in range(len(detections)):
+            x1, y1, x2, y2 = detections.xyxy[i].astype(int)
+            class_id = (
+                detections.class_id[i] if detections.class_id is not None else None
+            )
+            idx = class_id if class_id is not None else i
+            color = (
+                self.color.by_idx(idx)
+                if isinstance(self.color, ColorPalette)
+                else self.color
+            )
+            cv2.rectangle(
+                img=scene,
+                pt1=(x1, y1),
+                pt2=(x2, y2),
+                color=color.as_bgr(),
+                thickness=self.thickness,
+            )
+            if skip_label:
+                continue
+
+            text = (
+                f"{class_id}"
+                if (labels is None or len(detections) != len(labels))
+                else labels[i]
+            )
+
+            text_width, text_height = cv2.getTextSize(
+                text=text,
+                fontFace=font,
+                fontScale=self.text_scale,
+                thickness=self.text_thickness,
+            )[0]
+
+            if not self.avoid_overlap:
+                text_x = x1 + self.text_padding
+                text_y = y1 - self.text_padding
+
+                text_background_x1 = x1
+                text_background_y1 = y1 - 2 * self.text_padding - text_height
+
+                text_background_x2 = x1 + 2 * self.text_padding + text_width
+                text_background_y2 = y1
+            else:
+                (
+                    text_x,
+                    text_y,
+                    text_background_x1,
+                    text_background_y1,
+                    text_background_x2,
+                    text_background_y2,
+                ) = self.get_optimal_label_pos(
+                    self.text_padding,
+                    text_width,
+                    text_height,
+                    x1,
+                    y1,
+                    x2,
+                    y2,
+                    detections,
+                    image_size,
+                )
+
+            cv2.rectangle(
+                img=scene,
+                pt1=(text_background_x1, text_background_y1),
+                pt2=(text_background_x2, text_background_y2),
+                color=color.as_bgr(),
+                thickness=cv2.FILLED,
+            )
+            box_color = color.as_rgb()
+            luminance = (
+                0.299 * box_color[0] + 0.587 * box_color[1] + 0.114 * box_color[2]
+            )
+            text_color = (0, 0, 0) if luminance > 160 else (255, 255, 255)
+            cv2.putText(
+                img=scene,
+                text=text,
+                org=(text_x, text_y),
+                fontFace=font,
+                fontScale=self.text_scale,
+                color=text_color,
+                thickness=self.text_thickness,
+                lineType=cv2.LINE_AA,
+            )
+        return scene
+
+    @staticmethod
+    def get_optimal_label_pos(
+        text_padding, text_width, text_height, x1, y1, x2, y2, detections, image_size
+    ):
+        def get_is_overlap(
+            detections,
+            text_background_x1,
+            text_background_y1,
+            text_background_x2,
+            text_background_y2,
+            image_size,
+        ):
+            is_overlap = False
+            for i in range(len(detections)):
+                detection = detections.xyxy[i].astype(int)
+                if (
+                    intersection_over_union(
+                        [
+                            text_background_x1,
+                            text_background_y1,
+                            text_background_x2,
+                            text_background_y2,
+                        ],
+                        detection,
+                    )
+                    > 0.3
+                ):
+                    is_overlap = True
+                    break
+            if (
+                text_background_x1 < 0
+                or text_background_x2 > image_size[0]
+                or text_background_y1 < 0
+                or text_background_y2 > image_size[1]
+            ):
+                is_overlap = True
+            return is_overlap
+
+        text_x = x1 + text_padding
+        text_y = y1 - text_padding
+
+        text_background_x1 = x1
+        text_background_y1 = y1 - 2 * text_padding - text_height
+
+        text_background_x2 = x1 + 2 * text_padding + text_width
+        text_background_y2 = y1
+        is_overlap = get_is_overlap(
+            detections,
+            text_background_x1,
+            text_background_y1,
+            text_background_x2,
+            text_background_y2,
+            image_size,
+        )
+        if not is_overlap:
+            return (
+                text_x,
+                text_y,
+                text_background_x1,
+                text_background_y1,
+                text_background_x2,
+                text_background_y2,
+            )
+
+        text_x = x1 - text_padding - text_width
+        text_y = y1 + text_padding + text_height
+
+        text_background_x1 = x1 - 2 * text_padding - text_width
+        text_background_y1 = y1
+
+        text_background_x2 = x1
+        text_background_y2 = y1 + 2 * text_padding + text_height
+        is_overlap = get_is_overlap(
+            detections,
+            text_background_x1,
+            text_background_y1,
+            text_background_x2,
+            text_background_y2,
+            image_size,
+        )
+        if not is_overlap:
+            return (
+                text_x,
+                text_y,
+                text_background_x1,
+                text_background_y1,
+                text_background_x2,
+                text_background_y2,
+            )
+
+        text_x = x2 + text_padding
+        text_y = y1 + text_padding + text_height
+
+        text_background_x1 = x2
+        text_background_y1 = y1
+
+        text_background_x2 = x2 + 2 * text_padding + text_width
+        text_background_y2 = y1 + 2 * text_padding + text_height
+
+        is_overlap = get_is_overlap(
+            detections,
+            text_background_x1,
+            text_background_y1,
+            text_background_x2,
+            text_background_y2,
+            image_size,
+        )
+        if not is_overlap:
+            return (
+                text_x,
+                text_y,
+                text_background_x1,
+                text_background_y1,
+                text_background_x2,
+                text_background_y2,
+            )
+
+        text_x = x2 - text_padding - text_width
+        text_y = y1 - text_padding
+
+        text_background_x1 = x2 - 2 * text_padding - text_width
+        text_background_y1 = y1 - 2 * text_padding - text_height
+
+        text_background_x2 = x2
+        text_background_y2 = y1
+
+        is_overlap = get_is_overlap(
+            detections,
+            text_background_x1,
+            text_background_y1,
+            text_background_x2,
+            text_background_y2,
+            image_size,
+        )
+        if not is_overlap:
+            return (
+                text_x,
+                text_y,
+                text_background_x1,
+                text_background_y1,
+                text_background_x2,
+                text_background_y2,
+            )
+
+        return (
+            text_x,
+            text_y,
+            text_background_x1,
+            text_background_y1,
+            text_background_x2,
+            text_background_y2,
+        )
requirements.txt ADDED
@@ -0,0 +1,7 @@
+easyocr
+einops==0.8.0
+opencv-python
+opencv-python-headless
+supervision==0.18.0
+timm
+ultralytics==8.3.70
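
In case it helps with review, here is a rough sketch of how the handler could be exercised locally. It assumes the dependencies above (plus `torch` and `transformers`) are installed and that this repository has been cloned locally so that `icon_detect/model.pt` and `icon_caption/` exist under `model_dir`; the local directory name and the sample image URL are placeholders.

```python
# Hypothetical local smoke test for handler.py (run from the repository root so it is importable)
from handler import EndpointHandler

# Assumes a local clone of the model repository, e.g.
#   huggingface-cli download microsoft/OmniParser-v2.0 --local-dir omniparser-v2
handler = EndpointHandler(model_dir="omniparser-v2")

output = handler(
    {
        "inputs": {
            "image": "https://example.com/screenshot.png",  # placeholder: URL or base64-encoded image
            "bbox_threshold": 0.05,
        }
    }
)

# `image` is a base64-encoded annotated PNG, `bboxes` the parsed UI elements
print(len(output["bboxes"]), "elements detected")
```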