wuhp commited on
Commit
e9206ba
·
verified ·
1 Parent(s): 344d9e8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +257 -0
app.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py – YOLOv8 Dataset Quality Evaluator for Hugging Face Spaces
2
+ """
3
+ Gradio application for evaluating the quality of YOLO‑format object‑detection datasets exported from Roboflow (or any
4
+ other labeling tool). The app runs a configurable pipeline of automated checks and returns a structured report plus
5
+ visual artefacts that make it easy to spot problems.
6
+
7
+ Designed for **Hugging Face Spaces**:
8
+ * Keep the file name `app.py` (Spaces’ default entry‑point).
9
+ * Add a `requirements.txt` (see README) so Spaces installs the right deps.
10
+ * The app binds to `0.0.0.0` and picks up the port from the `PORT` env var (set by Spaces).
11
+
12
+ Checks implemented
13
+ ------------------
14
+ 1. **Dataset integrity** – verify that every image has a label file (or an allowed empty‑label exemption) and that each
15
+ label file parses correctly.
16
+ 2. **Class stats / balance** – count instances per class and per‑image instance distribution.
17
+ 3. **Image quality** – flag blurry, too‑dark or over‑bright images using simple OpenCV heuristics.
18
+ 4. **Duplicate & near‑duplicate images** – perceptual‑hash pass (fallback) or FastDup if available.
19
+ 5. **Duplicate boxes** – IoU > 0.9 duplicates in the same image.
20
+ 6. **Optional model‑assisted label QA** – if the user provides a YOLO weights file, run inference and compute IoU‑based
21
+ agreement metrics plus Cleanlab label‑quality scores when the library is installed.
22
+ 7. **Composite scoring** – combine sub‑scores (with adjustable weights) into a final 0‑100 quality score.
23
+
24
+ The code is intentionally modular: each check lives in its own function that returns a `dict` of metrics; adding new
25
+ checks is as simple as creating another function that follows the same signature and adding it to the `CHECKS` list.
26
+ """
27
+ from __future__ import annotations
28
+
29
+ import imghdr
30
+ import json
31
+ import os
32
+ import shutil
33
+ import tempfile
34
+ from collections import Counter
35
+ from dataclasses import dataclass
36
+ from pathlib import Path
37
+ from typing import Dict, List, Tuple
38
+ from concurrent.futures import ThreadPoolExecutor, as_completed
39
+
40
+ import gradio as gr
41
+ import numpy as np
42
+ import pandas as pd
43
+ import yaml
44
+ from PIL import Image
45
+ from tqdm import tqdm
46
+
47
+ # Optional imports (wrapped so the app still works without them)
48
+ try:
49
+ import cv2 # type: ignore
50
+ except ImportError:
51
+ cv2 = None # pragma: no cover
52
+
53
+ try:
54
+ import imagehash # type: ignore
55
+ except ImportError:
56
+ imagehash = None # pragma: no cover
57
+
58
+ try:
59
+ from ultralytics import YOLO # type: ignore
60
+ except ImportError:
61
+ YOLO = None # noqa: N806
62
+
63
+ try:
64
+ from cleanlab.object_detection import rank as cl_rank # type: ignore
65
+ except ImportError:
66
+ cl_rank = None
67
+
68
+ FASTDUP_AVAILABLE = False # lazy‑loaded if requested
69
+
70
+ # --------------------------------------------------------------------------------------
71
+ # Utility dataclasses
72
+ # --------------------------------------------------------------------------------------
73
@dataclass
class ImageMetrics:
    """Per-image measurements collected during the quality checks."""

    path: Path                       # location of the image file
    width: int                       # pixel width
    height: int                      # pixel height
    blur_score: float | None = None  # variance-of-Laplacian; None until measured
    brightness: float | None = None  # mean gray level; None until measured

    @property
    def aspect_ratio(self) -> float:
        """Width/height ratio; 0 when the height is zero or unset."""
        if not self.height:
            return 0
        return self.width / self.height
84
+
85
+
86
@dataclass
class DuplicateGroup:
    """A bucket of images that collided on the same hash value."""

    hash_val: str      # the shared hash (string form)
    paths: List[Path]  # every image whose hash matched
90
+
91
+
92
+ # --------------------------------------------------------------------------------------
93
+ # Core helpers
94
+ # --------------------------------------------------------------------------------------
95
+
96
def load_yaml(yaml_path: Path) -> Dict:
    """Read *yaml_path* (UTF-8) and return the parsed YAML document."""
    raw = yaml_path.read_text(encoding="utf-8")
    return yaml.safe_load(raw)
99
+
100
+
101
def parse_label_file(label_path: Path) -> List[Tuple[int, float, float, float, float]]:
    """Parse a YOLO-format label file.

    Each non-empty line must hold exactly five whitespace-separated fields:
    ``class_id x_center y_center width height``. Blank lines (YOLO exports
    commonly end with a trailing newline) are skipped instead of raising,
    which the previous implementation did.

    Returns:
        List of ``(class_id, x_center, y_center, width, height)`` tuples.

    Raises:
        ValueError: on a non-empty line with the wrong field count or a
            non-numeric field.
    """
    entries: List[Tuple[int, float, float, float, float]] = []
    with label_path.open("r", encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if not parts:  # blank / whitespace-only line: harmless, skip
                continue
            if len(parts) != 5:
                raise ValueError(f"Malformed label line in {label_path}: {line}")
            class_id, *coords = parts
            try:
                entries.append((int(class_id), *map(float, coords)))
            except ValueError as err:
                # Re-raise with file context so the offending line is reported.
                raise ValueError(f"Malformed label line in {label_path}: {line}") from err
    return entries
112
+
113
+
114
def guess_image_dirs(root: Path) -> List[Path]:
    """Return the existing images/ folders of a Roboflow/YOLO export.

    Probes the flat layout (``root/images``) and the split layouts
    (``root/{train,valid,val,test}/images``), in that order.
    """
    found: List[Path] = []
    for split in ("", "train", "valid", "val", "test"):
        candidate = root / split / "images" if split else root / "images"
        if candidate.exists():
            found.append(candidate)
    return found
124
+
125
+
126
def gather_dataset(root: Path, yaml_path: Path | None = None) -> Tuple[List[Path], List[Path], Dict]:
    """Collect the dataset's files: return (image_paths, label_paths, yaml_dict).

    When *yaml_path* is None the first ``*.yaml`` under *root* (sorted for
    determinism — ``glob`` order is filesystem-dependent) is used.
    ``label_paths[i]`` is the expected label for ``image_paths[i]`` and may
    not exist on disk; integrity checks handle that downstream.

    Raises:
        FileNotFoundError: if no YAML config or no images/ folder is found.
    """
    if yaml_path is None:
        yaml_candidates = sorted(root.glob("*.yaml"))
        if not yaml_candidates:
            raise FileNotFoundError("Could not find a YAML config in dataset root; please supply explicitly.")
        yaml_path = yaml_candidates[0]
    meta = load_yaml(yaml_path)

    image_dirs = guess_image_dirs(root)
    if not image_dirs:
        raise FileNotFoundError("No images directory found under dataset root; expected images/ subfolder(s).")

    def _is_image(p: Path) -> bool:
        # rglob("*.*") can match directories with a dot in the name, on which
        # imghdr.what() raises; unreadable files raise OSError. Both were
        # previously unhandled crashes.
        # NOTE(review): imghdr is deprecated and removed in Python 3.13 —
        # migrate to Pillow-based sniffing when convenient.
        if not p.is_file():
            return False
        try:
            return imghdr.what(p) is not None
        except OSError:
            return False

    image_paths: List[Path] = [p for d in image_dirs for p in sorted(d.rglob("*.*")) if _is_image(p)]
    label_paths: List[Path] = []
    for img_path in image_paths:
        # <split>/images/img123.jpg -> <split>/labels/img123.txt
        label_paths.append(img_path.parent.parent / "labels" / f"{img_path.stem}.txt")
    return image_paths, label_paths, meta
146
+
147
+
148
+ # --------------------------------------------------------------------------------------
149
+ # Individual checks
150
+ # --------------------------------------------------------------------------------------
151
+
152
def _is_corrupt(img_path: Path) -> bool:
    """Return True when Pillow cannot verify *img_path* (unreadable/truncated)."""
    try:
        with Image.open(img_path) as handle:
            handle.verify()
    except Exception:  # noqa: BLE001 — any open/verify failure counts as corrupt
        return True
    return False
159
+
160
+
161
def check_integrity(image_paths: List[Path], label_paths: List[Path]) -> Dict:
    """Verify image/label pairing and image readability.

    *image_paths* and *label_paths* are parallel lists: ``label_paths[i]``
    is the expected label file for ``image_paths[i]``.

    Sub-checks:
      * images whose expected label file does not exist;
      * orphan label files — ``*.txt`` present in a labels/ directory but
        not paired with any discovered image (reported under
        ``missing_image_files``);
      * images Pillow cannot decode (checked in a thread pool for speed).

    Returns a report dict with a 0-100 sub-score.
    """
    missing_labels = [img for img, lbl in zip(image_paths, label_paths) if not lbl.exists()]

    # BUGFIX: the previous check tested `lbl.with_name("images").exists()`,
    # i.e. the bogus path <labels>/images, which essentially never exists —
    # so every valid label file was flagged and the score collapsed.
    # Instead, report label files on disk that have no matching image.
    expected = set(label_paths)
    orphan_labels = [
        txt
        for lbl_dir in sorted({lbl.parent for lbl in label_paths})
        if lbl_dir.exists()
        for txt in sorted(lbl_dir.glob("*.txt"))
        if txt not in expected
    ]

    # Parallel corruption check for speed on Spaces CPU boxes
    corrupt_images: List[Path] = []
    with ThreadPoolExecutor(max_workers=os.cpu_count() or 4) as ex:
        futures = {ex.submit(_is_corrupt, p): p for p in image_paths}
        for fut in tqdm(as_completed(futures), total=len(futures), desc="Integrity", leave=False):
            if fut.result():
                corrupt_images.append(futures[fut])

    bad = len(missing_labels) + len(orphan_labels) + len(corrupt_images)
    score = 100 - bad / max(len(image_paths), 1) * 100
    return {
        "name": "Integrity",
        "score": max(score, 0),
        "details": {
            "missing_label_files": [str(p) for p in missing_labels],
            "missing_image_files": [str(p) for p in orphan_labels],
            "corrupt_images": [str(p) for p in corrupt_images],
        },
    }
184
+
185
+
186
def compute_class_stats(label_paths: List[Path]) -> Dict:
    """Aggregate per-class instance counts and per-image box statistics.

    Label files that do not exist on disk are skipped. The balance score is
    the ratio of the rarest to the most common class, scaled to 0-100.
    """
    counts: Counter = Counter()
    per_image: List[int] = []
    for label_file in label_paths:
        if not label_file.exists():
            continue
        entries = parse_label_file(label_file)
        per_image.append(len(entries))
        counts.update(entry[0] for entry in entries)

    if not counts:
        return {"name": "Class balance", "score": 0, "details": {"message": "No labels found"}}

    hi, lo = max(counts.values()), min(counts.values())
    balance = lo / hi * 100 if hi else 0
    box_stats = {
        "min": int(np.min(per_image) if per_image else 0),
        "max": int(np.max(per_image) if per_image else 0),
        "mean": float(np.mean(per_image) if per_image else 0),
    }
    return {
        "name": "Class balance",
        "score": balance,
        "details": {
            "class_counts": dict(counts),
            "boxes_per_image_stats": box_stats,
        },
    }
211
+
212
+
213
def image_quality_metrics(image_paths: List[Path], blur_thresh: float = 100.0) -> Dict:
    """Flag blurry, too-dark and over-bright images via OpenCV heuristics.

    Blur is estimated by the variance of the Laplacian of the grayscale
    image (below *blur_thresh* counts as blurry); brightness is the mean
    gray level (dark < 25, bright > 230). When OpenCV is missing the check
    is skipped with a perfect score so the pipeline still runs.
    """
    if cv2 is None:
        return {"name": "Image quality", "score": 100, "details": {"message": "cv2 not installed – check skipped"}}

    flagged: Dict[str, List[Path]] = {"blurry": [], "dark": [], "bright": []}
    for img_path in tqdm(image_paths, desc="Image quality", leave=False):
        data = cv2.imread(str(img_path))
        if data is None:  # unreadable by OpenCV — leave to the integrity check
            continue
        gray = cv2.cvtColor(data, cv2.COLOR_BGR2GRAY)
        if cv2.Laplacian(gray, cv2.CV_64F).var() < blur_thresh:
            flagged["blurry"].append(img_path)
        mean_level = np.mean(gray)
        if mean_level < 25:
            flagged["dark"].append(img_path)
        if mean_level > 230:
            flagged["bright"].append(img_path)

    n_bad = len(set(flagged["blurry"] + flagged["dark"] + flagged["bright"]))
    score = 100 - n_bad / max(len(image_paths), 1) * 100
    return {
        "name": "Image quality",
        "score": score,
        "details": {key: [str(p) for p in paths] for key, paths in flagged.items()},
    }
242
+
243
+
244
+ def detect_duplicates(image_paths: List[Path], use_fastdup: bool = False) -> Dict:
245
+ if use_fastdup:
246
+ global FASTDUP_AVAILABLE
247
+ try:
248
+ import fastdup # type: ignore
249
+
250
+ FASTDUP_AVAILABLE = True
251
+ except ImportError:
252
+ use_fastdup = False
253
+ duplicate_groups: List[DuplicateGroup] = []
254
+ if use_fastdup and FASTDUP_AVAILABLE and len(image_paths):
255
+ fd = fastdup.create(input_dir=str(image_paths[0].parent.parent), work_dir="fastdup_work")
256
+ fd.run(num_images=0)
257
+ clusters = fd.clusters # type: ignore[attr