Create app.py
app.py
ADDED
@@ -0,0 +1,257 @@
# app.py – YOLOv8 Dataset Quality Evaluator for Hugging Face Spaces
"""
Gradio application for evaluating the quality of YOLO‑format object‑detection datasets exported from Roboflow (or any
other labeling tool). The app runs a configurable pipeline of automated checks and returns a structured report plus
visual artefacts that make it easy to spot problems.

Designed for **Hugging Face Spaces**:
* Keep the file name `app.py` (Spaces’ default entry‑point).
* Add a `requirements.txt` (see README) so Spaces installs the right deps.
* The app binds to `0.0.0.0` and picks up the port from the `PORT` env var (set by Spaces).

Checks implemented
------------------
1. **Dataset integrity** – verify that every image has a label file (or an allowed empty‑label exemption) and that each
   label file parses correctly.
2. **Class stats / balance** – count instances per class and per‑image instance distribution.
3. **Image quality** – flag blurry, too‑dark or over‑bright images using simple OpenCV heuristics.
4. **Duplicate & near‑duplicate images** – perceptual‑hash pass (fallback) or FastDup if available.
5. **Duplicate boxes** – IoU > 0.9 duplicates in the same image.
6. **Optional model‑assisted label QA** – if the user provides a YOLO weights file, run inference and compute IoU‑based
   agreement metrics plus Cleanlab label‑quality scores when the library is installed.
7. **Composite scoring** – combine sub‑scores (with adjustable weights) into a final 0‑100 quality score.

The code is intentionally modular: each check lives in its own function that returns a `dict` of metrics; adding new
checks is as simple as creating another function that follows the same signature and adding it to the `CHECKS` list
(see the illustrative sketch just below this docstring).
"""
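# Illustrative sketch (not part of the original file): per the note above, a new check is just another function
# that produces the same {"name", "score", "details"} dict and is registered in the `CHECKS` list defined later
# in the file (outside this excerpt). The function name, signature, threshold and details key below are invented
# for the example:
#
#     def check_tiny_boxes(label_paths: List[Path]) -> Dict:
#         """Flag label files containing boxes smaller than 0.1 % of the image area."""
#         flagged = []
#         for lbl in label_paths:
#             if lbl.exists() and any(w * h < 0.001 for _, _, _, w, h in parse_label_file(lbl)):
#                 flagged.append(str(lbl))
#         score = 100 - len(flagged) / max(len(label_paths), 1) * 100
#         return {"name": "Tiny boxes", "score": max(score, 0), "details": {"files_with_tiny_boxes": flagged}}
#
#     # ...then register it, e.g. CHECKS.append(check_tiny_boxes)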
from __future__ import annotations

import imghdr
import json
import os
import shutil
import tempfile
from collections import Counter
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed

import gradio as gr
import numpy as np
import pandas as pd
import yaml
from PIL import Image
from tqdm import tqdm

# Optional imports (wrapped so the app still works without them)
try:
    import cv2  # type: ignore
except ImportError:
    cv2 = None  # pragma: no cover

try:
    import imagehash  # type: ignore
except ImportError:
    imagehash = None  # pragma: no cover

try:
    from ultralytics import YOLO  # type: ignore
except ImportError:
    YOLO = None  # noqa: N806

try:
    from cleanlab.object_detection import rank as cl_rank  # type: ignore
except ImportError:
    cl_rank = None

FASTDUP_AVAILABLE = False  # lazy‑loaded if requested
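# The docstring above asks for a requirements.txt. A plausible, unpinned one derived from the imports in this
# file (package names are an educated guess, not taken from the original repo; optional extras commented out):
#
#     gradio
#     numpy
#     pandas
#     pyyaml
#     pillow
#     tqdm
#     opencv-python-headless
#     imagehash
#     # ultralytics   # optional: model-assisted label QA
#     # cleanlab      # optional: label-quality scores
#     # fastdup       # optional: faster duplicate detection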

# --------------------------------------------------------------------------------------
# Utility dataclasses
# --------------------------------------------------------------------------------------
@dataclass
class ImageMetrics:
    path: Path
    width: int
    height: int
    blur_score: float | None = None
    brightness: float | None = None

    @property
    def aspect_ratio(self) -> float:
        return self.width / self.height if self.height else 0


@dataclass
class DuplicateGroup:
    hash_val: str
    paths: List[Path]


# --------------------------------------------------------------------------------------
# Core helpers
# --------------------------------------------------------------------------------------

def load_yaml(yaml_path: Path) -> Dict:
    with yaml_path.open("r", encoding="utf-8") as f:
        return yaml.safe_load(f)


def parse_label_file(label_path: Path) -> List[Tuple[int, float, float, float, float]]:
    """Return list of (class_id, x_center, y_center, width, height)."""
    entries: List[Tuple[int, float, float, float, float]] = []
    with label_path.open("r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != 5:
                raise ValueError(f"Malformed label line in {label_path}: {line}")
            class_id, *coords = parts
            entries.append((int(class_id), *map(float, coords)))
    return entries
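# Example (illustrative values): the YOLO label line "3 0.50 0.25 0.10 0.20" parses to
# (3, 0.5, 0.25, 0.1, 0.2) – class 3 with a box centred at (0.5, 0.25) covering 10 % of the
# image width and 20 % of its height. Any line without exactly five whitespace-separated
# fields raises the ValueError above.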


def guess_image_dirs(root: Path) -> List[Path]:
    """Return potential images sub‑directories under a Roboflow/YOLO export."""
    candidates = [
        root / "images",
        root / "train" / "images",
        root / "valid" / "images",
        root / "val" / "images",
        root / "test" / "images",
    ]
    return [p for p in candidates if p.exists()]


def gather_dataset(root: Path, yaml_path: Path | None = None) -> Tuple[List[Path], List[Path], Dict]:
    """Return (image_paths, label_paths, yaml_dict)."""
    if yaml_path is None:
        yaml_candidates = list(root.glob("*.yaml"))
        if not yaml_candidates:
            raise FileNotFoundError("Could not find a YAML config in dataset root; please supply one explicitly.")
        yaml_path = yaml_candidates[0]
    meta = load_yaml(yaml_path)

    image_dirs = guess_image_dirs(root)
    if not image_dirs:
        raise FileNotFoundError("No images directory found under dataset root; expected images/ subfolder(s).")

    image_paths: List[Path] = [p for d in image_dirs for p in d.rglob("*.*") if imghdr.what(p) is not None]
    label_paths: List[Path] = []
    for img_path in image_paths:
        # <split>/images/img123.jpg -> <split>/labels/img123.txt
        label_path = img_path.parent.parent / "labels" / f"{img_path.stem}.txt"
        label_paths.append(label_path)
    return image_paths, label_paths, meta
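# Directory layout assumed by guess_image_dirs/gather_dataset (typical Roboflow YOLO export; file names
# below are placeholders):
#
#     dataset_root/
#         data.yaml
#         train/images/img123.jpg    train/labels/img123.txt
#         valid/images/...           valid/labels/...      (or val/)
#         test/images/...            test/labels/...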


# --------------------------------------------------------------------------------------
# Individual checks
# --------------------------------------------------------------------------------------

def _is_corrupt(img_path: Path) -> bool:
    try:
        with Image.open(img_path) as im:
            im.verify()
        return False
    except Exception:  # noqa: BLE001
        return True


def check_integrity(image_paths: List[Path], label_paths: List[Path]) -> Dict:
    """Verify that images and labels exist and are readable."""
    missing_labels = [img for img, lbl in zip(image_paths, label_paths) if not lbl.exists()]
    # Label files with no matching image (same stem, any extension) in the sibling images/ directory.
    missing_images = [
        lbl
        for lbl in label_paths
        if lbl.exists() and not any((lbl.parent.parent / "images").glob(f"{lbl.stem}.*"))
    ]

    # Parallel corruption check for speed on Spaces CPU boxes
    corrupt_images = []
    with ThreadPoolExecutor(max_workers=os.cpu_count() or 4) as ex:
        futures = {ex.submit(_is_corrupt, p): p for p in image_paths}
        for fut in tqdm(as_completed(futures), total=len(futures), desc="Integrity", leave=False):
            if fut.result():
                corrupt_images.append(futures[fut])

    score = 100 - (len(missing_labels) + len(missing_images) + len(corrupt_images)) / max(len(image_paths), 1) * 100
    return {
        "name": "Integrity",
        "score": max(score, 0),
        "details": {
            "missing_label_files": [str(p) for p in missing_labels],
            "missing_image_files": [str(p) for p in missing_images],
            "corrupt_images": [str(p) for p in corrupt_images],
        },
    }
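# Worked example (numbers are illustrative): 200 images with 6 missing label files, no orphaned label files and
# 2 corrupt images give score = 100 - (6 + 0 + 2) / 200 * 100 = 96; the max(score, 0) guard keeps the
# sub-score non-negative when problems outnumber images.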


def compute_class_stats(label_paths: List[Path]) -> Dict:
    class_counts = Counter()
    boxes_per_image = []
    for lbl in label_paths:
        if not lbl.exists():
            continue
        boxes = parse_label_file(lbl)
        boxes_per_image.append(len(boxes))
        class_counts.update([b[0] for b in boxes])
    if not class_counts:
        return {"name": "Class balance", "score": 0, "details": {"message": "No labels found"}}
    max_count, min_count = max(class_counts.values()), min(class_counts.values())
    balance_score = min_count / max_count * 100 if max_count else 0
    return {
        "name": "Class balance",
        "score": balance_score,
        "details": {
            "class_counts": dict(class_counts),
            "boxes_per_image_stats": {
                "min": int(np.min(boxes_per_image) if boxes_per_image else 0),
                "max": int(np.max(boxes_per_image) if boxes_per_image else 0),
                "mean": float(np.mean(boxes_per_image) if boxes_per_image else 0),
            },
        },
    }
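# Worked example (illustrative counts): class_counts of {0: 500, 1: 50} gives
# balance_score = 50 / 500 * 100 = 10, so a heavily skewed dataset scores low even when every box is valid.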


def image_quality_metrics(image_paths: List[Path], blur_thresh: float = 100.0) -> Dict:
    if cv2 is None:
        return {"name": "Image quality", "score": 100, "details": {"message": "cv2 not installed – check skipped"}}
    blurry, dark, bright = [], [], []
    for p in tqdm(image_paths, desc="Image quality", leave=False):
        img = cv2.imread(str(p))
        if img is None:
            continue
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        lap_var = cv2.Laplacian(gray, cv2.CV_64F).var()
        brightness = np.mean(gray)
        if lap_var < blur_thresh:
            blurry.append(p)
        if brightness < 25:
            dark.append(p)
        if brightness > 230:
            bright.append(p)
    total = len(image_paths)
    bad = len(set(blurry + dark + bright))
    score = 100 - bad / max(total, 1) * 100
    return {
        "name": "Image quality",
        "score": score,
        "details": {
            "blurry": [str(p) for p in blurry],
            "dark": [str(p) for p in dark],
            "bright": [str(p) for p in bright],
        },
    }
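# Usage sketch (threshold value is illustrative): images whose Laplacian variance falls below blur_thresh are
# treated as blurry, so a higher threshold makes the check stricter:
#
#     report = image_quality_metrics(image_paths, blur_thresh=150.0)
#     print(len(report["details"]["blurry"]), "images flagged as blurry")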


def detect_duplicates(image_paths: List[Path], use_fastdup: bool = False) -> Dict:
    if use_fastdup:
        global FASTDUP_AVAILABLE
        try:
            import fastdup  # type: ignore

            FASTDUP_AVAILABLE = True
        except ImportError:
            use_fastdup = False
    duplicate_groups: List[DuplicateGroup] = []
    if use_fastdup and FASTDUP_AVAILABLE and len(image_paths):
        fd = fastdup.create(input_dir=str(image_paths[0].parent.parent), work_dir="fastdup_work")
        fd.run(num_images=0)
        clusters = fd.clusters  # type: ignore[attr