Update app.py
Browse files
app.py
CHANGED
@@ -1,18 +1,16 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
# Hugging Face Spaces picks up `app.py` automatically. Dependencies go in `requirements.txt`.
|
15 |
-
# Spaces injects the port as $PORT — we pass it to demo.launch().
|
16 |
|
17 |
from __future__ import annotations
|
18 |
|
@@ -23,7 +21,7 @@ import re
|
|
23 |
import shutil
|
24 |
import tempfile
|
25 |
from collections import Counter, defaultdict
|
26 |
-
from concurrent.futures import
|
27 |
from dataclasses import dataclass
|
28 |
from pathlib import Path
|
29 |
from typing import Dict, List, Tuple
|
@@ -35,9 +33,7 @@ import yaml
|
|
35 |
from PIL import Image
|
36 |
from tqdm import tqdm
|
37 |
|
38 |
-
#
|
39 |
-
# Optional heavy deps β present locally, but fineβgrained to keep Spaces slim #
|
40 |
-
# --------------------------------------------------------------------------- #
|
41 |
try:
|
42 |
import cv2 # type: ignore
|
43 |
except ImportError:
|
@@ -48,6 +44,11 @@ try:
|
|
48 |
except ImportError:
|
49 |
imagehash = None
|
50 |
|
|
|
|
|
|
|
|
|
|
|
51 |
try:
|
52 |
from ultralytics import YOLO # type: ignore
|
53 |
except ImportError:
|
@@ -58,36 +59,44 @@ try:
|
|
58 |
except ImportError:
|
59 |
Roboflow = None # type: ignore
|
60 |
|
61 |
-
#
|
62 |
TMP_ROOT = Path(tempfile.gettempdir()) / "rf_datasets"
|
63 |
TMP_ROOT.mkdir(parents=True, exist_ok=True)
|
64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
@dataclass
|
67 |
class DuplicateGroup:
|
68 |
hash_val: str
|
69 |
paths: List[Path]
|
70 |
|
|
|
71 |
|
72 |
-
# --------------------------------------------------------------------------- #
|
73 |
-
# Generic helpers #
|
74 |
-
# --------------------------------------------------------------------------- #
|
75 |
def load_yaml(path: Path) -> Dict:
|
76 |
with path.open(encoding="utf-8") as f:
|
77 |
return yaml.safe_load(f)
|
78 |
|
79 |
|
80 |
-
def parse_label_file(path: Path) ->
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
return out
|
91 |
|
92 |
|
93 |
def guess_image_dirs(root: Path) -> List[Path]:
|
@@ -114,13 +123,13 @@ def gather_dataset(root: Path, yaml_path: Path | None = None):
|
|
114 |
raise FileNotFoundError("images/ directory hierarchy missing")
|
115 |
|
116 |
imgs = [p for d in img_dirs for p in d.rglob("*.*") if imghdr.what(p) is not None]
|
117 |
-
|
|
|
118 |
return imgs, lbls, meta
|
119 |
|
|
|
|
|
120 |
|
121 |
-
# --------------------------------------------------------------------------- #
|
122 |
-
# Qualityβcheck stages #
|
123 |
-
# --------------------------------------------------------------------------- #
|
124 |
def _is_corrupt(path: Path) -> bool:
|
125 |
try:
|
126 |
with Image.open(path) as im:
|
@@ -130,40 +139,38 @@ def _is_corrupt(path: Path) -> bool:
|
|
130 |
return True
|
131 |
|
132 |
|
133 |
-
def qc_integrity(imgs: List[Path], lbls: List[Path])
|
134 |
-
miss_lbl = [i for i, l in zip(imgs, lbls) if
|
135 |
-
miss_img = [l for l in lbls if l.exists() and not (l.parent.parent / "images" / f"{l.stem}{l.suffix}").exists()]
|
136 |
-
|
137 |
corrupt: List[Path] = []
|
138 |
-
with
|
139 |
fut = {ex.submit(_is_corrupt, p): p for p in imgs}
|
140 |
for f in tqdm(as_completed(fut), total=len(fut), desc="integrity", leave=False):
|
141 |
if f.result():
|
142 |
corrupt.append(fut[f])
|
143 |
|
144 |
-
score = 100 - (len(miss_lbl) + len(
|
145 |
return {
|
146 |
"name": "Integrity",
|
147 |
"score": max(score, 0),
|
148 |
"details": {
|
149 |
"missing_label_files": [str(p) for p in miss_lbl],
|
150 |
-
"missing_image_files": [str(p) for p in miss_img],
|
151 |
"corrupt_images": [str(p) for p in corrupt],
|
152 |
},
|
153 |
}
|
154 |
|
|
|
155 |
|
156 |
-
def qc_class_balance(lbls: List[Path])
|
157 |
cls_counts = Counter()
|
158 |
boxes_per_img = []
|
159 |
for l in lbls:
|
160 |
-
bs = parse_label_file(l)
|
161 |
boxes_per_img.append(len(bs))
|
162 |
cls_counts.update(b[0] for b in bs)
|
163 |
|
164 |
if not cls_counts:
|
165 |
return {"name": "Class balance", "score": 0, "details": "No labels"}
|
166 |
-
bal = min(cls_counts.values()) / max(cls_counts.values()) * 100
|
167 |
return {
|
168 |
"name": "Class balance",
|
169 |
"score": bal,
|
@@ -177,24 +184,39 @@ def qc_class_balance(lbls: List[Path]) -> Dict:
|
|
177 |
},
|
178 |
}
|
179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
|
181 |
-
|
|
|
182 |
if cv2 is None:
|
183 |
return {"name": "Image quality", "score": 100, "details": "cv2 not installed"}
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
|
|
|
|
|
|
|
|
198 |
|
199 |
bad = len(set(blurry + dark + bright))
|
200 |
score = 100 - bad / max(len(imgs), 1) * 100
|
@@ -208,15 +230,40 @@ def qc_image_quality(imgs: List[Path], blur_thr: float = 100.0) -> Dict:
|
|
208 |
},
|
209 |
}
|
210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
|
212 |
-
def qc_duplicates(imgs: List[Path]) -> Dict:
|
213 |
if imagehash is None:
|
214 |
-
return {"name": "Duplicates", "score": 100, "details": "
|
|
|
|
|
|
|
215 |
|
216 |
hashes: Dict[str, List[Path]] = defaultdict(list)
|
217 |
-
|
218 |
-
h
|
219 |
-
|
|
|
|
|
|
|
|
|
|
|
220 |
|
221 |
groups = [g for g in hashes.values() if len(g) > 1]
|
222 |
dup = sum(len(g) - 1 for g in groups)
|
@@ -224,9 +271,10 @@ def qc_duplicates(imgs: List[Path]) -> Dict:
|
|
224 |
return {
|
225 |
"name": "Duplicates",
|
226 |
"score": score,
|
227 |
-
"details": {"groups": [[str(p) for p in g] for g in groups]},
|
228 |
}
|
229 |
|
|
|
230 |
|
231 |
def _rel_iou(b1, b2):
|
232 |
x1, y1, w1, h1 = b1
|
@@ -234,58 +282,48 @@ def _rel_iou(b1, b2):
|
|
234 |
xa1, ya1, xa2, ya2 = x1 - w1 / 2, y1 - h1 / 2, x1 + w1 / 2, y1 + h1 / 2
|
235 |
xb1, yb1, xb2, yb2 = x2 - w2 / 2, y2 - h2 / 2, x2 + w2 / 2, y2 + h2 / 2
|
236 |
ix1, iy1, ix2, iy2 = max(xa1, xb1), max(ya1, yb1), min(xa2, xb2), min(ya2, yb2)
|
237 |
-
|
238 |
-
inter = iw * ih
|
239 |
union = w1 * h1 + w2 * h2 - inter
|
240 |
-
return inter / union if union else 0
|
241 |
|
242 |
|
243 |
-
def qc_model_qa(imgs: List[Path], weights: str | None, lbls: List[Path], iou_thr: float = 0.5)
|
244 |
if weights is None or YOLO is None:
|
245 |
-
return {"name": "Model QA", "score": 100, "details": "
|
246 |
|
247 |
model = YOLO(weights)
|
248 |
ious, mism = [], []
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
res
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
|
|
|
|
|
|
263 |
|
264 |
miou = float(np.mean(ious)) if ious else 1.0
|
265 |
return {
|
266 |
"name": "Model QA",
|
267 |
"score": miou * 100,
|
268 |
-
"details": {"mean_iou": miou, "mismatched_images":
|
269 |
}
|
270 |
|
271 |
-
|
272 |
-
# --------------------------------------------------------------------------- #
|
273 |
-
DEFAULT_W = {
|
274 |
-
"Integrity": 0.30,
|
275 |
-
"Class balance": 0.15,
|
276 |
-
"Image quality": 0.15,
|
277 |
-
"Duplicates": 0.10,
|
278 |
-
"Model QA": 0.30,
|
279 |
-
}
|
280 |
-
|
281 |
|
282 |
def aggregate(scores):
|
283 |
return sum(DEFAULT_W.get(r["name"], 0) * r["score"] for r in scores)
|
284 |
|
285 |
-
|
286 |
-
# --------------------------------------------------------------------------- #
|
287 |
-
# Roboflow helpers #
|
288 |
-
# --------------------------------------------------------------------------- #
|
289 |
RF_RE = re.compile(r"https://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")
|
290 |
|
291 |
def download_rf_dataset(url: str, rf_api: "Roboflow", dest: Path) -> Path:
|
@@ -302,10 +340,8 @@ def download_rf_dataset(url: str, rf_api: "Roboflow", dest: Path) -> Path:
|
|
302 |
project.version(int(ver)).download("yolov8", location=str(ds_dir))
|
303 |
return ds_dir
|
304 |
|
|
|
305 |
|
306 |
-
# --------------------------------------------------------------------------- #
|
307 |
-
# Main evaluation logic #
|
308 |
-
# --------------------------------------------------------------------------- #
|
309 |
def run_quality(root: Path, yaml_override: Path | None, weights: Path | None):
|
310 |
imgs, lbls, meta = gather_dataset(root, yaml_override)
|
311 |
res = [
|
@@ -316,8 +352,8 @@ def run_quality(root: Path, yaml_override: Path | None, weights: Path | None):
|
|
316 |
qc_model_qa(imgs, str(weights) if weights else None, lbls),
|
317 |
]
|
318 |
final = aggregate(res)
|
319 |
-
|
320 |
-
md = [f"## **{meta.get('name', root.name)}**
|
321 |
for r in res:
|
322 |
md.append(f"### {r['name']}Β Β {r['score']:.1f}")
|
323 |
md.append("<details><summary>details</summary>\n\n```json")
|
@@ -330,10 +366,8 @@ def run_quality(root: Path, yaml_override: Path | None, weights: Path | None):
|
|
330 |
df.index.name = "class"
|
331 |
return md_str, df
|
332 |
|
|
|
333 |
|
334 |
-
# --------------------------------------------------------------------------- #
|
335 |
-
# Gradio interface #
|
336 |
-
# --------------------------------------------------------------------------- #
|
337 |
def evaluate(
|
338 |
api_key: str,
|
339 |
url_txt: gr.File | None,
|
@@ -347,7 +381,7 @@ def evaluate(
|
|
347 |
|
348 |
reports, dfs = [], []
|
349 |
|
350 |
-
#
|
351 |
if url_txt:
|
352 |
if Roboflow is None:
|
353 |
return "`roboflow` not installed", pd.DataFrame()
|
@@ -355,8 +389,7 @@ def evaluate(
|
|
355 |
return "Enter Roboflow API key", pd.DataFrame()
|
356 |
|
357 |
rf = Roboflow(api_key=api_key.strip())
|
358 |
-
|
359 |
-
for line in txt_lines:
|
360 |
if not line.strip():
|
361 |
continue
|
362 |
try:
|
@@ -367,7 +400,7 @@ def evaluate(
|
|
367 |
except Exception as e:
|
368 |
reports.append(f"### {line}\n\nβ οΈΒ {e}")
|
369 |
|
370 |
-
#
|
371 |
if zip_file:
|
372 |
tmp_dir = Path(tempfile.mkdtemp())
|
373 |
shutil.unpack_archive(zip_file.name, tmp_dir)
|
@@ -376,7 +409,7 @@ def evaluate(
|
|
376 |
dfs.append(df)
|
377 |
shutil.rmtree(tmp_dir, ignore_errors=True)
|
378 |
|
379 |
-
#
|
380 |
if server_path:
|
381 |
md, df = run_quality(Path(server_path), Path(yaml_file.name) if yaml_file else None, Path(weights.name) if weights else None)
|
382 |
reports.append(md)
|
@@ -386,7 +419,7 @@ def evaluate(
|
|
386 |
combined_df = pd.concat(dfs).groupby(level=0).sum() if dfs else pd.DataFrame()
|
387 |
return summary_md, combined_df
|
388 |
|
389 |
-
|
390 |
with gr.Blocks(title="YOLO Dataset Quality Evaluator") as demo:
|
391 |
gr.Markdown(
|
392 |
"""
|
@@ -396,7 +429,7 @@ with gr.Blocks(title="YOLO Dataset Quality Evaluator") as demo:
|
|
396 |
1. Paste your **Roboflow API key**
|
397 |
2. Upload a **.txt** file β one `https://universe.roboflow.com/.../dataset/x` per line
|
398 |
|
399 |
-
### Manual
|
400 |
* Upload a dataset **ZIP** or type a dataset **path** on the server
|
401 |
* Optionally supply a custom **data.yaml** and/or a **YOLOΒ .pt** weights file for modelβassisted QA
|
402 |
"""
|
|
|
1 |
+
"""
|
2 |
+
app.py — Roboflow-aware YOLOv8 Dataset Quality Evaluator (v2)
|
3 |
+
|
4 |
+
Changelog (2025-04-17)
|
5 |
+
──────────────────────
|
6 |
+
• **CPU-bound loops parallelised** with `concurrent.futures.ProcessPoolExecutor`.
|
7 |
+
• **Batch inference** in `qc_model_qa()` (GPU util ↑, latency ↓).
|
8 |
+
• Optional **fastdup** path for duplicate detection (≈ 10× faster on large sets).
|
9 |
+
• Faster NumPy-based `parse_label_file()`.
|
10 |
+
• Small refactors — clearer separation of stages & fewer globals.
|
11 |
+
• Graceful degradation if heavy deps unavailable (cv2, imagehash, fastdup).
|
12 |
+
• Tunable `CPU_COUNT` + env-var guard for HF Spaces quota.
|
13 |
+
"""
|
|
|
|
|
14 |
|
15 |
from __future__ import annotations
|
16 |
|
|
|
21 |
import shutil
|
22 |
import tempfile
|
23 |
from collections import Counter, defaultdict
|
24 |
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
25 |
from dataclasses import dataclass
|
26 |
from pathlib import Path
|
27 |
from typing import Dict, List, Tuple
|
|
|
33 |
from PIL import Image
|
34 |
from tqdm import tqdm
|
35 |
|
36 |
+
# ───────────────────────────────────────── Heavy optional deps ──
|
|
|
|
|
37 |
try:
|
38 |
import cv2 # type: ignore
|
39 |
except ImportError:
|
|
|
44 |
except ImportError:
|
45 |
imagehash = None
|
46 |
|
47 |
+
try:
|
48 |
+
import fastdup # type: ignore
|
49 |
+
except ImportError:
|
50 |
+
fastdup = None
|
51 |
+
|
52 |
try:
|
53 |
from ultralytics import YOLO # type: ignore
|
54 |
except ImportError:
|
|
|
59 |
except ImportError:
|
60 |
Roboflow = None # type: ignore
|
61 |
|
62 |
+
# ───────────────────────────────────────── Config & constants ──
|
63 |
# Scratch directory for downloaded datasets; created eagerly at import time.
TMP_ROOT = Path(tempfile.gettempdir()) / "rf_datasets"
TMP_ROOT.mkdir(parents=True, exist_ok=True)


def _env_int(name: str, default: int) -> int:
    """Return environment variable *name* as an integer that is at least 1.

    Falls back to *default* when the variable is unset or is not a valid
    integer. The result is clamped to 1 because the values read here are
    used as a process-pool size and as a ``range`` step, both of which
    reject zero and negative numbers.
    """
    raw = os.environ.get(name)
    if raw is None:
        return max(default, 1)
    try:
        return max(int(raw), 1)
    except ValueError:
        return max(default, 1)


# Limit CPU workers on HF Spaces (feel free to raise locally)
CPU_COUNT = _env_int("QC_CPU", max(1, (os.cpu_count() or 4) // 2))
# Number of images handed to the model per predict() call in qc_model_qa().
BATCH = _env_int("QC_BATCH", 16)

# Per-stage weights applied by aggregate(); keys match each stage's "name"
# field and the weights sum to 1.0.
DEFAULT_W = {
    "Integrity": 0.30,
    "Class balance": 0.15,
    "Image quality": 0.15,
    "Duplicates": 0.10,
    "Model QA": 0.30,
}
|
77 |
|
78 |
@dataclass
class DuplicateGroup:
    """A set of image files that share one hash value.

    NOTE(review): no visible code constructs this class; qc_duplicates()
    reports plain lists of path strings instead. Confirm it is still needed.
    """

    # Hash string common to every file in the group.
    hash_val: str
    # Files that produced ``hash_val``.
    paths: List[Path]
|
82 |
|
83 |
+
# ───────────────────────────────────────── Generic helpers ─────
|
84 |
|
|
|
|
|
|
|
85 |
def load_yaml(path: Path) -> Dict:
    """Load the YAML document stored at *path*.

    Args:
        path: File to read; it is decoded as UTF-8.

    Returns:
        The parsed document. An empty file (for which ``yaml.safe_load``
        yields ``None``) is returned as an empty dict so callers can use
        ``.get()`` on the result unconditionally.
    """
    with path.open(encoding="utf-8") as f:
        return yaml.safe_load(f) or {}
|
88 |
|
89 |
|
90 |
+
def parse_label_file(path: Path) -> list[tuple[int, float, float, float, float]]:
    """Read a whitespace-separated label file into ``(cls, x, y, w, h)`` rows.

    Args:
        path: Label file with one box per line: a class id followed by at
            least four numeric fields.

    Returns:
        One ``(int, float, float, float, float)`` tuple per line. Columns
        beyond the fifth are dropped so that every row can be unpacked as
        ``cls, x, y, w, h``. An empty list is returned when the file is
        missing, empty, unreadable, has rows of differing length, or has
        fewer than five columns.
    """
    if not path.exists() or path.stat().st_size == 0:
        return []
    try:
        # ndmin=2 keeps a single-line file as one row instead of a flat vector.
        arr = np.loadtxt(path, dtype=float, ndmin=2)
        if arr.size == 0 or arr.shape[1] < 5:
            return []
        # The class id is stored as text like "3"; hand it back as a real int
        # so it matches the annotation and hashes/prints as a class index.
        return [
            (int(row[0]), float(row[1]), float(row[2]), float(row[3]), float(row[4]))
            for row in arr
        ]
    except (OSError, ValueError):
        # Unreadable file, non-numeric token, or ragged rows.
        return []
|
|
|
100 |
|
101 |
|
102 |
def guess_image_dirs(root: Path) -> List[Path]:
|
|
|
123 |
raise FileNotFoundError("images/ directory hierarchy missing")
|
124 |
|
125 |
imgs = [p for d in img_dirs for p in d.rglob("*.*") if imghdr.what(p) is not None]
|
126 |
+
labels_root = {d.parent / "labels" for d in img_dirs}
|
127 |
+
lbls = [next((lr / f"{p.stem}.txt" for lr in labels_root if (lr / f"{p.stem}.txt").exists()), None) for p in imgs]
|
128 |
return imgs, lbls, meta
|
129 |
|
130 |
+
# ───────────────────────────────────────── Quality checks ─────
|
131 |
+
# Integrity -----------------------------------------------------
|
132 |
|
|
|
|
|
|
|
133 |
def _is_corrupt(path: Path) -> bool:
|
134 |
try:
|
135 |
with Image.open(path) as im:
|
|
|
139 |
return True
|
140 |
|
141 |
|
142 |
+
def qc_integrity(imgs: List[Path], lbls: List[Path]):
    """Score the dataset on missing label files and corrupt images.

    Args:
        imgs: Image paths to check.
        lbls: Label entry for each image, in the same order as *imgs*;
            an entry of ``None`` marks an image without a label file.

    Returns:
        A stage report dict with ``"name"``, ``"score"`` (0-100, the
        percentage of images that are neither unlabeled nor corrupt) and
        ``"details"`` listing the offending paths as strings.
    """
    miss_lbl = [img for img, lbl in zip(imgs, lbls) if lbl is None]

    corrupt: List[Path] = []
    with ProcessPoolExecutor(max_workers=CPU_COUNT) as ex:
        fut = {ex.submit(_is_corrupt, p): p for p in imgs}
        for f in tqdm(as_completed(fut), total=len(fut), desc="integrity", leave=False):
            if f.result():
                corrupt.append(fut[f])
    # as_completed() yields in completion order, which varies between runs;
    # sort so the report is reproducible.
    corrupt.sort()

    # max(..., 1) avoids dividing by zero on an empty dataset.
    score = 100 - (len(miss_lbl) + len(corrupt)) / max(len(imgs), 1) * 100
    return {
        "name": "Integrity",
        # An image can be both unlabeled and corrupt, so clamp at zero.
        "score": max(score, 0),
        "details": {
            "missing_label_files": [str(p) for p in miss_lbl],
            "corrupt_images": [str(p) for p in corrupt],
        },
    }
|
160 |
|
161 |
+
# Class balance -------------------------------------------------
|
162 |
|
163 |
+
def qc_class_balance(lbls: List[Path]):
|
164 |
cls_counts = Counter()
|
165 |
boxes_per_img = []
|
166 |
for l in lbls:
|
167 |
+
bs = parse_label_file(l) if l else []
|
168 |
boxes_per_img.append(len(bs))
|
169 |
cls_counts.update(b[0] for b in bs)
|
170 |
|
171 |
if not cls_counts:
|
172 |
return {"name": "Class balance", "score": 0, "details": "No labels"}
|
173 |
+
bal = (min(cls_counts.values()) / max(cls_counts.values())) * 100
|
174 |
return {
|
175 |
"name": "Class balance",
|
176 |
"score": bal,
|
|
|
184 |
},
|
185 |
}
|
186 |
|
187 |
+
# Image quality -------------------------------------------------
|
188 |
+
|
189 |
+
def _quality_stat(path: Path, blur_thr: float):
    """Classify one image as blurry, dark and/or bright.

    Args:
        path: Image file to inspect.
        blur_thr: Images whose Laplacian variance is below this are
            flagged as blurry.

    Returns:
        ``(path, is_blurry, is_dark, is_bright)``. All three flags are
        ``False`` when OpenCV is not installed or the file cannot be
        decoded, so unreadable files are not counted as low quality here.
    """
    im = cv2.imread(str(path)) if cv2 else None
    if im is None:
        return path, False, False, False
    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    lap = cv2.Laplacian(gray, cv2.CV_64F).var()
    br = gray.mean()
    # Cast to plain bools so the result does not carry NumPy scalar types.
    return path, bool(lap < blur_thr), bool(br < 25), bool(br > 230)
|
197 |
|
198 |
+
|
199 |
+
def qc_image_quality(imgs: List[Path], blur_thr: float = 100.0):
|
200 |
if cv2 is None:
|
201 |
return {"name": "Image quality", "score": 100, "details": "cv2 not installed"}
|
202 |
+
|
203 |
+
blurry: list[Path] = []
|
204 |
+
dark: list[Path] = []
|
205 |
+
bright: list[Path] = []
|
206 |
+
|
207 |
+
with ProcessPoolExecutor(max_workers=CPU_COUNT) as ex:
|
208 |
+
for p, is_blur, is_dark, is_bright in tqdm(
|
209 |
+
ex.map(lambda x: _quality_stat(x, blur_thr), imgs),
|
210 |
+
total=len(imgs),
|
211 |
+
desc="imgβquality",
|
212 |
+
leave=False,
|
213 |
+
):
|
214 |
+
if is_blur:
|
215 |
+
blurry.append(p)
|
216 |
+
if is_dark:
|
217 |
+
dark.append(p)
|
218 |
+
if is_bright:
|
219 |
+
bright.append(p)
|
220 |
|
221 |
bad = len(set(blurry + dark + bright))
|
222 |
score = 100 - bad / max(len(imgs), 1) * 100
|
|
|
230 |
},
|
231 |
}
|
232 |
|
233 |
+
# Duplicate images ---------------------------------------------
|
234 |
+
|
235 |
+
def qc_duplicates(imgs: List[Path]):
|
236 |
+
# Fast path β use fastdup if installed & enough images
|
237 |
+
if fastdup is not None and len(imgs) > 50:
|
238 |
+
try:
|
239 |
+
fd = fastdup.create(input_dir=str(Path(imgs[0]).parent.parent), work_dir=str(TMP_ROOT / "fastdup"))
|
240 |
+
fd.run()
|
241 |
+
clusters = fd.get_clusters()
|
242 |
+
dup = sum(len(c) - 1 for c in clusters)
|
243 |
+
score = 100 - dup / max(len(imgs), 1) * 100
|
244 |
+
return {
|
245 |
+
"name": "Duplicates",
|
246 |
+
"score": score,
|
247 |
+
"details": {"groups": clusters[:50]},
|
248 |
+
}
|
249 |
+
except Exception:
|
250 |
+
pass # fallback to hash
|
251 |
|
|
|
252 |
if imagehash is None:
|
253 |
+
return {"name": "Duplicates", "score": 100, "details": "skipped (deps)"}
|
254 |
+
|
255 |
+
def _hash(p):
|
256 |
+
return str(imagehash.average_hash(Image.open(p)))
|
257 |
|
258 |
hashes: Dict[str, List[Path]] = defaultdict(list)
|
259 |
+
with ProcessPoolExecutor(max_workers=CPU_COUNT) as ex:
|
260 |
+
for h, p in tqdm(
|
261 |
+
zip(ex.map(_hash, imgs), imgs),
|
262 |
+
total=len(imgs),
|
263 |
+
desc="hashing",
|
264 |
+
leave=False,
|
265 |
+
):
|
266 |
+
hashes[h].append(p)
|
267 |
|
268 |
groups = [g for g in hashes.values() if len(g) > 1]
|
269 |
dup = sum(len(g) - 1 for g in groups)
|
|
|
271 |
return {
|
272 |
"name": "Duplicates",
|
273 |
"score": score,
|
274 |
+
"details": {"groups": [[str(p) for p in g] for g in groups[:50]]},
|
275 |
}
|
276 |
|
277 |
+
# Modelβassisted QA --------------------------------------------
|
278 |
|
279 |
def _rel_iou(b1, b2):
|
280 |
x1, y1, w1, h1 = b1
|
|
|
282 |
xa1, ya1, xa2, ya2 = x1 - w1 / 2, y1 - h1 / 2, x1 + w1 / 2, y1 + h1 / 2
|
283 |
xb1, yb1, xb2, yb2 = x2 - w2 / 2, y2 - h2 / 2, x2 + w2 / 2, y2 + h2 / 2
|
284 |
ix1, iy1, ix2, iy2 = max(xa1, xb1), max(ya1, yb1), min(xa2, xb2), min(ya2, yb2)
|
285 |
+
inter = max(ix2 - ix1, 0) * max(iy2 - iy1, 0)
|
|
|
286 |
union = w1 * h1 + w2 * h2 - inter
|
287 |
+
return inter / union if union else 0.0
|
288 |
|
289 |
|
290 |
+
def qc_model_qa(imgs: List[Path], weights: str | None, lbls: List[Path], iou_thr: float = 0.5):
|
291 |
if weights is None or YOLO is None:
|
292 |
+
return {"name": "Model QA", "score": 100, "details": "skipped (no weights)"}
|
293 |
|
294 |
model = YOLO(weights)
|
295 |
ious, mism = [], []
|
296 |
+
|
297 |
+
for i in range(0, len(imgs), BATCH):
|
298 |
+
batch_paths = imgs[i : i + BATCH]
|
299 |
+
results = model.predict(batch_paths, verbose=False)
|
300 |
+
for p, res in zip(batch_paths, results):
|
301 |
+
gtb = parse_label_file(p.parent.parent / "labels" / f"{p.stem}.txt")
|
302 |
+
if not gtb:
|
303 |
+
continue
|
304 |
+
for cls, x, y, w, h in gtb:
|
305 |
+
best = 0.0
|
306 |
+
for b, c in zip(res.boxes.xywh.cpu().numpy(), res.boxes.cls.cpu().numpy()):
|
307 |
+
if int(c) != cls:
|
308 |
+
continue
|
309 |
+
best = max(best, _rel_iou((x, y, w, h), tuple(b)))
|
310 |
+
ious.append(best)
|
311 |
+
if best < iou_thr:
|
312 |
+
mism.append(str(p))
|
313 |
|
314 |
miou = float(np.mean(ious)) if ious else 1.0
|
315 |
return {
|
316 |
"name": "Model QA",
|
317 |
"score": miou * 100,
|
318 |
+
"details": {"mean_iou": miou, "mismatched_images": mism[:50]},
|
319 |
}
|
320 |
|
321 |
+
# Aggregate -----------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
322 |
|
323 |
def aggregate(scores):
    """Combine per-stage reports into one weighted score.

    Args:
        scores: Iterable of stage report dicts, each with ``"name"`` and
            ``"score"`` keys.

    Returns:
        The sum of each stage's score multiplied by its ``DEFAULT_W``
        weight. Stages whose name has no entry in ``DEFAULT_W`` contribute
        nothing.
    """
    return sum(DEFAULT_W.get(r["name"], 0) * r["score"] for r in scores)
|
325 |
|
326 |
+
# ───────────────────────────────────────── Roboflow helpers ────
|
|
|
|
|
|
|
327 |
RF_RE = re.compile(r"https://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")
|
328 |
|
329 |
def download_rf_dataset(url: str, rf_api: "Roboflow", dest: Path) -> Path:
|
|
|
340 |
project.version(int(ver)).download("yolov8", location=str(ds_dir))
|
341 |
return ds_dir
|
342 |
|
343 |
+
# ───────────────────────────────────────── Main logic ──────────
|
344 |
|
|
|
|
|
|
|
345 |
def run_quality(root: Path, yaml_override: Path | None, weights: Path | None):
|
346 |
imgs, lbls, meta = gather_dataset(root, yaml_override)
|
347 |
res = [
|
|
|
352 |
qc_model_qa(imgs, str(weights) if weights else None, lbls),
|
353 |
]
|
354 |
final = aggregate(res)
|
355 |
+
|
356 |
+
md = [f"## **{meta.get('name', root.name)}**Β βΒ ScoreΒ {final:.1f}/100"]
|
357 |
for r in res:
|
358 |
md.append(f"### {r['name']}Β Β {r['score']:.1f}")
|
359 |
md.append("<details><summary>details</summary>\n\n```json")
|
|
|
366 |
df.index.name = "class"
|
367 |
return md_str, df
|
368 |
|
369 |
+
# ───────────────────────────────────────── Gradio UI ───────────
|
370 |
|
|
|
|
|
|
|
371 |
def evaluate(
|
372 |
api_key: str,
|
373 |
url_txt: gr.File | None,
|
|
|
381 |
|
382 |
reports, dfs = [], []
|
383 |
|
384 |
+
# Roboflow batch ------------------------------------------
|
385 |
if url_txt:
|
386 |
if Roboflow is None:
|
387 |
return "`roboflow` not installed", pd.DataFrame()
|
|
|
389 |
return "Enter Roboflow API key", pd.DataFrame()
|
390 |
|
391 |
rf = Roboflow(api_key=api_key.strip())
|
392 |
+
for line in Path(url_txt.name).read_text().splitlines():
|
|
|
393 |
if not line.strip():
|
394 |
continue
|
395 |
try:
|
|
|
400 |
except Exception as e:
|
401 |
reports.append(f"### {line}\n\nβ οΈΒ {e}")
|
402 |
|
403 |
+
# Manual ZIP ----------------------------------------------
|
404 |
if zip_file:
|
405 |
tmp_dir = Path(tempfile.mkdtemp())
|
406 |
shutil.unpack_archive(zip_file.name, tmp_dir)
|
|
|
409 |
dfs.append(df)
|
410 |
shutil.rmtree(tmp_dir, ignore_errors=True)
|
411 |
|
412 |
+
# Manual path ---------------------------------------------
|
413 |
if server_path:
|
414 |
md, df = run_quality(Path(server_path), Path(yaml_file.name) if yaml_file else None, Path(weights.name) if weights else None)
|
415 |
reports.append(md)
|
|
|
419 |
combined_df = pd.concat(dfs).groupby(level=0).sum() if dfs else pd.DataFrame()
|
420 |
return summary_md, combined_df
|
421 |
|
422 |
+
# ───────────────────────────────────────── Launch ────────────
|
423 |
with gr.Blocks(title="YOLO Dataset Quality Evaluator") as demo:
|
424 |
gr.Markdown(
|
425 |
"""
|
|
|
429 |
1. Paste your **Roboflow API key**
|
430 |
2. Upload a **.txt** file β one `https://universe.roboflow.com/.../dataset/x` per line
|
431 |
|
432 |
+
### Manual
|
433 |
* Upload a dataset **ZIP** or type a dataset **path** on the server
|
434 |
* Optionally supply a custom **data.yaml** and/or a **YOLOΒ .pt** weights file for modelβassisted QA
|
435 |
"""
|