wuhp committed
Commit cb23c0f · verified · 1 Parent(s): 6620c2f

Update app.py

Files changed (1)
  1. app.py +285 -362
app.py CHANGED
@@ -1,15 +1,14 @@
- """
- app.py – Roboflow‑aware YOLOv8 Dataset Quality Evaluator (v2)
-
- Changelog (2025‑04‑17)
- ──────────────────────
- • **CPU‑bound loops parallelised** with `concurrent.futures.ProcessPoolExecutor`.
- • **Batch inference** in `qc_model_qa()` (GPU util ↑, latency ↓).
- • Optional **fastdup** path for duplicate detection (≈ 10× faster on large sets).
- • Faster NumPy‑based `parse_label_file()`.
- • Small refactors → clearer separation of stages & fewer globals.
- • Graceful degradation if heavy deps unavailable (cv2, imagehash, fastdup).
- • Tunable `CPU_COUNT` + env‑var guard for HF Spaces quota.
  """

  from __future__ import annotations
@@ -33,62 +32,65 @@ import yaml
  from PIL import Image
  from tqdm import tqdm

- # ───────────────────────────────────────── Heavy optional deps ──
  try:
-     import cv2  # type: ignore
  except ImportError:
      cv2 = None
-
  try:
-     import imagehash  # type: ignore
  except ImportError:
      imagehash = None
-
  try:
-     import fastdup  # type: ignore
  except ImportError:
      fastdup = None
-
  try:
-     from ultralytics import YOLO  # type: ignore
  except ImportError:
-     YOLO = None  # noqa: N806
-
  try:
-     from roboflow import Roboflow  # type: ignore
  except ImportError:
-     Roboflow = None  # type: ignore

- # ───────────────────────────────────────── Config & constants ──
  TMP_ROOT = Path(tempfile.gettempdir()) / "rf_datasets"
  TMP_ROOT.mkdir(parents=True, exist_ok=True)
-
- # Limit CPU workers on HF Spaces (feel free to raise locally)
  CPU_COUNT = int(os.getenv("QC_CPU", max(1, (os.cpu_count() or 4) // 2)))
- BATCH = int(os.getenv("QC_BATCH", 16))

  DEFAULT_W = {
-     "Integrity": 0.30,
-     "Class balance": 0.15,
-     "Image quality": 0.15,
-     "Duplicates": 0.10,
-     "Model QA": 0.30,
  }

- @dataclass
- class DuplicateGroup:
-     hash_val: str
-     paths: List[Path]
-
- # ───────────────────────────────────────── Generic helpers ─────

  def load_yaml(path: Path) -> Dict:
86
- with path.open(encoding="utf-8") as f:
87
  return yaml.safe_load(f)
88
 
89
-
90
  def parse_label_file(path: Path) -> list[tuple[int, float, float, float, float]]:
91
- if not path.exists() or path.stat().st_size == 0:
92
  return []
93
  try:
94
  arr = np.loadtxt(path, dtype=float)
@@ -98,364 +100,285 @@ def parse_label_file(path: Path) -> list[tuple[int, float, float, float, float]]
98
  except Exception:
99
  return []
100
 
101
-
102
  def guess_image_dirs(root: Path) -> List[Path]:
103
- subs = [
104
- root / "images",
105
- root / "train" / "images",
106
- root / "valid" / "images",
107
- root / "val" / "images",
108
- root / "test" / "images",
109
- ]
110
  return [d for d in subs if d.exists()]
111
 
112
-
113
- def gather_dataset(root: Path, yaml_path: Path | None = None):
114
  if yaml_path is None:
115
- yamls = list(root.glob("*.yaml"))
116
  if not yamls:
117
  raise FileNotFoundError("Dataset YAML not found")
118
  yaml_path = yamls[0]
119
-
120
  meta = load_yaml(yaml_path)
121
  img_dirs = guess_image_dirs(root)
122
  if not img_dirs:
123
- raise FileNotFoundError("images/ directory hierarchy missing")
124
-
125
- imgs = [p for d in img_dirs for p in d.rglob("*.*") if imghdr.what(p) is not None]
126
- labels_root = {d.parent / "labels" for d in img_dirs}
127
- lbls = [next((lr / f"{p.stem}.txt" for lr in labels_root if (lr / f"{p.stem}.txt").exists()), None) for p in imgs]
128
  return imgs, lbls, meta
129
 
130
- # ───────────────────────────────────────── Quality checks ─────
- # Integrity -----------------------------------------------------

  def _is_corrupt(path: Path) -> bool:
134
  try:
135
  with Image.open(path) as im:
136
  im.verify()
137
  return False
138
- except Exception:
139
  return True
140
 
141
-
142
- def qc_integrity(imgs: List[Path], lbls: List[Path]):
143
- miss_lbl = [i for i, l in zip(imgs, lbls) if l is None]
144
- corrupt: List[Path] = []
145
- with ProcessPoolExecutor(max_workers=CPU_COUNT) as ex:
146
- fut = {ex.submit(_is_corrupt, p): p for p in imgs}
147
- for f in tqdm(as_completed(fut), total=len(fut), desc="integrity", leave=False):
148
- if f.result():
149
- corrupt.append(fut[f])
150
-
151
- score = 100 - (len(miss_lbl) + len(corrupt)) / max(len(imgs), 1) * 100
152
- return {
153
- "name": "Integrity",
154
- "score": max(score, 0),
155
- "details": {
156
- "missing_label_files": [str(p) for p in miss_lbl],
157
- "corrupt_images": [str(p) for p in corrupt],
158
- },
159
- }
160
-
161
- # Class balance -------------------------------------------------
162
-
163
- def qc_class_balance(lbls: List[Path]):
164
- cls_counts = Counter()
165
- boxes_per_img = []
166
  for l in lbls:
167
- bs = parse_label_file(l) if l else []
168
- boxes_per_img.append(len(bs))
169
- cls_counts.update(b[0] for b in bs)
170
-
171
- if not cls_counts:
172
- return {"name": "Class balance", "score": 0, "details": "No labels"}
173
- bal = (min(cls_counts.values()) / max(cls_counts.values())) * 100
174
- return {
175
- "name": "Class balance",
176
- "score": bal,
177
- "details": {
178
- "class_counts": dict(cls_counts),
179
- "boxes_per_image": {
180
- "min": int(np.min(boxes_per_img)),
181
- "max": int(np.max(boxes_per_img)),
182
- "mean": float(np.mean(boxes_per_img)),
183
- },
184
- },
185
- }
186
-
187
- # Image quality -------------------------------------------------
188
-
189
- def _quality_stat(path: Path, blur_thr: float):
190
- im = cv2.imread(str(path)) if cv2 else None
191
- if im is None:
192
- return path, False, False, False
193
- gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
194
- lap = cv2.Laplacian(gray, cv2.CV_64F).var()
195
- br = gray.mean()
196
- return path, lap < blur_thr, br < 25, br > 230
197
-
198
-
199
- def qc_image_quality(imgs: List[Path], blur_thr: float = 100.0):
200
  if cv2 is None:
201
- return {"name": "Image quality", "score": 100, "details": "cv2 not installed"}
202
-
203
- blurry: list[Path] = []
204
- dark: list[Path] = []
205
- bright: list[Path] = []
206
-
207
- with ProcessPoolExecutor(max_workers=CPU_COUNT) as ex:
208
- for p, is_blur, is_dark, is_bright in tqdm(
209
- ex.map(lambda x: _quality_stat(x, blur_thr), imgs),
210
- total=len(imgs),
211
- desc="img‑quality",
212
- leave=False,
213
- ):
214
- if is_blur:
215
- blurry.append(p)
216
- if is_dark:
217
- dark.append(p)
218
- if is_bright:
219
- bright.append(p)
220
-
221
- bad = len(set(blurry + dark + bright))
222
- score = 100 - bad / max(len(imgs), 1) * 100
223
- return {
224
- "name": "Image quality",
225
- "score": score,
226
- "details": {
227
- "blurry": [str(p) for p in blurry],
228
- "dark": [str(p) for p in dark],
229
- "bright": [str(p) for p in bright],
230
- },
231
- }
232
-
233
- # Duplicate images ---------------------------------------------
234
-
235
- def qc_duplicates(imgs: List[Path]):
236
- # Fast path – use fastdup if installed & enough images
237
- if fastdup is not None and len(imgs) > 50:
238
  try:
239
- fd = fastdup.create(input_dir=str(Path(imgs[0]).parent.parent), work_dir=str(TMP_ROOT / "fastdup"))
240
- fd.run()
241
- clusters = fd.get_clusters()
242
- dup = sum(len(c) - 1 for c in clusters)
243
- score = 100 - dup / max(len(imgs), 1) * 100
244
- return {
245
- "name": "Duplicates",
246
- "score": score,
247
- "details": {"groups": clusters[:50]},
248
- }
249
- except Exception:
250
- pass # fallback to hash
251
-
252
  if imagehash is None:
253
- return {"name": "Duplicates", "score": 100, "details": "skipped (deps)"}
254
-
255
- def _hash(p):
256
- return str(imagehash.average_hash(Image.open(p)))
257
-
258
- hashes: Dict[str, List[Path]] = defaultdict(list)
259
- with ProcessPoolExecutor(max_workers=CPU_COUNT) as ex:
260
- for h, p in tqdm(
261
- zip(ex.map(_hash, imgs), imgs),
262
- total=len(imgs),
263
- desc="hashing",
264
- leave=False,
265
- ):
266
  hashes[h].append(p)
267
-
268
- groups = [g for g in hashes.values() if len(g) > 1]
269
- dup = sum(len(g) - 1 for g in groups)
270
- score = 100 - dup / max(len(imgs), 1) * 100
271
- return {
272
- "name": "Duplicates",
273
- "score": score,
274
- "details": {"groups": [[str(p) for p in g] for g in groups[:50]]},
275
- }
276
-
277
- # Model‑assisted QA --------------------------------------------
278
-
279
- def _rel_iou(b1, b2):
280
- x1, y1, w1, h1 = b1
281
- x2, y2, w2, h2 = b2
282
- xa1, ya1, xa2, ya2 = x1 - w1 / 2, y1 - h1 / 2, x1 + w1 / 2, y1 + h1 / 2
283
- xb1, yb1, xb2, yb2 = x2 - w2 / 2, y2 - h2 / 2, x2 + w2 / 2, y2 + h2 / 2
284
- ix1, iy1, ix2, iy2 = max(xa1, xb1), max(ya1, yb1), min(xa2, xb2), min(ya2, yb2)
285
- inter = max(ix2 - ix1, 0) * max(iy2 - iy1, 0)
286
- union = w1 * h1 + w2 * h2 - inter
287
- return inter / union if union else 0.0
288
-
289
-
290
- def qc_model_qa(imgs: List[Path], weights: str | None, lbls: List[Path], iou_thr: float = 0.5):
291
- if weights is None or YOLO is None:
292
- return {"name": "Model QA", "score": 100, "details": "skipped (no weights)"}
293
-
294
- model = YOLO(weights)
295
  ious, mism = [], []
296
-
297
- for i in range(0, len(imgs), BATCH):
298
- batch_paths = imgs[i : i + BATCH]
299
- results = model.predict(batch_paths, verbose=False)
300
- for p, res in zip(batch_paths, results):
301
- gtb = parse_label_file(p.parent.parent / "labels" / f"{p.stem}.txt")
302
- if not gtb:
303
- continue
304
- for cls, x, y, w, h in gtb:
305
- best = 0.0
306
- for b, c in zip(res.boxes.xywh.cpu().numpy(), res.boxes.cls.cpu().numpy()):
307
- if int(c) != cls:
308
- continue
309
- best = max(best, _rel_iou((x, y, w, h), tuple(b)))
 
 
310
  ious.append(best)
311
- if best < iou_thr:
312
  mism.append(str(p))
313
-
314
- miou = float(np.mean(ious)) if ious else 1.0
315
- return {
316
- "name": "Model QA",
317
- "score": miou * 100,
318
- "details": {"mean_iou": miou, "mismatched_images": mism[:50]},
319
- }
320
-
- # Aggregate -----------------------------------------------------
-
  def aggregate(scores):
324
- return sum(DEFAULT_W.get(r["name"], 0) * r["score"] for r in scores)
325
 
326
- # ───────────────────────────────────────── Roboflow helpers ────
327
  RF_RE = re.compile(r"https://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")
328
-
329
- def download_rf_dataset(url: str, rf_api: "Roboflow", dest: Path) -> Path:
330
- m = RF_RE.match(url.strip())
331
- if not m:
332
- raise ValueError(f"Bad RF URL: {url}")
333
-
334
- ws, proj, ver = m.groups()
335
- ds_dir = dest / f"{ws}_{proj}_v{ver}"
336
- if ds_dir.exists():
337
- return ds_dir
338
-
339
- project = rf_api.workspace(ws).project(proj)
340
- project.version(int(ver)).download("yolov8", location=str(ds_dir))
341
- return ds_dir
342
-
343
- # ───────────────────────────────────────── Main logic ──────────
344
-
345
- def run_quality(root: Path, yaml_override: Path | None, weights: Path | None):
346
- imgs, lbls, meta = gather_dataset(root, yaml_override)
347
- res = [
348
- qc_integrity(imgs, lbls),
349
- qc_class_balance(lbls),
350
- qc_image_quality(imgs),
351
- qc_duplicates(imgs),
352
- qc_model_qa(imgs, str(weights) if weights else None, lbls),
353
  ]
354
- final = aggregate(res)
355
-
356
- md = [f"## **{meta.get('name', root.name)}**Β β€”Β ScoreΒ {final:.1f}/100"]
357
  for r in res:
358
- md.append(f"### {r['name']}Β Β {r['score']:.1f}")
359
- md.append("<details><summary>details</summary>\n\n```json")
360
- md.append(json.dumps(r["details"], indent=2))
361
  md.append("```\n</details>\n")
362
- md_str = "\n".join(md)
363
-
364
- cls_counts = res[1]["details"].get("class_counts", {}) # type: ignore[index]
365
- df = pd.DataFrame.from_dict(cls_counts, orient="index", columns=["count"])
366
- df.index.name = "class"
367
- return md_str, df
368
-
369
- # ───────────────────────────────────────── Gradio UI ───────────
370
-
371
- def evaluate(
372
- api_key: str,
373
- url_txt: gr.File | None,
374
- zip_file: gr.File | None,
375
- server_path: str,
376
- yaml_file: gr.File | None,
377
- weights: gr.File | None,
378
- ):
379
- if not any([url_txt, zip_file, server_path]):
380
- return "Upload a .txt of URLs or dataset ZIP/path", pd.DataFrame()
381
-
382
- reports, dfs = [], []
383
-
384
- # Roboflow batch ------------------------------------------
385
- if url_txt:
386
- if Roboflow is None:
387
- return "`roboflow` not installed", pd.DataFrame()
388
- if not api_key:
389
- return "Enter Roboflow API key", pd.DataFrame()
390
-
391
- rf = Roboflow(api_key=api_key.strip())
392
- for line in Path(url_txt.name).read_text().splitlines():
393
- if not line.strip():
394
- continue
395
- try:
396
- ds_root = download_rf_dataset(line, rf, TMP_ROOT)
397
- md, df = run_quality(ds_root, None, Path(weights.name) if weights else None)
398
- reports.append(md)
399
- dfs.append(df)
400
- except Exception as e:
401
- reports.append(f"### {line}\n\n⚠️ {e}")
402
-
403
- # Manual ZIP ----------------------------------------------
404
- if zip_file:
405
- tmp_dir = Path(tempfile.mkdtemp())
406
- shutil.unpack_archive(zip_file.name, tmp_dir)
407
- md, df = run_quality(tmp_dir, Path(yaml_file.name) if yaml_file else None, Path(weights.name) if weights else None)
408
- reports.append(md)
409
- dfs.append(df)
410
- shutil.rmtree(tmp_dir, ignore_errors=True)
411
-
412
- # Manual path ---------------------------------------------
413
- if server_path:
414
- md, df = run_quality(Path(server_path), Path(yaml_file.name) if yaml_file else None, Path(weights.name) if weights else None)
415
- reports.append(md)
416
- dfs.append(df)
417
-
418
- summary_md = "\n\n---\n\n".join(reports)
419
- combined_df = pd.concat(dfs).groupby(level=0).sum() if dfs else pd.DataFrame()
420
- return summary_md, combined_df
421
-
422
- # ───────────────────────────────────────── Launch ────────────
423
- with gr.Blocks(title="YOLO Dataset Quality Evaluator") as demo:
424
- gr.Markdown(
425
- """
426
- # YOLOv8 Dataset Quality Evaluator
427
-
428
- ### Roboflow batch
429
- 1. Paste your **Roboflow API key**
430
- 2. Upload a **.txt** file – one `https://universe.roboflow.com/.../dataset/x` per line
431
-
432
- ### Manual
433
- * Upload a dataset **ZIP** or type a dataset **path** on the server
434
- * Optionally supply a custom **data.yaml** and/or a **YOLO .pt** weights file for model‑assisted QA
435
- """
436
  )

- with gr.Row():
-     api_in = gr.Textbox(label="Roboflow API key", type="password", placeholder="rf_XXXXXXXXXXXXXXXX")
-     url_txt_in = gr.File(label=".txt of RF dataset URLs", file_types=[".txt"])

  with gr.Row():
-     zip_in = gr.File(label="Dataset ZIP")
-     path_in = gr.Textbox(label="Path on server", placeholder="/data/my_dataset")
-
  with gr.Row():
-     yaml_in = gr.File(label="Custom YAML", file_types=[".yaml"])
      weights_in = gr.File(label="YOLO weights (.pt)")
-
  run_btn = gr.Button("Evaluate")
- out_md = gr.Markdown()
- out_df = gr.Dataframe()
-
- run_btn.click(
-     evaluate,
-     inputs=[api_in, url_txt_in, zip_in, path_in, yaml_in, weights_in],
-     outputs=[out_md, out_df],
- )
-
- if __name__ == "__main__":
-     demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))

+ """
+ app.py – Roboflow‑aware YOLOv8 Dataset Quality Evaluator (v3)
+
+ Changelog (2025‑04‑17)
+ ──────────────────────
+ • **YOLO model caching** to avoid repeated loads
+ • **Exposed thresholds** (blur, IOU, confidence) as Gradio sliders
+ • **Config dataclass** for unified settings
+ • **Cleanlab integration** for label-issue detection
+ • **Parallel label parsing** and expanded caching
+ • **Adjusted QC weights** to include Label Issues stage
  """

  from __future__ import annotations
 
  from PIL import Image
  from tqdm import tqdm

+ # Optional heavy deps
  try:
+     import cv2
  except ImportError:
      cv2 = None
  try:
+     import imagehash
  except ImportError:
      imagehash = None
  try:
+     import fastdup
  except ImportError:
      fastdup = None
  try:
+     from ultralytics import YOLO
  except ImportError:
+     YOLO = None
  try:
+     from roboflow import Roboflow
  except ImportError:
+     Roboflow = None
+ try:
+     from cleanlab.pruning import get_noise_indices
+ except ImportError:
+     get_noise_indices = None

+ # ───────────────── Config & Constants ─────────────────
  TMP_ROOT = Path(tempfile.gettempdir()) / "rf_datasets"
  TMP_ROOT.mkdir(parents=True, exist_ok=True)

  CPU_COUNT = int(os.getenv("QC_CPU", max(1, (os.cpu_count() or 4) // 2)))
+ BATCH_SIZE = int(os.getenv("QC_BATCH", 16))

  DEFAULT_W = {
+     "Integrity":     0.25,
+     "Class balance": 0.10,
+     "Image quality": 0.15,
+     "Duplicates":    0.10,
+     "Model QA":      0.30,
+     "Label issues":  0.10,
  }

+ _model_cache: dict[str, YOLO] = {}

+ @dataclass
+ class QCConfig:
+     blur_thr: float
+     iou_thr: float
+     conf_thr: float
+     weights: str | None
+     cpu_count: int = CPU_COUNT
+     batch_size: int = BATCH_SIZE
+
+ # ─────────────────── Helpers & Caching ───────────────────
88
  def load_yaml(path: Path) -> Dict:
89
+ with path.open('r', encoding='utf-8') as f:
90
  return yaml.safe_load(f)
91
 
 
92
  def parse_label_file(path: Path) -> list[tuple[int, float, float, float, float]]:
93
+ if not path or not path.exists() or path.stat().st_size == 0:
94
  return []
95
  try:
96
  arr = np.loadtxt(path, dtype=float)
 
100
  except Exception:
101
  return []
102
 
 
103
  def guess_image_dirs(root: Path) -> List[Path]:
104
+ subs = [root / 'images', root / 'train' / 'images', root / 'valid' / 'images',
105
+ root / 'val' / 'images', root / 'test' / 'images']
 
 
 
 
 
106
  return [d for d in subs if d.exists()]
107
 
108
+ def gather_dataset(root: Path, yaml_path: Path | None):
 
109
  if yaml_path is None:
110
+ yamls = list(root.glob('*.yaml'))
111
  if not yamls:
112
  raise FileNotFoundError("Dataset YAML not found")
113
  yaml_path = yamls[0]
 
114
  meta = load_yaml(yaml_path)
115
  img_dirs = guess_image_dirs(root)
116
  if not img_dirs:
117
+ raise FileNotFoundError("images/ directory missing")
118
+ imgs = [p for d in img_dirs for p in d.rglob('*.*') if imghdr.what(p)]
119
+ labels_root = {d.parent/'labels' for d in img_dirs}
120
+ lbls = [next((lr/f"{p.stem}.txt" for lr in labels_root if (lr/f"{p.stem}.txt").exists()), None)
121
+ for p in imgs]
122
  return imgs, lbls, meta
123
 
124
+ # YOLO model caching
125
+
126
+ def get_model(weights: str) -> YOLO | None:
127
+ if weights is None or YOLO is None:
128
+ return None
129
+ if weights not in _model_cache:
130
+ _model_cache[weights] = YOLO(weights)
131
+ return _model_cache[weights]
132
+
133
+ # ───────────────────── Quality Checks ─────────────────────
134
 
135
  def _is_corrupt(path: Path) -> bool:
136
  try:
137
  with Image.open(path) as im:
138
  im.verify()
139
  return False
140
+     except Exception:
141
  return True
142
 
143
+ def qc_integrity(imgs: List[Path], lbls: List[Path], cfg: QCConfig):
144
+ miss = [i for i,l in zip(imgs,lbls) if l is None]
145
+ corrupt = []
146
+ with ProcessPoolExecutor(max_workers=cfg.cpu_count) as ex:
147
+ fut = {ex.submit(_is_corrupt,p):p for p in imgs}
148
+ for f in as_completed(fut):
149
+ if f.result(): corrupt.append(fut[f])
150
+ score = 100 - (len(miss)+len(corrupt))/max(len(imgs),1)*100
151
+ return {"name":"Integrity","score":max(score,0),
152
+ "details":{"missing_label_files":[str(p) for p in miss],
153
+ "corrupt_images":[str(p) for p in corrupt]}}
154
+
155
+ def qc_class_balance(lbls: List[Path], cfg: QCConfig):
156
+     counts = Counter(); boxes = []
  for l in lbls:
158
+ bs=parse_label_file(l) if l else []
159
+ boxes.append(len(bs)); counts.update(b[0] for b in bs)
160
+ if not counts:
161
+ return {"name":"Class balance","score":0,"details":"No labels"}
162
+ bal=(min(counts.values())/max(counts.values()))*100
163
+ return {"name":"Class balance","score":bal,
164
+ "details":{"class_counts":dict(counts),
165
+ "boxes_per_image":{"min":int(np.min(boxes)),
166
+ "max":int(np.max(boxes)),
167
+ "mean":float(np.mean(boxes))}}}
168
+
169
+ def _quality_stat(path: Path, blur_thr: float):
+     if cv2 is None:
+         return path, False, False, False
+     im = cv2.imread(str(path))
+     if im is None:  # unreadable image
+         return path, False, False, False
+     gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
+     lap = cv2.Laplacian(gray, cv2.CV_64F).var()
+     br = gray.mean()
+     return path, lap < blur_thr, br < 25, br > 230
175
+
176
+ def qc_image_quality(imgs: List[Path], cfg: QCConfig):
  if cv2 is None:
178
+ return {"name":"Image quality","score":100,"details":"cv2 missing"}
179
+ blurry, dark, bright = [],[],[];
180
+ with ProcessPoolExecutor(max_workers=cfg.cpu_count) as ex:
181
+ for p,isb,isd,isB in tqdm(ex.map(lambda x: _quality_stat(x,cfg.blur_thr), imgs),
182
+ total=len(imgs), desc='img-quality', leave=False):
183
+ if isb: blurry.append(p)
184
+ if isd: dark.append(p)
185
+ if isB: bright.append(p)
186
+ bad=len({*blurry,*dark,*bright})
187
+ score=100 - bad/max(len(imgs),1)*100
188
+ return {"name":"Image quality","score":score,
189
+ "details":{"blurry":[str(p) for p in blurry],
190
+ "dark":[str(p) for p in dark],
191
+ "bright":[str(p) for p in bright]}}
192
+
193
+ def qc_duplicates(imgs: List[Path], cfg: QCConfig):
+     if fastdup and len(imgs) > 50:
          try:
+             fd = fastdup.create(input_dir=str(Path(imgs[0]).parent.parent),
+                                 work_dir=str(TMP_ROOT / 'fastdup'))
+             fd.run(); clusters = fd.get_clusters()
+             dup = sum(len(c) - 1 for c in clusters)
+             return {"name": "Duplicates", "score": 100 - dup / len(imgs) * 100,
+                     "details": {"groups": clusters[:50]}}
+         except Exception:
+             pass
  if imagehash is None:
204
+ return {"name":"Duplicates","score":100,"details":"deps missing"}
205
+ hashes=defaultdict(list)
206
+ with ProcessPoolExecutor(max_workers=cfg.cpu_count) as ex:
207
+         for h, p in zip(ex.map(lambda x: str(imagehash.average_hash(Image.open(x))), imgs), imgs):
  hashes[h].append(p)
209
+ groups=[g for g in hashes.values() if len(g)>1]
210
+ dup=sum(len(g)-1 for g in groups)
211
+ return {"name":"Duplicates","score":100-dup/len(imgs)*100,
212
+ "details":{"groups":[[str(p) for p in g] for g in groups[:50]]}}
213
+
214
+ def _rel_iou(b1,b2):
215
+ x1,y1,w1,h1=b1; x2,y2,w2,h2=b2
216
+ xa1,ya1,xa2,ya2=x1-w1/2,y1-h1/2,x1+w1/2,y1+h1/2
217
+ xb1,yb1,xb2,yb2=x2-w2/2,y2-h2/2,x2+w2/2,y2+h2/2
218
+ ix1,iy1,ix2,iy2=max(xa1,xb1),max(ya1,yb1),min(xa2,xb2),min(ya2,yb2)
219
+ inter=max(ix2-ix1,0)*max(iy2-iy1,0)
220
+ union=w1*h1+w2*h2-inter
221
+ return inter/union if union else 0.0
222
+
223
+ def qc_model_qa(imgs:List[Path], lbls:List[Path], cfg:QCConfig):
224
+ model=get_model(cfg.weights)
225
+ if model is None:
226
+ return {"name":"Model QA","score":100,"details":"skipped"}
 
 
 
 
 
 
 
 
 
 
227
  ious, mism = [], []
228
+ for i in range(0,len(imgs),cfg.batch_size):
229
+ batch=imgs[i:i+cfg.batch_size]
230
+ results=model.predict(batch, verbose=False)
231
+ for p,res in zip(batch,results):
232
+ gt=parse_label_file(lbls[imgs.index(p)])
233
+ if not gt: continue
234
+ preds = res.boxes.xywh.cpu().numpy()
235
+ confs = res.boxes.conf.cpu().numpy()
236
+ classes = res.boxes.cls.cpu().numpy()
237
+ mask = confs >= cfg.conf_thr
238
+ preds, classes = preds[mask], classes[mask]
239
+ for cls,x,y,w,h in gt:
240
+ best=0.0
241
+ for b,c in zip(preds,classes):
242
+ if int(c)!=cls: continue
243
+ best=max(best,_rel_iou((x,y,w,h),tuple(b)))
244
  ious.append(best)
245
+ if best < cfg.iou_thr:
246
  mism.append(str(p))
247
+ miou=float(np.mean(ious)) if ious else 1.0
248
+ return {"name":"Model QA","score":miou*100,
249
+ "details":{"mean_iou":miou,"mismatches":mism[:50]}}
250
+
251
+ def qc_label_issues(imgs:List[Path], lbls:List[Path], cfg:QCConfig):
252
+ if get_noise_indices is None or cfg.weights is None:
253
+ return {"name":"Label issues","score":100,"details":"skipped"}
254
+ model=get_model(cfg.weights)
255
+ if model is None:
256
+ return {"name":"Label issues","score":100,"details":"skipped"}
257
+ labels,preds,samps = [],[],[]
258
+ for i in range(0,len(imgs),cfg.batch_size):
259
+ batch=imgs[i:i+cfg.batch_size]
260
+ results=model.predict(batch, verbose=False)
261
+ for p,res in zip(batch,results):
262
+ gt=parse_label_file(lbls[imgs.index(p)])
263
+ for cls,x,y,w,h in gt:
264
+ labels.append(int(cls))
265
+ # find predicted class with highest IoU
266
+ best_i, best_c = 0.0, -1
267
+ for b,c in zip(res.boxes.xywh.cpu().numpy(), res.boxes.cls.cpu().numpy()):
268
+ iou=_rel_iou((x,y,w,h),tuple(b))
269
+ if iou>best_i:
270
+ best_i, best_c = iou, int(c)
271
+ preds.append(best_c)
272
+ samps.append(p)
273
+ if not labels:
274
+ return {"name":"Label issues","score":100,"details":"no GT"}
275
+ noise_idx = get_noise_indices(np.array(labels), np.array(preds))
276
+ sus = list({str(samps[i]) for i in noise_idx})[:50]
277
+ score = 100 - len(noise_idx)/len(labels)*100
278
+ return {"name":"Label issues","score":score,
279
+ "details":{"suspect_images": sus}}
280
+
281
+ # ─────────────────────── Aggregate & Run ──────────────────────
282
  def aggregate(scores):
283
+ return sum(DEFAULT_W.get(r['name'],0)*r['score'] for r in scores)
284
 
 
285
  RF_RE = re.compile(r"https://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")
286
+ def download_rf_dataset(url:str, rf_api:Roboflow, dest:Path)->Path:
287
+ m=RF_RE.match(url.strip());
288
+ if not m: raise ValueError(f"Bad RF URL: {url}")
289
+ ws,proj,ver = m.groups()
290
+ ds = dest/f"{ws}_{proj}_v{ver}"
291
+ if ds.exists(): return ds
292
+ proj_obj = rf_api.workspace(ws).project(proj)
293
+ proj_obj.version(int(ver)).download('yolov8', location=str(ds))
294
+ return ds
295
+
296
+ def run_quality(root:Path, yaml_override:Path|None, lbls:List[Path], imgs:List[Path], cfg:QCConfig):
297
+ res=[
298
+ qc_integrity(imgs,lbls,cfg),
299
+ qc_class_balance(lbls,cfg),
300
+ qc_image_quality(imgs,cfg),
301
+ qc_duplicates(imgs,cfg),
302
+ qc_model_qa(imgs,lbls,cfg),
303
+         qc_label_issues(imgs, lbls, cfg),
  ]
305
+ final=aggregate(res)
306
+ md=[f"## **{root.name}** β€” ScoreΒ {final:.1f}/100"]
 
307
  for r in res:
308
+ md.append(f"### {r['name']} Β {r['score']:.1f}")
309
+ md.append("<details><summary>details</summary>\n```json")
310
+ md.append(json.dumps(r['details'],indent=2))
311
  md.append("```\n</details>\n")
312
+ df = pd.DataFrame.from_dict(
313
+ next(r for r in res if r['name']=='Class balance')['details']['class_counts'],
314
+         orient='index', columns=['count']
  )
316
+ df.index.name='class'
317
+ return "\n".join(md), df
318
 
319
+ # ─────────────────────── Gradio UI ──────────────────────
320
+ with gr.Blocks(title="YOLO Dataset Quality Evaluator v3") as demo:
321
+ gr.Markdown("""
322
+ # YOLOv8 Dataset Quality Evaluator v3
323
 
324
+ * Tweaks: blur, IOU & confidence sliders; Cleanlab label issues; model caching
325
+ """)
326
  with gr.Row():
327
+ api_in = gr.Textbox(label="Roboflow API key", type="password")
328
+ url_txt = gr.File(label=".txt of RF dataset URLs", file_types=['.txt'])
329
+ with gr.Row():
330
+ zip_in = gr.File(label="Dataset ZIP")
331
+ path_in = gr.Textbox(label="Server path")
332
  with gr.Row():
333
+ yaml_in = gr.File(label="Custom YAML", file_types=['.yaml'])
334
  weights_in = gr.File(label="YOLO weights (.pt)")
335
+ with gr.Row():
336
+ blur_sl = gr.Slider(0,500,value=100,label="Blur threshold")
337
+ iou_sl = gr.Slider(0.0,1.0,value=0.5,label="IOU threshold")
338
+ conf_sl = gr.Slider(0.0,1.0,value=0.25,label="Min detection confidence")
339
  run_btn = gr.Button("Evaluate")
340
+ out_md = gr.Markdown()
341
+ out_df = gr.Dataframe()
342
+
343
+ def evaluate(api_key, url_txt, zip_file, server_path, yaml_file, weights,
344
+ blur_thr, iou_thr, conf_thr):
345
+ reports, dfs = [], []
346
+ cfg = QCConfig(blur_thr, iou_thr, conf_thr,
347
+ weights.name if weights else None)
348
+ rf = Roboflow(api_key) if api_key and Roboflow else None
349
+ # Roboflow batch
350
+ if url_txt:
351
+ for line in Path(url_txt.name).read_text().splitlines():
352
+ if not line.strip(): continue
353
+ try:
354
+ ds = download_rf_dataset(line, rf, TMP_ROOT)
355
+ imgs,lbls,_ = gather_dataset(ds,None)
356
+ md, df = run_quality(ds,None,lbls,imgs,cfg)
357
+ reports.append(md); dfs.append(df)
358
+ except Exception as e:
359
+ reports.append(f"### {line}\n⚠️ {e}")
360
+ # ZIP
361
+ if zip_file:
362
+ tmp=Path(tempfile.mkdtemp())
363
+ shutil.unpack_archive(zip_file.name,tmp)
364
+ imgs,lbls,_=gather_dataset(tmp,Path(yaml_file.name) if yaml_file else None)
365
+ md,df=run_quality(tmp,None,lbls,imgs,cfg)
366
+ reports.append(md); dfs.append(df)
367
+ shutil.rmtree(tmp)
368
+ # Server path
369
+ if server_path:
370
+ ds=Path(server_path)
371
+ imgs,lbls,_=gather_dataset(ds,Path(yaml_file.name) if yaml_file else None)
372
+ md,df=run_quality(ds,None,lbls,imgs,cfg)
373
+ reports.append(md); dfs.append(df)
374
+ summary='\n---\n'.join(reports)
375
+ combined = pd.concat(dfs).groupby(level=0).sum() if dfs else pd.DataFrame()
376
+ return summary, combined
377
+
378
+ run_btn.click(evaluate,
379
+ inputs=[api_in, url_txt, zip_in, path_in, yaml_in, weights_in,
380
+ blur_sl, iou_sl, conf_sl],
381
+ outputs=[out_md, out_df])
382
+
383
+ if __name__ == '__main__':
384
+ demo.launch(server_name='0.0.0.0', server_port=int(os.getenv('PORT',7860)))
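
For reference, here is a minimal sketch of how the pieces introduced in this commit (the `QCConfig` dataclass, `gather_dataset`, and the reworked `run_quality`) could be driven outside the Gradio UI. It is illustrative only: the dataset and weights paths are placeholders, and the optional dependencies (cv2, imagehash, fastdup, ultralytics, cleanlab) only take effect if installed.

```python
# Illustrative only — not part of the commit. Paths below are hypothetical.
from pathlib import Path

ds_root = Path("/data/my_dataset")           # placeholder dataset directory
cfg = QCConfig(
    blur_thr=100.0,   # Laplacian-variance threshold used by qc_image_quality
    iou_thr=0.5,      # GT vs. prediction IoU threshold used by qc_model_qa
    conf_thr=0.25,    # minimum detection confidence kept before matching
    weights=None,     # e.g. "yolov8n.pt" to enable Model QA / Label issues
)

imgs, lbls, meta = gather_dataset(ds_root, None)    # image paths, label paths, data.yaml
report_md, class_df = run_quality(ds_root, None, lbls, imgs, cfg)
print(report_md)      # per-check markdown report with the weighted overall score
```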