wuhp committed on
Commit
60db5ed
·
verified ·
1 Parent(s): ea2cc6e

Update app.py

Files changed (1)
  1. app.py +56 -52
app.py CHANGED
@@ -47,9 +47,9 @@ except ImportError:
47
  # ───────────────── Config & Constants ───────────────────────────────────────
48
  TMP_ROOT = Path(tempfile.gettempdir()) / "rf_datasets"
49
  TMP_ROOT.mkdir(parents=True, exist_ok=True)
50
- CPU_COUNT = int(os.getenv("QC_CPU", 1)) # force single-core by default
51
- BATCH_SIZE = int(os.getenv("QC_BATCH", 4)) # small batches
52
- SAMPLE_LIMIT = int(os.getenv("QC_SAMPLE", 200))
53
 
54
  DEFAULT_W = {
55
  "Integrity": 0.25,
@@ -70,7 +70,7 @@ class QCConfig:
70
  weights: str | None
71
  cpu_count: int = CPU_COUNT
72
  batch_size: int = BATCH_SIZE
73
- sample_limit:int = SAMPLE_LIMIT
74
 
75
  # ─────────── Helpers & Caching ─────────────────────────────────────────────
76
  def load_yaml(path: Path) -> Dict:
@@ -171,12 +171,12 @@ def qc_class_balance(lbls: List[Path], cfg: QCConfig) -> Dict:
171
  boxes.append(len(bs))
172
  counts.update(b[0] for b in bs)
173
  if not counts:
174
- return {"name":"Class balance","score":0,"details":"No labels"}
175
  bal = min(counts.values()) / max(counts.values()) * 100
176
  return {
177
- "name":"Class balance",
178
- "score":bal,
179
- "details":{
180
  "class_counts": dict(counts),
181
  "boxes_per_image": {
182
  "min": min(boxes),
@@ -188,7 +188,7 @@ def qc_class_balance(lbls: List[Path], cfg: QCConfig) -> Dict:
188
 
189
  def qc_image_quality(imgs: List[Path], cfg: QCConfig) -> Dict:
190
  if cv2 is None:
191
- return {"name":"Image quality","score":100,"details":"cv2 missing"}
192
  blurry, dark, bright = [], [], []
193
  sample = imgs[:cfg.sample_limit]
194
  with ThreadPoolExecutor(max_workers=cfg.cpu_count) as ex:
@@ -200,9 +200,9 @@ def qc_image_quality(imgs: List[Path], cfg: QCConfig) -> Dict:
200
  bad = len({*blurry, *dark, *bright})
201
  score = 100 - bad / max(len(sample), 1) * 100
202
  return {
203
- "name":"Image quality",
204
- "score":score,
205
- "details":{
206
  "blurry": [str(p) for p in blurry],
207
  "dark": [str(p) for p in dark],
208
  "bright": [str(p) for p in bright]
@@ -214,31 +214,36 @@ def qc_duplicates(imgs: List[Path], cfg: QCConfig) -> Dict:
214
  try:
215
  fd = fastdup.create(
216
  input_dir=str(Path(imgs[0]).parent.parent),
217
- work_dir=str(TMP_ROOT/'fastdup')
218
  )
219
  fd.run()
220
- clusters = fd.get_clusters()
221
- dup = sum(len(c)-1 for c in clusters)
 
222
  return {
223
- "name":"Duplicates",
224
- "score":100-dup/len(imgs)*100,
225
- "details":{"groups":clusters[:50]}
226
  }
227
  except Exception as e:
228
- return {"name":"Duplicates","score":100,"details":{"fastdup_error":str(e)}}
229
- return {"name":"Duplicates","score":100,"details":{"note":"skipped"}}
 
230
 
231
  def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
232
  model = get_model(cfg.weights)
233
  if model is None:
234
- return {"name":"Model QA","score":100,"details":"skipped"}
235
  ious, mism = [], []
236
  sample = imgs[:cfg.sample_limit]
237
  for i in range(0, len(sample), cfg.batch_size):
238
- batch = sample[i:i+cfg.batch_size]
239
  results = model.predict(batch, verbose=False, half=True, dynamic=True)
240
  for p, res in zip(batch, results):
241
- gt = parse_label_file(Path(p).parent.parent/'labels'/f"{Path(p).stem}.txt")
242
  for cls, x, y, w, h in gt:
243
  best = 0.0
244
  for b, c, conf in zip(
@@ -254,52 +259,51 @@ def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
254
  mism.append(str(p))
255
  miou = float(np.mean(ious)) if ious else 1.0
256
  return {
257
- "name":"Model QA",
258
- "score":miou*100,
259
- "details":{"mean_iou":miou, "mismatches":mism[:50]}
260
  }
261
 
262
  def qc_label_issues(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
263
  if get_noise_indices is None:
264
- return {"name":"Label issues","score":100,"details":"skipped"}
265
  labels, idxs = [], []
266
  sample = imgs[:cfg.sample_limit]
267
- model = get_model(cfg.weights)
268
  for i, p in enumerate(sample):
269
  bs = parse_label_file(lbls[i]) if lbls[i] else []
270
  for cls, *_ in bs:
271
  labels.append(int(cls))
272
  idxs.append(i)
273
  if not labels:
274
- return {"name":"Label issues","score":100,"details":"no GT"}
275
  labels_arr = np.array(labels)
276
  uniq = sorted(set(labels_arr))
277
  probs = np.eye(len(uniq))[np.searchsorted(uniq, labels_arr)]
278
  noise = get_noise_indices(labels=labels_arr, probabilities=probs)
279
  flags = sorted({idxs[n] for n in noise})
280
  files = [str(sample[i]) for i in flags]
281
- score = 100 - len(flags)/len(labels)*100
282
  return {
283
- "name":"Label issues",
284
- "score":score,
285
- "details":{"files":files[:50]}
286
  }
287
 
288
  def _rel_iou(b1, b2):
289
  x1, y1, w1, h1 = b1
290
  x2, y2, w2, h2 = b2
291
- xa1, ya1 = x1-w1/2, y1-h1/2
292
- xa2, ya2 = x1+w1/2, y1+h1/2
293
- xb1, yb1 = x2-w2/2, y2-h2/2
294
- xb2, yb2 = x2+w2/2, y2+h2/2
295
  ix1 = max(xa1, xb1); iy1 = max(ya1, yb1)
296
  ix2 = min(xa2, xb2); iy2 = min(ya2, yb2)
297
- inter = max(ix2-ix1, 0) * max(iy2-iy1, 0)
298
- union = w1*h1 + w2*h2 - inter
299
- return inter/union if union else 0.0
300
 
301
  def aggregate(results: List[Dict]) -> float:
302
- return sum(DEFAULT_W[r['name']]*r['score'] for r in results)
303
 
304
  RF_RE = re.compile(r"https?://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")
305
 
@@ -308,7 +312,7 @@ def download_rf_dataset(url: str, rf_api: Roboflow, dest: Path) -> Path:
308
  if not m:
309
  raise ValueError(f"Bad RF URL: {url}")
310
  ws, proj, ver = m.groups()
311
- ds_dir = dest/f"{ws}_{proj}_v{ver}"
312
  if ds_dir.exists():
313
  return ds_dir
314
  pr = rf_api.workspace(ws).project(proj)
@@ -328,9 +332,9 @@ def run_quality(
328
  qc_integrity(imgs, lbls, cfg),
329
  qc_class_balance(lbls, cfg),
330
  qc_image_quality(imgs, cfg),
331
- qc_duplicates(imgs, cfg) if run_dup else {"name":"Duplicates","score":100,"details":"skipped"},
332
- qc_model_qa(imgs, lbls, cfg) if run_modelqa else {"name":"Model QA","score":100,"details":"skipped"},
333
- qc_label_issues(imgs, lbls, cfg) if run_modelqa else {"name":"Label issues","score":100,"details":"skipped"},
334
  ]
335
  final = aggregate(results)
336
 
@@ -338,14 +342,14 @@ def run_quality(
338
  for r in results:
339
  md.append(f"### {r['name']} {r['score']:.1f}")
340
  md.append("<details><summary>details</summary>\n```json")
341
- md.append(json.dumps(r['details'], indent=2))
342
  md.append("```\n</details>\n")
343
 
344
  df = pd.DataFrame.from_dict(
345
- next(r for r in results if r['name']=='Class balance')['details']['class_counts'],
346
- orient='index', columns=['count']
347
  )
348
- df.index.name = 'class'
349
  return "\n".join(md), df
350
 
351
  with gr.Blocks(title="YOLO Dataset Quality Evaluator v3") as demo:
@@ -394,9 +398,9 @@ with gr.Blocks(title="YOLO Dataset Quality Evaluator v3") as demo:
394
  if not line.strip():
395
  continue
396
  try:
397
- ds = download_rf_dataset(line, rf, TMP_ROOT)
398
- md, df = run_quality(
399
- ds, None,
400
  Path(weights.name) if weights else None,
401
  cfg, run_dup, run_modelqa
402
  )
 
47
  # ───────────────── Config & Constants ───────────────────────────────────────
48
  TMP_ROOT = Path(tempfile.gettempdir()) / "rf_datasets"
49
  TMP_ROOT.mkdir(parents=True, exist_ok=True)
50
+ CPU_COUNT = int(os.getenv("QC_CPU", 1)) # force single-core by default
51
+ BATCH_SIZE = int(os.getenv("QC_BATCH", 4)) # small batches
52
+ SAMPLE_LIMIT = int(os.getenv("QC_SAMPLE", 200))
53
 
54
  DEFAULT_W = {
55
  "Integrity": 0.25,
 
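The three constants above are read from the environment at import time, so a deployment can tune concurrency, batch size, and sampling without editing the file. A minimal sketch of overriding them (the variable names come from the diff; the values and the `app` import are illustrative):

```python
import os

# Must be set before app.py is imported, since the constants
# are evaluated once at module load.
os.environ["QC_CPU"] = "4"       # worker threads for the image-quality pass
os.environ["QC_BATCH"] = "8"     # images per model.predict() call
os.environ["QC_SAMPLE"] = "500"  # cap on images sampled per check

import app  # hypothetical: picks up the overrides above
```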
70
  weights: str | None
71
  cpu_count: int = CPU_COUNT
72
  batch_size: int = BATCH_SIZE
73
+ sample_limit: int = SAMPLE_LIMIT
74
 
75
  # ─────────── Helpers & Caching ─────────────────────────────────────────────
76
  def load_yaml(path: Path) -> Dict:
 
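With `sample_limit` now spelled with a conventional space, every field except `weights` has an env-derived default, so call sites stay short. A minimal sketch of constructing the config (assuming the `QCConfig` dataclass shown in this hunk):

```python
# QCConfig is the dataclass defined above in app.py.
cfg = QCConfig(weights=None)  # no model, so Model QA is skipped
cfg_fast = QCConfig(weights="yolov8n.pt", batch_size=16, sample_limit=50)  # illustrative
```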
171
  boxes.append(len(bs))
172
  counts.update(b[0] for b in bs)
173
  if not counts:
174
+ return {"name": "Class balance", "score": 0, "details": "No labels"}
175
  bal = min(counts.values()) / max(counts.values()) * 100
176
  return {
177
+ "name": "Class balance",
178
+ "score": bal,
179
+ "details": {
180
  "class_counts": dict(counts),
181
  "boxes_per_image": {
182
  "min": min(boxes),
 
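The balance score is just the rarest-to-commonest class ratio scaled to 0–100, so a perfectly uniform dataset scores 100 and one starved class drags the whole metric down. A worked example with made-up counts:

```python
from collections import Counter

counts = Counter({0: 120, 1: 80, 2: 24})  # hypothetical class histogram
bal = min(counts.values()) / max(counts.values()) * 100
print(round(bal, 1))  # 24 / 120 * 100 = 20.0
```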
188
 
189
  def qc_image_quality(imgs: List[Path], cfg: QCConfig) -> Dict:
190
  if cv2 is None:
191
+ return {"name": "Image quality", "score": 100, "details": "cv2 missing"}
192
  blurry, dark, bright = [], [], []
193
  sample = imgs[:cfg.sample_limit]
194
  with ThreadPoolExecutor(max_workers=cfg.cpu_count) as ex:
 
200
  bad = len({*blurry, *dark, *bright})
201
  score = 100 - bad / max(len(sample), 1) * 100
202
  return {
203
+ "name": "Image quality",
204
+ "score": score,
205
+ "details": {
206
  "blurry": [str(p) for p in blurry],
207
  "dark": [str(p) for p in dark],
208
  "bright": [str(p) for p in bright]
 
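The per-image test itself sits outside this hunk, but the three buckets point at the usual recipe: Laplacian variance for blur and mean gray level for exposure. A minimal sketch of such a check, with purely illustrative thresholds:

```python
import cv2

def quality_flags(path: str, blur_thr: float = 100.0,
                  dark_thr: float = 40.0, bright_thr: float = 215.0):
    """Return (blurry, dark, bright) for one image; thresholds are guesses."""
    gray = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if gray is None:  # unreadable file: leave flagging to the integrity check
        return False, False, False
    blurry = cv2.Laplacian(gray, cv2.CV_64F).var() < blur_thr
    mean = float(gray.mean())
    return blurry, mean < dark_thr, mean > bright_thr
```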
214
  try:
215
  fd = fastdup.create(
216
  input_dir=str(Path(imgs[0]).parent.parent),
217
+ work_dir=str(TMP_ROOT / "fastdup")
218
  )
219
  fd.run()
220
+ cc = fd.connected_components_grouped(sort_by="comp_size", ascending=False)
221
+ clusters = cc["files"].tolist()
222
+ dup = sum(len(c) - 1 for c in clusters)
223
  return {
224
+ "name": "Duplicates",
225
+ "score": max(0.0, 100 - dup / len(imgs) * 100),
226
+ "details": {"groups": clusters[:50]}
227
  }
228
  except Exception as e:
229
+ return {
230
+ "name": "Duplicates",
231
+ "score": 100.0,
232
+ "details": {"fastdup_error": str(e)}
233
+ }
234
+ return {"name": "Duplicates", "score": 100.0, "details": {"note": "skipped"}}
235
 
236
  def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
237
  model = get_model(cfg.weights)
238
  if model is None:
239
+ return {"name": "Model QA", "score": 100, "details": "skipped"}
240
  ious, mism = [], []
241
  sample = imgs[:cfg.sample_limit]
242
  for i in range(0, len(sample), cfg.batch_size):
243
+ batch = sample[i:i + cfg.batch_size]
244
  results = model.predict(batch, verbose=False, half=True, dynamic=True)
245
  for p, res in zip(batch, results):
246
+ gt = parse_label_file(Path(p).parent.parent / 'labels' / f"{Path(p).stem}.txt")
247
  for cls, x, y, w, h in gt:
248
  best = 0.0
249
  for b, c, conf in zip(
 
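Each fastdup connected component is a cluster of near-duplicates, so every member beyond the first counts against the score, and the new `max(0.0, …)` clamp keeps heavily duplicated datasets from going negative. The arithmetic, sketched without fastdup:

```python
# Hypothetical near-duplicate clusters, as fastdup might group them.
clusters = [["a.jpg", "a_copy.jpg", "a_crop.jpg"], ["b.jpg", "b2.jpg"]]
n_images = 100

dup = sum(len(c) - 1 for c in clusters)       # 2 + 1 = 3 redundant images
score = max(0.0, 100 - dup / n_images * 100)  # 97.0
```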
259
  mism.append(str(p))
260
  miou = float(np.mean(ious)) if ious else 1.0
261
  return {
262
+ "name": "Model QA",
263
+ "score": miou * 100,
264
+ "details": {"mean_iou": miou, "mismatches": mism[:50]}
265
  }
266
 
267
  def qc_label_issues(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
268
  if get_noise_indices is None:
269
+ return {"name": "Label issues", "score": 100, "details": "skipped"}
270
  labels, idxs = [], []
271
  sample = imgs[:cfg.sample_limit]
 
272
  for i, p in enumerate(sample):
273
  bs = parse_label_file(lbls[i]) if lbls[i] else []
274
  for cls, *_ in bs:
275
  labels.append(int(cls))
276
  idxs.append(i)
277
  if not labels:
278
+ return {"name": "Label issues", "score": 100, "details": "no GT"}
279
  labels_arr = np.array(labels)
280
  uniq = sorted(set(labels_arr))
281
  probs = np.eye(len(uniq))[np.searchsorted(uniq, labels_arr)]
282
  noise = get_noise_indices(labels=labels_arr, probabilities=probs)
283
  flags = sorted({idxs[n] for n in noise})
284
  files = [str(sample[i]) for i in flags]
285
+ score = 100 - len(flags) / len(labels) * 100
286
  return {
287
+ "name": "Label issues",
288
+ "score": score,
289
+ "details": {"files": files[:50]}
290
  }
291
 
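No real classifier probabilities exist at this point, so the code hands cleanlab a one-hot matrix derived from the labels themselves; `np.searchsorted` maps each (possibly non-contiguous) class id onto a row of the identity matrix. A worked example:

```python
import numpy as np

labels_arr = np.array([3, 7, 3, 9])  # non-contiguous class ids
uniq = sorted(set(labels_arr))       # [3, 7, 9]
probs = np.eye(len(uniq))[np.searchsorted(uniq, labels_arr)]
# rows: [1,0,0], [0,1,0], [1,0,0], [0,0,1]
```

Note that one-hot inputs agree with the labels by construction, so cleanlab has little signal to work with here; the check becomes more meaningful once real model probabilities are wired in.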
292
  def _rel_iou(b1, b2):
293
  x1, y1, w1, h1 = b1
294
  x2, y2, w2, h2 = b2
295
+ xa1, ya1 = x1 - w1/2, y1 - h1/2
296
+ xa2, ya2 = x1 + w1/2, y1 + h1/2
297
+ xb1, yb1 = x2 - w2/2, y2 - h2/2
298
+ xb2, yb2 = x2 + w2/2, y2 + h2/2
299
  ix1 = max(xa1, xb1); iy1 = max(ya1, yb1)
300
  ix2 = min(xa2, xb2); iy2 = min(ya2, yb2)
301
+ inter = max(ix2 - ix1, 0) * max(iy2 - iy1, 0)
302
+ union = w1 * h1 + w2 * h2 - inter
303
+ return inter / union if union else 0.0
304
 
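`_rel_iou` works directly on YOLO's normalised center/width/height boxes, converting to corner form before intersecting. A quick sanity check (values computed by hand):

```python
# Two 0.4 x 0.4 boxes with centers 0.2 apart on x:
# intersection = 0.2 * 0.4 = 0.08, union = 0.16 + 0.16 - 0.08 = 0.24
assert abs(_rel_iou((0.5, 0.5, 0.4, 0.4), (0.7, 0.5, 0.4, 0.4)) - 1 / 3) < 1e-9
assert abs(_rel_iou((0.5, 0.5, 0.4, 0.4), (0.5, 0.5, 0.4, 0.4)) - 1.0) < 1e-9
```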
305
  def aggregate(results: List[Dict]) -> float:
306
+ return sum(DEFAULT_W[r["name"]] * r["score"] for r in results)
307
 
308
  RF_RE = re.compile(r"https?://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")
309
 
 
312
  if not m:
313
  raise ValueError(f"Bad RF URL: {url}")
314
  ws, proj, ver = m.groups()
315
+ ds_dir = dest / f"{ws}_{proj}_v{ver}"
316
  if ds_dir.exists():
317
  return ds_dir
318
  pr = rf_api.workspace(ws).project(proj)
 
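`RF_RE` pulls workspace, project, and version straight out of a Universe URL, and those three pieces become the cache directory name, which is why a repeat run returns early from the `ds_dir.exists()` check. An illustration with a made-up URL:

```python
import re

RF_RE = re.compile(r"https?://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")
m = RF_RE.match("https://universe.roboflow.com/acme/widgets/dataset/3")
ws, proj, ver = m.groups()    # ('acme', 'widgets', '3')
print(f"{ws}_{proj}_v{ver}")  # acme_widgets_v3 -> folder under TMP_ROOT
```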
332
  qc_integrity(imgs, lbls, cfg),
333
  qc_class_balance(lbls, cfg),
334
  qc_image_quality(imgs, cfg),
335
+ qc_duplicates(imgs, cfg) if run_dup else {"name": "Duplicates", "score": 100, "details": "skipped"},
336
+ qc_model_qa(imgs, lbls, cfg) if run_modelqa else {"name": "Model QA", "score": 100, "details": "skipped"},
337
+ qc_label_issues(imgs, lbls, cfg) if run_modelqa else {"name": "Label issues", "score": 100, "details": "skipped"},
338
  ]
339
  final = aggregate(results)
340
 
 
342
  for r in results:
343
  md.append(f"### {r['name']} {r['score']:.1f}")
344
  md.append("<details><summary>details</summary>\n```json")
345
+ md.append(json.dumps(r["details"], indent=2))
346
  md.append("```\n</details>\n")
347
 
348
  df = pd.DataFrame.from_dict(
349
+ next(r for r in results if r["name"] == "Class balance")["details"]["class_counts"],
350
+ orient="index", columns=["count"]
351
  )
352
+ df.index.name = "class"
353
  return "\n".join(md), df
354
 
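The class-balance details double as the table data: `orient="index"` turns the `class_counts` mapping into one row per class, and naming the index labels the class column. A small illustration with made-up counts:

```python
import pandas as pd

class_counts = {0: 120, 1: 80, 2: 24}  # hypothetical
df = pd.DataFrame.from_dict(class_counts, orient="index", columns=["count"])
df.index.name = "class"
#        count
# class
# 0        120
# 1         80
# 2         24
```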
355
  with gr.Blocks(title="YOLO Dataset Quality Evaluator v3") as demo:
 
398
  if not line.strip():
399
  continue
400
  try:
401
+ ds = download_rf_dataset(line, rf, TMP_ROOT)
402
+ md, df = run_quality(
403
+ ds, None,
404
  Path(weights.name) if weights else None,
405
  cfg, run_dup, run_modelqa
406
  )