wuhp committed
Commit bd771a4 · verified · 1 Parent(s): 003a1e8

Update app.py

Files changed (1):
  1. app.py +169 -66
app.py CHANGED
@@ -72,7 +72,7 @@ _model_cache: dict[str, YOLO] = {}
 autoinc = 0  # helper for tmp‑dir names
 
 # ────────────────────────────────────────────────────────────────────────────
-# Data‑class & helpers reused from the original evaluation script
+# Data‑class & basic helpers
 # ────────────────────────────────────────────────────────────────────────────
 @dataclass
 class QCConfig:
@@ -88,6 +88,13 @@ def load_yaml(path: Path) -> Dict:
     with path.open('r', encoding='utf-8') as f:
         return yaml.safe_load(f)
 
+def load_class_names(yaml_path: Path) -> List[str]:
+    data = load_yaml(yaml_path)
+    names = data.get("names", [])
+    if isinstance(names, dict):
+        return [names[k] for k in sorted(names, key=lambda x: int(x))]
+    return list(names)
+
 def parse_label_file(path: Path) -> list[tuple[int, float, float, float, float]]:
     if not path or not path.exists() or path.stat().st_size == 0:
         return []
@@ -134,7 +141,7 @@ def get_model(weights: str) -> YOLO | None:
         _model_cache[weights] = YOLO(weights)
     return _model_cache[weights]
 
-# ───────── Functions for I/O-bound concurrency ─────────────────────────────
+# ───────── Concurrency helpers & QC functions ───────────────────────────────
 def _quality_stat_args(args: Tuple[Path, float]) -> Tuple[Path, bool, bool, bool]:
     path, thr = args
     if cv2 is None:
@@ -155,7 +162,6 @@ def _is_corrupt(path: Path) -> bool:
     except Exception:
         return True
 
-# ───────────────── Quality Checks ──────────────────────────────────────────
 def qc_integrity(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
     missing = [i for i, l in zip(imgs, lbls) if l is None]
     corrupt = []
@@ -171,7 +177,7 @@ def qc_integrity(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
         "score": max(score, 0),
         "details": {
             "missing_label_files": [str(p) for p in missing],
-            "corrupt_images": [str(p) for p in corrupt],
+            "corrupt_images": [str(p) for p in corrupt],
         }
     }
 
@@ -182,12 +188,12 @@ def qc_class_balance(lbls: List[Path], cfg: QCConfig) -> Dict:
         boxes.append(len(bs))
         counts.update(b[0] for b in bs)
     if not counts:
-        return {"name":"Class balance","score":0,"details":"No labels"}
+        return {"name": "Class balance", "score": 0, "details": "No labels"}
     bal = min(counts.values()) / max(counts.values()) * 100
     return {
-        "name":"Class balance",
-        "score":bal,
-        "details":{
+        "name": "Class balance",
+        "score": bal,
+        "details": {
             "class_counts": dict(counts),
             "boxes_per_image": {
                 "min": min(boxes),
@@ -199,7 +205,7 @@ def qc_class_balance(lbls: List[Path], cfg: QCConfig) -> Dict:
 
 def qc_image_quality(imgs: List[Path], cfg: QCConfig) -> Dict:
     if cv2 is None:
-        return {"name":"Image quality","score":100,"details":"cv2 missing"}
+        return {"name": "Image quality", "score": 100, "details": "cv2 missing"}
     blurry, dark, bright = [], [], []
     sample = imgs[:cfg.sample_limit]
     with ThreadPoolExecutor(max_workers=cfg.cpu_count) as ex:
@@ -211,11 +217,11 @@
     bad = len({*blurry, *dark, *bright})
     score = 100 - bad / max(len(sample), 1) * 100
     return {
-        "name":"Image quality",
-        "score":score,
-        "details":{
+        "name": "Image quality",
+        "score": score,
+        "details": {
             "blurry": [str(p) for p in blurry],
-            "dark": [str(p) for p in dark],
+            "dark": [str(p) for p in dark],
             "bright": [str(p) for p in bright]
         }
     }
@@ -230,33 +236,34 @@ def qc_duplicates(imgs: List[Path], cfg: QCConfig) -> Dict:
             fd.run()
             try:
                 cc = fd.connected_components_grouped(sort_by="comp_size", ascending=False)
-                clusters = cc["files"].tolist() if "files" in cc.columns else cc.groupby("component")["filename"].apply(list).tolist()
+                clusters = (cc["files"].tolist() if "files" in cc.columns
+                            else cc.groupby("component")["filename"].apply(list).tolist())
             except Exception:
                 clusters = fd.connected_components()
             dup = sum(len(c) - 1 for c in clusters)
             score = max(0.0, 100 - dup / len(imgs) * 100)
-            return {"name":"Duplicates","score":score,"details":{"groups":clusters[:50]}}
+            return {"name": "Duplicates", "score": score, "details": {"groups": clusters[:50]}}
         except Exception as e:
-            return {"name":"Duplicates","score":100.0,"details":{"fastdup_error":str(e)}}
-    return {"name":"Duplicates","score":100.0,"details":{"note":"skipped"}}
+            return {"name": "Duplicates", "score": 100.0, "details": {"fastdup_error": str(e)}}
+    return {"name": "Duplicates", "score": 100.0, "details": {"note": "skipped"}}
 
 def _rel_iou(b1, b2):
     x1, y1, w1, h1 = b1
     x2, y2, w2, h2 = b2
-    xa1, ya1 = x1-w1/2, y1-h1/2
-    xa2, ya2 = x1+w1/2, y1+h1/2
-    xb1, yb1 = x2-w2/2, y2-h2/2
-    xb2, yb2 = x2+w2/2, y2+h2/2
+    xa1, ya1 = x1 - w1/2, y1 - h1/2
+    xa2, ya2 = x1 + w1/2, y1 + h1/2
+    xb1, yb1 = x2 - w2/2, y2 - h2/2
+    xb2, yb2 = x2 + w2/2, y2 + h2/2
     ix1 = max(xa1, xb1); iy1 = max(ya1, yb1)
     ix2 = min(xa2, xb2); iy2 = min(ya2, yb2)
-    inter = max(ix2-ix1,0)*max(iy2-iy1,0)
+    inter = max(ix2 - ix1, 0) * max(iy2 - iy1, 0)
     union = w1*h1 + w2*h2 - inter
     return inter/union if union else 0.0
 
 def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
     model = get_model(cfg.weights)
     if model is None:
-        return {"name":"Model QA","score":100,"details":"skipped"}
+        return {"name": "Model QA", "score": 100, "details": "skipped"}
     ious, mism = [], []
     sample = imgs[:cfg.sample_limit]
     for i in range(0, len(sample), cfg.batch_size):
@@ -266,21 +273,23 @@ def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
             gt = parse_label_file(Path(p).parent.parent/'labels'/f"{Path(p).stem}.txt")
             for cls, x, y, w, h in gt:
                 best = 0.0
-                for b, c, conf in zip(res.boxes.xywh.cpu().numpy(),
-                                      res.boxes.cls.cpu().numpy(),
-                                      res.boxes.conf.cpu().numpy()):
+                for b, c, conf in zip(
+                    res.boxes.xywh.cpu().numpy(),
+                    res.boxes.cls.cpu().numpy(),
+                    res.boxes.conf.cpu().numpy()
+                ):
                     if conf < cfg.conf_thr or int(c) != cls:
                         continue
-                    best = max(best, _rel_iou((x,y,w,h), tuple(b)))
+                    best = max(best, _rel_iou((x, y, w, h), tuple(b)))
                 ious.append(best)
                 if best < cfg.iou_thr:
                     mism.append(str(p))
     miou = float(np.mean(ious)) if ious else 1.0
-    return {"name":"Model QA","score":miou*100,"details":{"mean_iou":miou,"mismatches":mism[:50]}}
+    return {"name": "Model QA", "score": miou*100, "details": {"mean_iou": miou, "mismatches": mism[:50]}}
 
 def qc_label_issues(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
     if get_noise_indices is None:
-        return {"name":"Label issues","score":100,"details":"skipped"}
+        return {"name": "Label issues", "score": 100, "details": "skipped"}
     labels, idxs = [], []
     sample = imgs[:cfg.sample_limit]
     for i, p in enumerate(sample):
@@ -288,20 +297,20 @@ def qc_label_issues(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
         for cls, *_ in bs:
             labels.append(int(cls)); idxs.append(i)
     if not labels:
-        return {"name":"Label issues","score":100,"details":"no GT"}
+        return {"name": "Label issues", "score": 100, "details": "no GT"}
     labels_arr = np.array(labels)
-    uniq = sorted(set(labels_arr))
-    probs = np.eye(len(uniq))[np.searchsorted(uniq, labels_arr)]
-    noise = get_noise_indices(labels=labels_arr, probabilities=probs)
-    flags = sorted({idxs[n] for n in noise})
-    files = [str(sample[i]) for i in flags]
-    score = 100 - len(flags)/len(labels)*100
-    return {"name":"Label issues","score":score,"details":{"files":files[:50]}}
+    uniq = sorted(set(labels_arr))
+    probs = np.eye(len(uniq))[np.searchsorted(uniq, labels_arr)]
+    noise = get_noise_indices(labels=labels_arr, probabilities=probs)
+    flags = sorted({idxs[n] for n in noise})
+    files = [str(sample[i]) for i in flags]
+    score = 100 - len(flags)/len(labels)*100
+    return {"name": "Label issues", "score": score, "details": {"files": files[:50]}}
 
 def aggregate(results: List[Dict]) -> float:
     return sum(DEFAULT_W[r["name"]]*r["score"] for r in results)
 
-# ───────────────── Roboflow TXT‑loading logic (from v3) ────────────────────
+# ───────────────── Roboflow TXT‑loading logic for both tabs ─────────────────
 RF_RE = re.compile(r"https?://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")
 
 def download_rf_dataset(url: str, rf_api: Roboflow, dest: Path) -> Path:
@@ -347,6 +356,102 @@ def run_quality(
     df.index.name = "class"
     return "\n".join(md), df
 
+def merge_datasets(
+    dataset_info_list: List[Tuple[str, List[str], List[str], str]],
+    class_map_df: pd.DataFrame,
+    out_dir: Path = Path("merged_dataset"),
+    seed: int = 1234,
+) -> Path:
+    random.seed(seed)
+    if out_dir.exists():
+        shutil.rmtree(out_dir, onerror=lambda f, p, _: (os.chmod(p, stat.S_IWRITE), f(p)))
+    for sub in ("train/images","train/labels","valid/images","valid/labels"):
+        (out_dir / sub).mkdir(parents=True, exist_ok=True)
+
+    class_name_mapping = {
+        row["original_class"]: row["new_name"] if not row["remove"] else "__REMOVED__"
+        for _, row in class_map_df.iterrows()
+    }
+    limits_per_merged = {
+        row["new_name"]: int(row["max_images"])
+        for _, row in class_map_df.iterrows()
+        if not row["remove"]
+    }
+    active_classes = [c for c in sorted(set(class_name_mapping.values())) if c != "__REMOVED__"]
+    id_map = {cls: idx for idx, cls in enumerate(active_classes)}
+
+    image_to_classes: dict[str, set[str]] = {}
+    image_to_label: dict[str, Path] = {}
+    class_to_images: dict[str, set[str]] = {c: set() for c in active_classes}
+
+    for dloc, class_names_dataset, splits, _ in dataset_info_list:
+        for split in splits:
+            labels_root = Path(dloc) / split / "labels"
+            if not labels_root.exists():
+                continue
+            for lp in labels_root.rglob("*.txt"):
+                im_name, cls_set = lp.stem + ".jpg", set()
+                for cls_id, *rest in parse_label_file(lp):
+                    orig = class_names_dataset[int(cls_id)] if int(cls_id) < len(class_names_dataset) else None
+                    if orig:
+                        new = class_name_mapping.get(orig, orig)
+                        if new in active_classes:
+                            cls_set.add(new)
+                if not cls_set:
+                    continue
+                img_path = str(lp.parent.parent / "images" / f"{lp.stem}.jpg")
+                image_to_classes[img_path] = cls_set
+                image_to_label[img_path] = lp
+                for c in cls_set:
+                    class_to_images[c].add(img_path)
+
+    selected_images = set()
+    counters = {c: 0 for c in active_classes}
+    pool = [img for imgs in class_to_images.values() for img in imgs]
+    random.shuffle(pool)
+
+    for img in pool:
+        cs = image_to_classes[img]
+        if any(counters[c] >= limits_per_merged.get(c, 0) for c in cs):
+            continue
+        selected_images.add(img)
+        for c in cs:
+            counters[c] += 1
+
+    for img in selected_images:
+        split = "train" if random.random() < 0.9 else "valid"
+        dst_img = out_dir / split / "images" / Path(img).name
+        dst_img.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy(img, dst_img)
+
+        lp_src = image_to_label[img]
+        dst_lbl = out_dir / split / "labels" / lp_src.name
+        dst_lbl.parent.mkdir(parents=True, exist_ok=True)
+        lines = lp_src.read_text().splitlines()
+        new_lines = []
+        for line in lines:
+            parts = line.split()
+            cid = int(parts[0])
+            orig = class_names_dataset[cid] if cid < len(class_names_dataset) else None
+            merged = class_name_mapping.get(orig, orig) if orig else None
+            if merged and merged in active_classes:
+                new_id = id_map[merged]
+                new_lines.append(" ".join([str(new_id)] + parts[1:]))
+        if new_lines:
+            dst_lbl.write_text("\n".join(new_lines))
+        else:
+            dst_img.unlink(missing_ok=True)
+
+    data_yaml = {
+        "path": str(out_dir.resolve()),
+        "train": "train/images",
+        "val": "valid/images",
+        "nc": len(active_classes),
+        "names": active_classes,
+    }
+    (out_dir / "data.yaml").write_text(yaml.safe_dump(data_yaml))
+    return out_dir
+
 # ════════════════════════════════════════════════════════════════════════════
 # UI LAYER
 # ════════════════════════════════════════════════════════════════════════════
@@ -357,20 +462,20 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
     """)
 
     with gr.Tab("Evaluate"):
-        with gr.Row():
-            api_in = gr.Textbox(label="Roboflow API key", type="password")
-            url_txt = gr.File(label=".txt of RF dataset URLs", file_types=['.txt'])
-        with gr.Row():
-            zip_in = gr.File(label="Dataset ZIP")
-            path_in = gr.Textbox(label="Server path")
-        with gr.Row():
-            yaml_in = gr.File(label="Custom YAML", file_types=['.yaml'])
-            weights_in = gr.File(label="YOLO weights (.pt)")
+        api_in = gr.Textbox(label="Roboflow API key", type="password")
+        url_txt = gr.File(label=".txt of RF dataset URLs", file_types=['.txt'])
+        zip_in = gr.File(label="Dataset ZIP")
+        path_in = gr.Textbox(label="Server path")
+        yaml_in = gr.File(label="Custom YAML", file_types=['.yaml'])
+        weights_in = gr.File(label="YOLO weights (.pt)")
+
         blur_sl = gr.Slider(0.0, 500.0, value=100.0, label="Blur threshold")
         iou_sl = gr.Slider(0.0, 1.0, value=0.5, label="IOU threshold")
         conf_sl = gr.Slider(0.0, 1.0, value=0.25, label="Min detection confidence")
+
         run_dup = gr.Checkbox(label="Check duplicates (fastdup)", value=False)
         run_modelqa= gr.Checkbox(label="Run Model QA & cleanlab", value=False)
+
         run_eval = gr.Button("Run Evaluation")
         out_md = gr.Markdown()
         out_df = gr.Dataframe()
@@ -383,11 +488,9 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
        cfg = QCConfig(blur_thr, iou_thr, conf_thr, weights.name if weights else None)
        rf = Roboflow(api_key) if api_key and Roboflow else None
 
-        # Roboflow URLs via .txt
-        if url_txt:
+        if url_txt and rf:
            for line in Path(url_txt.name).read_text().splitlines():
-                if not line.strip():
-                    continue
+                if not line.strip(): continue
                try:
                    ds = download_rf_dataset(line, rf, TMP_ROOT)
                    md, df = run_quality(
@@ -399,7 +502,6 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
                except Exception as e:
                    reports.append(f"### {line}\n⚠️ {e}")
 
-        # ZIP upload
        if zip_file:
            tmp = Path(tempfile.mkdtemp())
            shutil.unpack_archive(zip_file.name, tmp)
@@ -412,7 +514,6 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
            reports.append(md); dfs.append(df)
            shutil.rmtree(tmp, ignore_errors=True)
 
-        # Server path
        if server_path:
            ds = Path(server_path)
            md, df = run_quality(
@@ -446,14 +547,16 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
    def _load_cb(rf_key, rf_urls_file, zip_files):
        global autoinc
        info_list, log_lines = [], []
+        rf = Roboflow(rf_key) if rf_key and Roboflow else None
 
-        if rf_urls_file:
+        if rf_urls_file and rf:
            for url in Path(rf_urls_file.name).read_text().splitlines():
-                if not url.strip():
-                    continue
+                url = url.strip()
+                if not url: continue
                try:
-                    ds = download_rf_dataset(url, Roboflow(rf_key), TMP_ROOT)
-                    names, splits = load_yaml(ds/"data.yaml").get("names", []), [s for s in ("train","valid","test") if (ds/s).exists()]
+                    ds = download_rf_dataset(url, rf, TMP_ROOT)
+                    names = load_class_names(ds/"data.yaml")
+                    splits = [s for s in ("train","valid","test") if (ds/s).exists()]
                    info_list.append((str(ds), names, splits, Path(ds).name))
                    log_lines.append(f"✔️ RF dataset **{Path(ds).name}** loaded ({len(names)} classes)")
                except Exception as e:
@@ -465,14 +568,13 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
            tmp.mkdir(parents=True, exist_ok=True)
            shutil.unpack_archive(f.name, tmp)
            yaml_p = next(tmp.rglob("*.yaml"), None)
-            if not yaml_p:
-                continue
-            names = load_yaml(yaml_p).get("names", [])
-            splits= [s for s in ("train","valid","test") if (tmp/s).exists()]
-            info_list.append((str(tmp), names, splits, tmp.name))
-            log_lines.append(f"✔️ ZIP **{tmp.name}** loaded")
+            if yaml_p:
+                names = load_class_names(yaml_p)
+                splits= [s for s in ("train","valid","test") if (tmp/s).exists()]
+                info_list.append((str(tmp), names, splits, tmp.name))
+                log_lines.append(f"✔️ ZIP **{tmp.name}** loaded")
 
-        return info_list, "\n".join(log_lines) if log_lines else "No datasets loaded."
+        return info_list, "\n".join(log_lines) or "No datasets loaded."
 
    load_btn.click(_load_cb, [rf_key, rf_urls, zips_in], [ds_state, load_log])
 
@@ -504,7 +606,8 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
            return None, "⚠️ Load datasets first."
        out_dir = merge_datasets(ds_info, class_df)
        zip_path = shutil.make_archive(str(out_dir), "zip", out_dir)
-        return zip_path, f"✅ Merged dataset at **{out_dir}** with {len(list(Path(out_dir).rglob('*.jpg')))} images."
+        count = len(list(Path(out_dir).rglob("*.jpg")))
+        return zip_path, f"✅ Merged dataset at **{out_dir}** with {count} images."
 
    merge_btn.click(_merge_cb, [ds_state, class_df], [zip_out, merge_log])
 
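The new load_class_names helper normalizes the two `names:` layouts that YOLO data.yaml files use: a plain list, or an id-to-name mapping; the old `load_yaml(...).get("names", [])` call handled only the list form. Below is a minimal, self-contained check of the helper's behavior; the sample YAML contents and the demo_data.yaml filename are invented for illustration.

# Sketch: exercising load_class_names on both data.yaml layouts (sample data, not from the commit).
import yaml
from pathlib import Path
from typing import List

def load_yaml(path: Path) -> dict:
    with path.open('r', encoding='utf-8') as f:
        return yaml.safe_load(f)

def load_class_names(yaml_path: Path) -> List[str]:
    data = load_yaml(yaml_path)
    names = data.get("names", [])
    if isinstance(names, dict):   # mapping form, e.g. {0: "car", 1: "truck"}
        return [names[k] for k in sorted(names, key=lambda x: int(x))]
    return list(names)            # list form, e.g. ["car", "truck"]

p = Path("demo_data.yaml")
p.write_text("names:\n  0: car\n  1: truck\n", encoding="utf-8")
assert load_class_names(p) == ["car", "truck"]
p.write_text("names: [car, truck]\n", encoding="utf-8")
assert load_class_names(p) == ["car", "truck"]
p.unlink()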
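For reference, _rel_iou (only reformatted by this commit) computes IoU directly on YOLO-format boxes, i.e. (cx, cy, w, h) in image-relative coordinates, by converting centers to corners first. A worked check; the function body is copied from the diff, the test values are invented:

# Worked example for _rel_iou with hand-computed numbers.
def _rel_iou(b1, b2):
    x1, y1, w1, h1 = b1
    x2, y2, w2, h2 = b2
    xa1, ya1 = x1 - w1/2, y1 - h1/2
    xa2, ya2 = x1 + w1/2, y1 + h1/2
    xb1, yb1 = x2 - w2/2, y2 - h2/2
    xb2, yb2 = x2 + w2/2, y2 + h2/2
    ix1 = max(xa1, xb1); iy1 = max(ya1, yb1)
    ix2 = min(xa2, xb2); iy2 = min(ya2, yb2)
    inter = max(ix2 - ix1, 0) * max(iy2 - iy1, 0)
    union = w1*h1 + w2*h2 - inter
    return inter/union if union else 0.0

# Two 0.4 x 0.4 boxes whose centers differ by 0.1 in x:
# intersection = 0.3 * 0.4 = 0.12, union = 0.16 + 0.16 - 0.12 = 0.20, so IoU = 0.6.
assert abs(_rel_iou((0.5, 0.5, 0.4, 0.4), (0.6, 0.5, 0.4, 0.4)) - 0.6) < 1e-9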
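The selection pass in the new merge_datasets shuffles the candidate pool, then admits an image only while every class it contains is still below that class's max_images cap, so a multi-class image counts against all of its classes at once. A standalone sketch of just that step; the file names, class names, and limits are invented, and the logic mirrors the loop in the diff:

# Sketch of merge_datasets' quota-capped sampling step (toy data).
import random

image_to_classes = {                         # image -> merged classes present in it
    "a.jpg": {"car"}, "b.jpg": {"car", "truck"},
    "c.jpg": {"truck"}, "d.jpg": {"car"},
}
limits_per_merged = {"car": 2, "truck": 1}   # per-class image caps from the class table

random.seed(1234)
pool = list(image_to_classes)
random.shuffle(pool)

selected_images = set()
counters = {c: 0 for c in limits_per_merged}
for img in pool:
    cs = image_to_classes[img]
    # Skip the image if any of its classes has already hit its cap.
    if any(counters[c] >= limits_per_merged.get(c, 0) for c in cs):
        continue
    selected_images.add(img)
    for c in cs:
        counters[c] += 1

print(sorted(selected_images), counters)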