wuhp committed on
Commit 003a1e8 · verified · 1 Parent(s): 496d684

Update app.py

Files changed (1)
  1. app.py +295 -293
app.py CHANGED
@@ -69,7 +69,6 @@ DEFAULT_W = {
 logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
 
 _model_cache: dict[str, YOLO] = {}
-
 autoinc = 0 # helper for tmp‑dir names
 
 # ────────────────────────────────────────────────────────────────────────────
@@ -85,12 +84,10 @@ class QCConfig:
 batch_size: int = BATCH_SIZE
 sample_limit:int = SAMPLE_LIMIT
 
-
 def load_yaml(path: Path) -> Dict:
- with path.open("r", encoding="utf-8") as f:
 return yaml.safe_load(f)
 
-
 def parse_label_file(path: Path) -> list[tuple[int, float, float, float, float]]:
 if not path or not path.exists() or path.stat().st_size == 0:
 return []
@@ -102,21 +99,19 @@ def parse_label_file(path: Path) -> list[tuple[int, float, float, float, float]]
 except Exception:
 return []
 
-
 def guess_image_dirs(root: Path) -> List[Path]:
 candidates = [
- root / "images",
- root / "train" / "images",
- root / "valid" / "images",
- root / "val" / "images",
- root / "test" / "images",
 ]
 return [d for d in candidates if d.exists()]
 
-
 def gather_dataset(root: Path, yaml_path: Path | None):
 if yaml_path is None:
- yamls = list(root.glob("*.yaml"))
 if not yamls:
 raise FileNotFoundError("Dataset YAML not found")
 yaml_path = yamls[0]
@@ -124,15 +119,14 @@ def gather_dataset(root: Path, yaml_path: Path | None):
 img_dirs = guess_image_dirs(root)
 if not img_dirs:
 raise FileNotFoundError("images/ directory missing")
- imgs = [p for d in img_dirs for p in d.rglob("*.*") if imghdr.what(p)]
- labels_roots = {d.parent / "labels" for d in img_dirs}
 lbls = [
- next((lr / f"{p.stem}.txt" for lr in labels_roots if (lr / f"{p.stem}.txt").exists()), None)
 for p in imgs
 ]
 return imgs, lbls, meta
 
-
 def get_model(weights: str) -> YOLO | None:
 if not weights or YOLO is None:
 return None
@@ -140,249 +134,218 @@ def get_model(weights: str) -> YOLO | None:
 _model_cache[weights] = YOLO(weights)
 return _model_cache[weights]
 
- # ---------------------------------------------------------------------------
- # QUALITY‑EVALUATION (UNCHANGED from v3)
- # ---------------------------------------------------------------------------
- # --‑‑ <Functions qc_integrity / qc_class_balance / qc_image_quality ...>
- # **(unchanged – omitted for brevity; same as your previous v3 script)**
- # ---------------------------------------------------------------------------
-
- # ════════════════════════════════════════════════════════════════════════════
- # MERGE ✦ EDIT ✦ ZIP
- # ════════════════════════════════════════════════════════════════════════════
-
- # -------------------- Roboflow helpers --------------------
- RF_RE = re.compile(r"https?://universe\.roboflow\.com/([^/]+)/([^/]+)/(.*)")
-
- def parse_roboflow_url(url: str) -> tuple[str, str, int | None]:
- """
- Return (workspace, project, version|None) – tolerates many RF URL flavours.
- Any non‐positive or malformed version is treated as None.
- """
- m = RF_RE.match(url.strip())
- if not m:
- return None, None, None
- ws, proj, tail = m.groups()
- ver: int | None = None
-
- # explicit "dataset/<number>" in path
- if tail.startswith("dataset/"):
- try:
- v = int(tail.split("dataset/", 1)[1])
- if v > 0:
- ver = v
- except ValueError:
- pass
-
- # explicit "?version=<number>" in query
- if ver is None and "?version=" in url:
- try:
- v = int(url.split("?version=", 1)[1])
- if v > 0:
- ver = v
- except ValueError:
- pass
-
- return ws, proj, ver
-
-
- def get_latest_version(rf: Roboflow, ws: str, proj: str) -> str | None:
 try:
- p = rf.workspace(ws).project(proj)
- versions = p.versions()
- vnums = [int(getattr(v, "version_number", getattr(v, "number", 0))) for v in versions]
- return str(max(vnums)) if vnums else None
- except Exception as e:
- logging.warning(f"RF latest‑version lookup failed: {e}")
- return None
-
-
- def download_roboflow_dataset(
- url: str,
- rf_api_key: str,
- fmt: str = "yolov8",
- ) -> Tuple[Path, List[str], List[str]]:
- """Return (dataset_location, class_names, splits). Caches by folder name."""
- if Roboflow is None:
- raise RuntimeError("`roboflow` pip package not installed")
 
- ws, proj, ver = parse_roboflow_url(url)
- if not (ws and proj):
- raise ValueError(f"Bad Roboflow URL: {url!r}")
 
- rf = Roboflow(api_key=rf_api_key)
 
- # if no explicit version or invalid, fetch latest
- if not ver or ver <= 0:
- latest = get_latest_version(rf, ws, proj)
- if latest is None:
- raise RuntimeError("Could not resolve latest Roboflow version")
 try:
- ver = int(latest)
- except ValueError:
- raise RuntimeError(f"Invalid latest version returned: {latest!r}")
-
- ds_dir = TMP_ROOT / f"{ws}_{proj}_v{ver}"
- if ds_dir.exists():
- yaml_path = ds_dir / "data.yaml"
- class_names = load_yaml(yaml_path).get("names", []) if yaml_path.exists() else []
- splits = [s for s in ("train","valid","test") if (ds_dir / s).exists()]
- return ds_dir, class_names, splits
-
- ds_dir.mkdir(parents=True, exist_ok=True)
- rf.workspace(ws).project(proj).version(ver).download(fmt, location=str(ds_dir))
-
- yaml_path = ds_dir / "data.yaml"
- class_names = load_yaml(yaml_path).get("names", []) if yaml_path.exists() else []
- splits = [s for s in ("train","valid","test") if (ds_dir / s).exists()]
- return ds_dir, class_names, splits
-
-
- # -------------------- Merge helpers (adapted from Streamlit) --------------
-
- def gather_class_counts(dataset_info_list, class_name_mapping):
- counts = Counter()
- for dloc, class_names, splits, _ in dataset_info_list:
- for split in splits:
- labels_dir = Path(dloc) / split / "labels"
- if not labels_dir.exists():
- continue
- for lp in labels_dir.rglob("*.txt"):
- for cls_id, *_ in parse_label_file(lp):
- orig = class_names[int(cls_id)] if int(cls_id) < len(class_names) else None
- if orig is None:
 continue
- merged = class_name_mapping.get(orig, orig)
- counts[merged] += 1
- return dict(counts)
-
-
- def _process_label_file(label_path: Path, class_names_dataset, class_name_mapping):
- im_name = label_path.stem + label_path.suffix.replace(".txt", ".jpg")
- img_classes = set()
- for cls_id, *_ in parse_label_file(label_path):
- if 0 <= cls_id < len(class_names_dataset):
- orig = class_names_dataset[int(cls_id)]
- new = class_name_mapping.get(orig, orig)
- img_classes.add(new)
- return im_name, img_classes
-
-
- def merge_datasets(
- dataset_info_list: List[Tuple[str, List[str], List[str], str]],
- class_map_df: pd.DataFrame,
- out_dir: Path = Path("merged_dataset"),
- seed: int = 1234,
- ) -> Path:
- """Return path to merged dataset ready for training/eval."""
- random.seed(seed)
- if out_dir.exists():
- shutil.rmtree(out_dir, onerror=lambda f, p, _: (os.chmod(p, stat.S_IWRITE), f(p)))
- (out_dir / "train/images").mkdir(parents=True, exist_ok=True)
- (out_dir / "train/labels").mkdir(parents=True, exist_ok=True)
- (out_dir / "valid/images").mkdir(parents=True, exist_ok=True)
- (out_dir / "valid/labels").mkdir(parents=True, exist_ok=True)
-
- class_name_mapping = {
- row["original_class"]: row["new_name"] if not row["remove"] else "__REMOVED__"
- for _, row in class_map_df.iterrows()
- }
- limits_per_merged = {
- row["new_name"]: int(row["max_images"])
- for _, row in class_map_df.iterrows()
- if not row["remove"]
- }
- active_classes = [c for c in sorted(set(class_name_mapping.values())) if c != "__REMOVED__"]
- id_map = {cls: idx for idx, cls in enumerate(active_classes)}
-
- image_to_classes: dict[str, set[str]] = {}
- image_to_label: dict[str, Path] = {}
- class_to_images: dict[str, set[str]] = {c: set() for c in active_classes}
-
- for dloc, class_names_dataset, splits, _ in dataset_info_list:
- for split in splits:
- labels_root = Path(dloc) / split / "labels"
- if not labels_root.exists():
- continue
- for lp in labels_root.rglob("*.txt"):
- im_name, cls_set = _process_label_file(lp, class_names_dataset, class_name_mapping)
- cls_set = {c for c in cls_set if c in active_classes}
- if not cls_set:
- continue
- img_path = str(lp).replace("labels", "images").replace(".txt", ".jpg")
- image_to_classes[img_path] = cls_set
- image_to_label[img_path] = lp
- for c in cls_set:
- class_to_images[c].add(img_path)
-
- selected_images: set[str] = set()
- counters = {c: 0 for c in active_classes}
- shuffle_pool = [img for imgs in class_to_images.values() for img in imgs]
- random.shuffle(shuffle_pool)
-
- for img in shuffle_pool:
- cls_set = image_to_classes[img]
- if any(counters[c] >= limits_per_merged.get(c, 0) for c in cls_set):
- continue
- selected_images.add(img)
- for c in cls_set:
- counters[c] += 1
-
- for img in selected_images:
- split = "train" if random.random() < 0.9 else "valid"
- dst_img = out_dir / split / "images" / Path(img).name
- dst_img.parent.mkdir(parents=True, exist_ok=True)
- shutil.copy(img, dst_img)
-
- lp_src = image_to_label[img]
- dst_label = out_dir / split / "labels" / Path(lp_src).name
- dst_label.parent.mkdir(parents=True, exist_ok=True)
- with open(lp_src, "r") as f:
- lines = f.readlines()
- new_lines = []
- for line in lines:
- parts = line.strip().split()
- if not parts:
- continue
- cid = int(parts[0])
- dloc_match = next((cl for dloc2, cl, _, _ in dataset_info_list if str(lp_src).startswith(dloc2)), None)
- if dloc_match is None:
- continue
- orig_cls_name = dloc_match[cid] if cid < len(dloc_match) else None
- if orig_cls_name is None:
- continue
- merged_cls_name = class_name_mapping.get(orig_cls_name, orig_cls_name)
- if merged_cls_name not in active_classes:
- continue
- new_id = id_map[merged_cls_name]
- new_lines.append(" ".join([str(new_id)] + parts[1:]))
- if new_lines:
- with open(dst_label, "w") as f:
- f.write("\n".join(new_lines))
- else:
- (out_dir / split / "images" / Path(img).name).unlink(missing_ok=True)
-
- data_yaml = {
- "path": str(out_dir.resolve()),
- "train": "train/images",
- "val": "valid/images",
- "nc": len(active_classes),
- "names": active_classes,
- }
- with open(out_dir / "data.yaml", "w") as f:
- yaml.safe_dump(data_yaml, f)
-
- return out_dir
-
-
- def zip_directory(folder: Path) -> bytes:
- buf = io.BytesIO()
- with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
- for file in folder.rglob("*"):
- zf.write(file, arcname=file.relative_to(folder))
- buf.seek(0)
- return buf.getvalue()
-
 
 # ════════════════════════════════════════════════════════════════════════════
 # UI LAYER
@@ -393,7 +356,6 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
 _Evaluate • Merge • Edit • Download_
 """)
 
- # ------------------------------ EVALUATE TAB --------------------------
 with gr.Tab("Evaluate"):
 with gr.Row():
 api_in = gr.Textbox(label="Roboflow API key", type="password")
@@ -404,27 +366,74 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
 with gr.Row():
 yaml_in = gr.File(label="Custom YAML", file_types=['.yaml'])
 weights_in = gr.File(label="YOLO weights (.pt)")
- blur_sl = gr.Slider(0.0, 500.0, value=100.0, label="Blur threshold")
- iou_sl = gr.Slider(0.0, 1.0, value=0.5, label="IOU threshold")
- conf_sl = gr.Slider(0.0, 1.0, value=0.25, label="Min detection confidence")
- run_dup = gr.Checkbox(label="Check duplicates (fastdup)")
- run_qa = gr.Checkbox(label="Run Model QA & cleanlab")
- run_eval = gr.Button("Run Evaluation")
- out_md = gr.Markdown()
- out_df = gr.Dataframe(label="Class distribution")
-
- def _evaluate_cb(api_key, url_txt, zip_file, server_path, yaml_file, weights,
- blur_thr, iou_thr, conf_thr, run_dup, run_modelqa):
- return "Evaluation disabled in this trimmed snippet.", pd.DataFrame()
 
 run_eval.click(
 _evaluate_cb,
- [api_in, url_txt, zip_in, path_in, yaml_in, weights_in,
- blur_sl, iou_sl, conf_sl, run_dup, run_qa],
- [out_md, out_df]
 )
 
- # ------------------------------ MERGE TAB -----------------------------
 with gr.Tab("Merge / Edit"):
 gr.Markdown("### 1️⃣ Load one or more datasets")
 rf_key = gr.Textbox(label="Roboflow API key", type="password")
@@ -436,15 +445,15 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
 
 def _load_cb(rf_key, rf_urls_file, zip_files):
 global autoinc
- info_list = []
- log_lines = []
 
- if rf_urls_file is not None:
 for url in Path(rf_urls_file.name).read_text().splitlines():
 if not url.strip():
 continue
 try:
- ds, names, splits = download_roboflow_dataset(url, rf_key)
 info_list.append((str(ds), names, splits, Path(ds).name))
 log_lines.append(f"✔️ RF dataset **{Path(ds).name}** loaded ({len(names)} classes)")
 except Exception as e:
@@ -455,11 +464,11 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
 tmp = TMP_ROOT / f"zip_{autoinc}"
 tmp.mkdir(parents=True, exist_ok=True)
 shutil.unpack_archive(f.name, tmp)
- yaml_path = next(tmp.rglob("*.yaml"), None)
- if yaml_path is None:
 continue
- names = load_yaml(yaml_path).get("names", [])
- splits = [s for s in ("train","valid","test") if (tmp / s).exists()]
 info_list.append((str(tmp), names, splits, tmp.name))
 log_lines.append(f"✔️ ZIP **{tmp.name}** loaded")
 
@@ -469,22 +478,19 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
 
 gr.Markdown("### 2️⃣ Edit class mapping / limits / removal")
 class_df = gr.Dataframe(
- headers=["original_class", "new_name", "max_images", "remove"],
- datatype=["str", "str", "number", "bool"],
 interactive=True, elem_id="classdf"
 )
 refresh_btn = gr.Button("Build class table from loaded datasets")
 
 def _build_class_df(ds_info):
- class_names_all = []
- for _dloc, names, _spl, _ in ds_info:
- class_names_all.extend(names)
- class_names_all = sorted(set(class_names_all))
 return pd.DataFrame({
- "original_class": class_names_all,
- "new_name": class_names_all,
- "max_images": [99999] * len(class_names_all),
- "remove": [False] * len(class_names_all),
 })
 
 refresh_btn.click(_build_class_df, [ds_state], [class_df])
@@ -498,13 +504,9 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
 return None, "⚠️ Load datasets first."
 out_dir = merge_datasets(ds_info, class_df)
 zip_path = shutil.make_archive(str(out_dir), "zip", out_dir)
- return zip_path, (
- f"✅ Merged dataset created at **{out_dir}** with "
- f"{len(list(Path(out_dir).rglob('*.jpg')))} images."
- )
 
 merge_btn.click(_merge_cb, [ds_state, class_df], [zip_out, merge_log])
 
-
 if __name__ == "__main__":
 demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
 
 _model_cache: dict[str, YOLO] = {}
 autoinc = 0 # helper for tmp‑dir names
 
 # ────────────────────────────────────────────────────────────────────────────
 batch_size: int = BATCH_SIZE
 sample_limit:int = SAMPLE_LIMIT
 
 def load_yaml(path: Path) -> Dict:
+ with path.open('r', encoding='utf-8') as f:
 return yaml.safe_load(f)
 
 def parse_label_file(path: Path) -> list[tuple[int, float, float, float, float]]:
 if not path or not path.exists() or path.stat().st_size == 0:
 return []
 except Exception:
 return []
 
 def guess_image_dirs(root: Path) -> List[Path]:
 candidates = [
+ root/'images',
+ root/'train'/'images',
+ root/'valid'/'images',
+ root/'val' /'images',
+ root/'test' /'images',
 ]
 return [d for d in candidates if d.exists()]
 
 def gather_dataset(root: Path, yaml_path: Path | None):
 if yaml_path is None:
+ yamls = list(root.glob('*.yaml'))
 if not yamls:
 raise FileNotFoundError("Dataset YAML not found")
 yaml_path = yamls[0]
 img_dirs = guess_image_dirs(root)
 if not img_dirs:
 raise FileNotFoundError("images/ directory missing")
+ imgs = [p for d in img_dirs for p in d.rglob('*.*') if imghdr.what(p)]
+ labels_roots = {d.parent/'labels' for d in img_dirs}
 lbls = [
+ next((lr/f"{p.stem}.txt" for lr in labels_roots if (lr/f"{p.stem}.txt").exists()), None)
 for p in imgs
 ]
 return imgs, lbls, meta
 
 def get_model(weights: str) -> YOLO | None:
 if not weights or YOLO is None:
 return None
 _model_cache[weights] = YOLO(weights)
 return _model_cache[weights]
 
+ # ───────── Functions for I/O-bound concurrency ─────────────────────────────
+ def _quality_stat_args(args: Tuple[Path, float]) -> Tuple[Path, bool, bool, bool]:
+ path, thr = args
+ if cv2 is None:
+ return path, False, False, False
+ im = cv2.imread(str(path))
+ if im is None:
+ return path, False, False, False
+ gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
+ lap = cv2.Laplacian(gray, cv2.CV_64F).var()
+ mean = gray.mean()
+ return path, lap < thr, mean < 25, mean > 230
+
+ def _is_corrupt(path: Path) -> bool:
 try:
+ with Image.open(path) as im:
+ im.verify()
+ return False
+ except Exception:
+ return True
+
+ # ───────────────── Quality Checks ──────────────────────────────────────────
+ def qc_integrity(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
+ missing = [i for i, l in zip(imgs, lbls) if l is None]
+ corrupt = []
+ sample = imgs[:cfg.sample_limit]
+ with ThreadPoolExecutor(max_workers=cfg.cpu_count) as ex:
+ fut = {ex.submit(_is_corrupt, p): p for p in sample}
+ for f in as_completed(fut):
+ if f.result():
+ corrupt.append(fut[f])
+ score = 100 - (len(missing) + len(corrupt)) / max(len(imgs), 1) * 100
+ return {
+ "name": "Integrity",
+ "score": max(score, 0),
+ "details": {
+ "missing_label_files": [str(p) for p in missing],
+ "corrupt_images": [str(p) for p in corrupt],
+ }
+ }
 
+ def qc_class_balance(lbls: List[Path], cfg: QCConfig) -> Dict:
+ counts, boxes = Counter(), []
+ for l in lbls[:cfg.sample_limit]:
+ bs = parse_label_file(l) if l else []
+ boxes.append(len(bs))
+ counts.update(b[0] for b in bs)
+ if not counts:
+ return {"name":"Class balance","score":0,"details":"No labels"}
+ bal = min(counts.values()) / max(counts.values()) * 100
+ return {
+ "name":"Class balance",
+ "score":bal,
+ "details":{
+ "class_counts": dict(counts),
+ "boxes_per_image": {
+ "min": min(boxes),
+ "max": max(boxes),
+ "mean": float(np.mean(boxes))
+ }
+ }
+ }
 
+ def qc_image_quality(imgs: List[Path], cfg: QCConfig) -> Dict:
+ if cv2 is None:
+ return {"name":"Image quality","score":100,"details":"cv2 missing"}
+ blurry, dark, bright = [], [], []
+ sample = imgs[:cfg.sample_limit]
+ with ThreadPoolExecutor(max_workers=cfg.cpu_count) as ex:
+ args = [(p, cfg.blur_thr) for p in sample]
+ for p, isb, isd, isB in ex.map(_quality_stat_args, args):
+ if isb: blurry.append(p)
+ if isd: dark.append(p)
+ if isB: bright.append(p)
+ bad = len({*blurry, *dark, *bright})
+ score = 100 - bad / max(len(sample), 1) * 100
+ return {
+ "name":"Image quality",
+ "score":score,
+ "details":{
+ "blurry": [str(p) for p in blurry],
+ "dark": [str(p) for p in dark],
+ "bright": [str(p) for p in bright]
+ }
+ }
 
+ def qc_duplicates(imgs: List[Path], cfg: QCConfig) -> Dict:
+ if fastdup is not None and len(imgs) > 50:
 try:
+ fd = fastdup.create(
+ input_dir=str(Path(imgs[0]).parent.parent),
+ work_dir=str(TMP_ROOT / "fastdup")
+ )
+ fd.run()
+ try:
+ cc = fd.connected_components_grouped(sort_by="comp_size", ascending=False)
+ clusters = cc["files"].tolist() if "files" in cc.columns else cc.groupby("component")["filename"].apply(list).tolist()
+ except Exception:
+ clusters = fd.connected_components()
+ dup = sum(len(c) - 1 for c in clusters)
+ score = max(0.0, 100 - dup / len(imgs) * 100)
+ return {"name":"Duplicates","score":score,"details":{"groups":clusters[:50]}}
+ except Exception as e:
+ return {"name":"Duplicates","score":100.0,"details":{"fastdup_error":str(e)}}
+ return {"name":"Duplicates","score":100.0,"details":{"note":"skipped"}}
+
+ def _rel_iou(b1, b2):
+ x1, y1, w1, h1 = b1
+ x2, y2, w2, h2 = b2
+ xa1, ya1 = x1-w1/2, y1-h1/2
+ xa2, ya2 = x1+w1/2, y1+h1/2
+ xb1, yb1 = x2-w2/2, y2-h2/2
+ xb2, yb2 = x2+w2/2, y2+h2/2
+ ix1 = max(xa1, xb1); iy1 = max(ya1, yb1)
+ ix2 = min(xa2, xb2); iy2 = min(ya2, yb2)
+ inter = max(ix2-ix1,0)*max(iy2-iy1,0)
+ union = w1*h1 + w2*h2 - inter
+ return inter/union if union else 0.0
+
+ def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
+ model = get_model(cfg.weights)
+ if model is None:
+ return {"name":"Model QA","score":100,"details":"skipped"}
+ ious, mism = [], []
+ sample = imgs[:cfg.sample_limit]
+ for i in range(0, len(sample), cfg.batch_size):
+ batch = sample[i:i+cfg.batch_size]
+ results = model.predict(batch, verbose=False, half=True, dynamic=True)
+ for p, res in zip(batch, results):
+ gt = parse_label_file(Path(p).parent.parent/'labels'/f"{Path(p).stem}.txt")
+ for cls, x, y, w, h in gt:
+ best = 0.0
+ for b, c, conf in zip(res.boxes.xywh.cpu().numpy(),
+ res.boxes.cls.cpu().numpy(),
+ res.boxes.conf.cpu().numpy()):
+ if conf < cfg.conf_thr or int(c) != cls:
 continue
+ best = max(best, _rel_iou((x,y,w,h), tuple(b)))
+ ious.append(best)
+ if best < cfg.iou_thr:
+ mism.append(str(p))
+ miou = float(np.mean(ious)) if ious else 1.0
+ return {"name":"Model QA","score":miou*100,"details":{"mean_iou":miou,"mismatches":mism[:50]}}
+
+ def qc_label_issues(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
+ if get_noise_indices is None:
+ return {"name":"Label issues","score":100,"details":"skipped"}
+ labels, idxs = [], []
+ sample = imgs[:cfg.sample_limit]
+ for i, p in enumerate(sample):
+ bs = parse_label_file(lbls[i]) if lbls[i] else []
+ for cls, *_ in bs:
+ labels.append(int(cls)); idxs.append(i)
+ if not labels:
+ return {"name":"Label issues","score":100,"details":"no GT"}
+ labels_arr = np.array(labels)
+ uniq = sorted(set(labels_arr))
+ probs = np.eye(len(uniq))[np.searchsorted(uniq, labels_arr)]
+ noise = get_noise_indices(labels=labels_arr, probabilities=probs)
+ flags = sorted({idxs[n] for n in noise})
+ files = [str(sample[i]) for i in flags]
+ score = 100 - len(flags)/len(labels)*100
+ return {"name":"Label issues","score":score,"details":{"files":files[:50]}}
+
+ def aggregate(results: List[Dict]) -> float:
+ return sum(DEFAULT_W[r["name"]]*r["score"] for r in results)
+
+ # ───────────────── Roboflow TXT‑loading logic (from v3) ────────────────────
+ RF_RE = re.compile(r"https?://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")
+
+ def download_rf_dataset(url: str, rf_api: Roboflow, dest: Path) -> Path:
+ m = RF_RE.match(url.strip())
+ if not m:
+ raise ValueError(f"Bad RF URL: {url}")
+ ws, proj, ver = m.groups()
+ ds_dir = dest / f"{ws}_{proj}_v{ver}"
+ if ds_dir.exists():
+ return ds_dir
+ pr = rf_api.workspace(ws).project(proj)
+ pr.version(int(ver)).download("yolov8", location=str(ds_dir))
+ return ds_dir
+
+ def run_quality(
+ root: Path,
+ yaml_file: Path | None,
+ weights: Path | None,
+ cfg: QCConfig,
+ run_dup: bool,
+ run_modelqa: bool
+ ) -> Tuple[str, pd.DataFrame]:
+ imgs, lbls, meta = gather_dataset(root, yaml_file)
+ results = [
+ qc_integrity(imgs, lbls, cfg),
+ qc_class_balance(lbls, cfg),
+ qc_image_quality(imgs, cfg),
+ qc_duplicates(imgs, cfg) if run_dup else {"name":"Duplicates","score":100,"details":"skipped"},
+ qc_model_qa(imgs, lbls, cfg) if run_modelqa else {"name":"Model QA","score":100,"details":"skipped"},
+ qc_label_issues(imgs, lbls, cfg) if run_modelqa else {"name":"Label issues","score":100,"details":"skipped"},
+ ]
+ final = aggregate(results)
+ md = [f"## **{meta.get('name', root.name)}** — Score {final:.1f}/100"]
+ for r in results:
+ md.append(f"### {r['name']} {r['score']:.1f}")
+ md.append("<details><summary>details</summary>\n```json")
+ md.append(json.dumps(r["details"], indent=2))
+ md.append("```\n</details>\n")
+ df = pd.DataFrame.from_dict(
+ next(r for r in results if r["name"]=="Class balance")["details"]["class_counts"],
+ orient="index", columns=["count"]
+ )
+ df.index.name = "class"
+ return "\n".join(md), df
 
 # ════════════════════════════════════════════════════════════════════════════
 # UI LAYER
 _Evaluate • Merge • Edit • Download_
 """)
 
 with gr.Tab("Evaluate"):
 with gr.Row():
 api_in = gr.Textbox(label="Roboflow API key", type="password")
 with gr.Row():
 yaml_in = gr.File(label="Custom YAML", file_types=['.yaml'])
 weights_in = gr.File(label="YOLO weights (.pt)")
+ blur_sl = gr.Slider(0.0, 500.0, value=100.0, label="Blur threshold")
+ iou_sl = gr.Slider(0.0, 1.0, value=0.5, label="IOU threshold")
+ conf_sl = gr.Slider(0.0, 1.0, value=0.25, label="Min detection confidence")
+ run_dup = gr.Checkbox(label="Check duplicates (fastdup)", value=False)
+ run_modelqa= gr.Checkbox(label="Run Model QA & cleanlab", value=False)
+ run_eval = gr.Button("Run Evaluation")
+ out_md = gr.Markdown()
+ out_df = gr.Dataframe()
+
+ def _evaluate_cb(
+ api_key, url_txt, zip_file, server_path, yaml_file, weights,
+ blur_thr, iou_thr, conf_thr, run_dup, run_modelqa
+ ):
+ reports, dfs = [], []
+ cfg = QCConfig(blur_thr, iou_thr, conf_thr, weights.name if weights else None)
+ rf = Roboflow(api_key) if api_key and Roboflow else None
+
+ # Roboflow URLs via .txt
+ if url_txt:
+ for line in Path(url_txt.name).read_text().splitlines():
+ if not line.strip():
+ continue
+ try:
+ ds = download_rf_dataset(line, rf, TMP_ROOT)
+ md, df = run_quality(
+ ds, None,
+ Path(weights.name) if weights else None,
+ cfg, run_dup, run_modelqa
+ )
+ reports.append(md); dfs.append(df)
+ except Exception as e:
+ reports.append(f"### {line}\n⚠️ {e}")
+
+ # ZIP upload
+ if zip_file:
+ tmp = Path(tempfile.mkdtemp())
+ shutil.unpack_archive(zip_file.name, tmp)
+ md, df = run_quality(
+ tmp,
+ Path(yaml_file.name) if yaml_file else None,
+ Path(weights.name) if weights else None,
+ cfg, run_dup, run_modelqa
+ )
+ reports.append(md); dfs.append(df)
+ shutil.rmtree(tmp, ignore_errors=True)
+
+ # Server path
+ if server_path:
+ ds = Path(server_path)
+ md, df = run_quality(
+ ds,
+ Path(yaml_file.name) if yaml_file else None,
+ Path(weights.name) if weights else None,
+ cfg, run_dup, run_modelqa
+ )
+ reports.append(md); dfs.append(df)
+
+ summary = "\n---\n".join(reports) if reports else ""
+ combined = pd.concat(dfs).groupby(level=0).sum() if dfs else pd.DataFrame()
+ return summary, combined
 
 run_eval.click(
 _evaluate_cb,
+ inputs=[api_in, url_txt, zip_in, path_in, yaml_in, weights_in,
+ blur_sl, iou_sl, conf_sl, run_dup, run_modelqa],
+ outputs=[out_md, out_df]
 )
 
 with gr.Tab("Merge / Edit"):
 gr.Markdown("### 1️⃣ Load one or more datasets")
 rf_key = gr.Textbox(label="Roboflow API key", type="password")
 
 def _load_cb(rf_key, rf_urls_file, zip_files):
 global autoinc
+ info_list, log_lines = [], []
 
+ if rf_urls_file:
 for url in Path(rf_urls_file.name).read_text().splitlines():
 if not url.strip():
 continue
 try:
+ ds = download_rf_dataset(url, Roboflow(rf_key), TMP_ROOT)
+ names, splits = load_yaml(ds/"data.yaml").get("names", []), [s for s in ("train","valid","test") if (ds/s).exists()]
 info_list.append((str(ds), names, splits, Path(ds).name))
 log_lines.append(f"✔️ RF dataset **{Path(ds).name}** loaded ({len(names)} classes)")
 except Exception as e:
 tmp = TMP_ROOT / f"zip_{autoinc}"
 tmp.mkdir(parents=True, exist_ok=True)
 shutil.unpack_archive(f.name, tmp)
+ yaml_p = next(tmp.rglob("*.yaml"), None)
+ if not yaml_p:
 continue
+ names = load_yaml(yaml_p).get("names", [])
+ splits= [s for s in ("train","valid","test") if (tmp/s).exists()]
 info_list.append((str(tmp), names, splits, tmp.name))
 log_lines.append(f"✔️ ZIP **{tmp.name}** loaded")
 
 
 gr.Markdown("### 2️⃣ Edit class mapping / limits / removal")
 class_df = gr.Dataframe(
+ headers=["original_class","new_name","max_images","remove"],
+ datatype=["str","str","number","bool"],
 interactive=True, elem_id="classdf"
 )
 refresh_btn = gr.Button("Build class table from loaded datasets")
 
 def _build_class_df(ds_info):
+ all_names = sorted({n for _, names, _, _ in ds_info for n in names})
 return pd.DataFrame({
+ "original_class": all_names,
+ "new_name": all_names,
+ "max_images": [99999]*len(all_names),
+ "remove": [False]*len(all_names),
 })
 
 refresh_btn.click(_build_class_df, [ds_state], [class_df])
 return None, "⚠️ Load datasets first."
 out_dir = merge_datasets(ds_info, class_df)
 zip_path = shutil.make_archive(str(out_dir), "zip", out_dir)
+ return zip_path, f"✅ Merged dataset at **{out_dir}** with {len(list(Path(out_dir).rglob('*.jpg')))} images."
 
 merge_btn.click(_merge_cb, [ds_state, class_df], [zip_out, merge_log])
 
 if __name__ == "__main__":
 demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))