Update app.py
app.py
CHANGED
@@ -69,7 +69,6 @@ DEFAULT_W = {
 logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
 
 _model_cache: dict[str, YOLO] = {}
-
 autoinc = 0  # helper for tmp-dir names
 
 # ────────────────────────────────────────────────────────────────────────────
@@ -85,12 +84,10 @@ class QCConfig:
     batch_size: int = BATCH_SIZE
     sample_limit:int = SAMPLE_LIMIT
 
-
 def load_yaml(path: Path) -> Dict:
-    with path.open() as f:
+    with path.open('r', encoding='utf-8') as f:
         return yaml.safe_load(f)
 
-
 def parse_label_file(path: Path) -> list[tuple[int, float, float, float, float]]:
     if not path or not path.exists() or path.stat().st_size == 0:
         return []
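For reference, a small standalone sketch (not from app.py) of the one-box-per-line YOLO label format that parse_label_file consumes; the sample values are invented:

```python
# One label line per box: "class x_center y_center width height",
# with coordinates normalised to [0, 1].
line = "3 0.512 0.430 0.220 0.180"
cls, *coords = line.split()
print((int(cls), *map(float, coords)))  # (3, 0.512, 0.43, 0.22, 0.18)
```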
@@ -102,21 +99,19 @@ def parse_label_file(path: Path) -> list[tuple[int, float, float, float, float]]
     except Exception:
         return []
 
-
 def guess_image_dirs(root: Path) -> List[Path]:
     candidates = [
-        root …
-        root …
-        root …
-        root …
-        root …
+        root/'images',
+        root/'train'/'images',
+        root/'valid'/'images',
+        root/'val' /'images',
+        root/'test' /'images',
     ]
     return [d for d in candidates if d.exists()]
 
-
 def gather_dataset(root: Path, yaml_path: Path | None):
     if yaml_path is None:
-        yamls = list(root.glob(…
+        yamls = list(root.glob('*.yaml'))
         if not yamls:
             raise FileNotFoundError("Dataset YAML not found")
         yaml_path = yamls[0]
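guess_image_dirs probes the usual YOLO export layouts rather than walking the whole tree. A minimal sketch of the candidate filter, using throwaway temp directories as stand-ins:

```python
from pathlib import Path
import tempfile

# Only train/images and valid/images exist here, so only they survive
# the d.exists() filter; images/, val/images and test/images drop out.
root = Path(tempfile.mkdtemp())
(root / "train" / "images").mkdir(parents=True)
(root / "valid" / "images").mkdir(parents=True)
candidates = [root/'images', root/'train'/'images', root/'valid'/'images',
              root/'val'/'images', root/'test'/'images']
print([str(d.relative_to(root)) for d in candidates if d.exists()])
# ['train/images', 'valid/images']
```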
@@ -124,15 +119,14 @@ def gather_dataset(root: Path, yaml_path: Path | None):
     img_dirs = guess_image_dirs(root)
     if not img_dirs:
         raise FileNotFoundError("images/ directory missing")
-    imgs = [p for d in img_dirs for p in d.rglob(…
-    labels_roots = {d.parent…
+    imgs = [p for d in img_dirs for p in d.rglob('*.*') if imghdr.what(p)]
+    labels_roots = {d.parent/'labels' for d in img_dirs}
     lbls = [
-        next((lr…
+        next((lr/f"{p.stem}.txt" for lr in labels_roots if (lr/f"{p.stem}.txt").exists()), None)
         for p in imgs
     ]
     return imgs, lbls, meta
 
-
 def get_model(weights: str) -> YOLO | None:
     if not weights or YOLO is None:
         return None
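The pairing rule added above is purely name-based. A small sketch of the convention, with hypothetical paths:

```python
from pathlib import Path

# train/images/foo.jpg is expected to pair with train/labels/foo.txt;
# gather_dataset pairs an image with None when no such file exists.
img = Path("train/images/foo.jpg")
lbl = img.parent.parent / "labels" / f"{img.stem}.txt"
print(lbl)  # train/labels/foo.txt
```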
@@ -140,249 +134,218 @@ def get_model(weights: str) -> YOLO | None:
     _model_cache[weights] = YOLO(weights)
     return _model_cache[weights]
 
-# …
-…
-def parse_roboflow_url(url: str) -> tuple[str, str, int | None]:
-    """
-    Return (workspace, project, version|None) – tolerates many RF URL flavours.
-    Any non-positive or malformed version is treated as None.
-    """
-    m = RF_RE.match(url.strip())
-    if not m:
-        return None, None, None
-    ws, proj, tail = m.groups()
-    ver: int | None = None
-
-    # explicit "dataset/<number>" in path
-    if tail.startswith("dataset/"):
-        try:
-            v = int(tail.split("dataset/", 1)[1])
-            if v > 0:
-                ver = v
-        except ValueError:
-            pass
-
-    # explicit "?version=<number>" in query
-    if ver is None and "?version=" in url:
-        try:
-            v = int(url.split("?version=", 1)[1])
-            if v > 0:
-                ver = v
-        except ValueError:
-            pass
-
-    return ws, proj, ver
-
-
-def get_latest_version(rf: Roboflow, ws: str, proj: str) -> str | None:
+# ───────── Functions for I/O-bound concurrency ─────────────────────────────
+def _quality_stat_args(args: Tuple[Path, float]) -> Tuple[Path, bool, bool, bool]:
+    path, thr = args
+    if cv2 is None:
+        return path, False, False, False
+    im = cv2.imread(str(path))
+    if im is None:
+        return path, False, False, False
+    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
+    lap = cv2.Laplacian(gray, cv2.CV_64F).var()
+    mean = gray.mean()
+    return path, lap < thr, mean < 25, mean > 230
+
+def _is_corrupt(path: Path) -> bool:
     try:
-…
+        with Image.open(path) as im:
+            im.verify()
+        return False
+    except Exception:
+        return True
+
+# ───────────────── Quality Checks ──────────────────────────────────────────
+def qc_integrity(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
+    missing = [i for i, l in zip(imgs, lbls) if l is None]
+    corrupt = []
+    sample = imgs[:cfg.sample_limit]
+    with ThreadPoolExecutor(max_workers=cfg.cpu_count) as ex:
+        fut = {ex.submit(_is_corrupt, p): p for p in sample}
+        for f in as_completed(fut):
+            if f.result():
+                corrupt.append(fut[f])
+    score = 100 - (len(missing) + len(corrupt)) / max(len(imgs), 1) * 100
+    return {
+        "name": "Integrity",
+        "score": max(score, 0),
+        "details": {
+            "missing_label_files": [str(p) for p in missing],
+            "corrupt_images": [str(p) for p in corrupt],
+        }
+    }
 
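qc_integrity above fans _is_corrupt out over a thread pool because PIL's verify() is I/O-bound. A self-contained sketch of the same submit/as_completed bookkeeping, with slow_check as a stand-in for _is_corrupt:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def slow_check(n):        # stand-in for _is_corrupt's PIL verify()
    return n % 3 == 0     # pretend every third item is corrupt

with ThreadPoolExecutor(max_workers=4) as ex:
    fut = {ex.submit(slow_check, n): n for n in range(9)}
    bad = sorted(fut[f] for f in as_completed(fut) if f.result())
print(bad)  # [0, 3, 6]
```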
-…
+def qc_class_balance(lbls: List[Path], cfg: QCConfig) -> Dict:
+    counts, boxes = Counter(), []
+    for l in lbls[:cfg.sample_limit]:
+        bs = parse_label_file(l) if l else []
+        boxes.append(len(bs))
+        counts.update(b[0] for b in bs)
+    if not counts:
+        return {"name":"Class balance","score":0,"details":"No labels"}
+    bal = min(counts.values()) / max(counts.values()) * 100
+    return {
+        "name":"Class balance",
+        "score":bal,
+        "details":{
+            "class_counts": dict(counts),
+            "boxes_per_image": {
+                "min": min(boxes),
+                "max": max(boxes),
+                "mean": float(np.mean(boxes))
+            }
+        }
+    }
 
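qc_class_balance scores balance as min/max of the per-class box counts. A two-class example of the arithmetic, with invented counts:

```python
from collections import Counter

counts = Counter({0: 40, 1: 10})  # boxes seen per class id
bal = min(counts.values()) / max(counts.values()) * 100
print(bal)  # 25.0: a 4:1 skew scores 25/100
```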
-…
+def qc_image_quality(imgs: List[Path], cfg: QCConfig) -> Dict:
+    if cv2 is None:
+        return {"name":"Image quality","score":100,"details":"cv2 missing"}
+    blurry, dark, bright = [], [], []
+    sample = imgs[:cfg.sample_limit]
+    with ThreadPoolExecutor(max_workers=cfg.cpu_count) as ex:
+        args = [(p, cfg.blur_thr) for p in sample]
+        for p, isb, isd, isB in ex.map(_quality_stat_args, args):
+            if isb: blurry.append(p)
+            if isd: dark.append(p)
+            if isB: bright.append(p)
+    bad = len({*blurry, *dark, *bright})
+    score = 100 - bad / max(len(sample), 1) * 100
+    return {
+        "name":"Image quality",
+        "score":score,
+        "details":{
+            "blurry": [str(p) for p in blurry],
+            "dark":   [str(p) for p in dark],
+            "bright": [str(p) for p in bright]
+        }
+    }
 
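qc_image_quality relies on _quality_stat_args (earlier in this hunk), which scores sharpness as the variance of the Laplacian. A standalone sanity check on synthetic arrays, assuming numpy and opencv-python are installed:

```python
import cv2
import numpy as np

# A flat image has ~zero Laplacian variance, so it reads as blurry for
# any sensible blur_thr; random noise scores orders of magnitude higher.
flat  = np.full((64, 64), 127, np.uint8)
noisy = np.random.default_rng(0).integers(0, 256, (64, 64)).astype(np.uint8)
for name, gray in [("flat", flat), ("noisy", noisy)]:
    print(name, cv2.Laplacian(gray, cv2.CV_64F).var())
```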
-…
-    if not …
-    latest = get_latest_version(rf, ws, proj)
-    if latest is None:
-        raise RuntimeError("Could not resolve latest Roboflow version")
+def qc_duplicates(imgs: List[Path], cfg: QCConfig) -> Dict:
+    if fastdup is not None and len(imgs) > 50:
         try:
-…
+            fd = fastdup.create(
+                input_dir=str(Path(imgs[0]).parent.parent),
+                work_dir=str(TMP_ROOT / "fastdup")
+            )
+            fd.run()
+            try:
+                cc = fd.connected_components_grouped(sort_by="comp_size", ascending=False)
+                clusters = cc["files"].tolist() if "files" in cc.columns else cc.groupby("component")["filename"].apply(list).tolist()
+            except Exception:
+                clusters = fd.connected_components()
+            dup = sum(len(c) - 1 for c in clusters)
+            score = max(0.0, 100 - dup / len(imgs) * 100)
+            return {"name":"Duplicates","score":score,"details":{"groups":clusters[:50]}}
+        except Exception as e:
+            return {"name":"Duplicates","score":100.0,"details":{"fastdup_error":str(e)}}
+    return {"name":"Duplicates","score":100.0,"details":{"note":"skipped"}}
+
+def _rel_iou(b1, b2):
+    x1, y1, w1, h1 = b1
+    x2, y2, w2, h2 = b2
+    xa1, ya1 = x1-w1/2, y1-h1/2
+    xa2, ya2 = x1+w1/2, y1+h1/2
+    xb1, yb1 = x2-w2/2, y2-h2/2
+    xb2, yb2 = x2+w2/2, y2+h2/2
+    ix1 = max(xa1, xb1); iy1 = max(ya1, yb1)
+    ix2 = min(xa2, xb2); iy2 = min(ya2, yb2)
+    inter = max(ix2-ix1,0)*max(iy2-iy1,0)
+    union = w1*h1 + w2*h2 - inter
+    return inter/union if union else 0.0
+
+def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
+    model = get_model(cfg.weights)
+    if model is None:
+        return {"name":"Model QA","score":100,"details":"skipped"}
+    ious, mism = [], []
+    sample = imgs[:cfg.sample_limit]
+    for i in range(0, len(sample), cfg.batch_size):
+        batch = sample[i:i+cfg.batch_size]
+        results = model.predict(batch, verbose=False, half=True, dynamic=True)
+        for p, res in zip(batch, results):
+            gt = parse_label_file(Path(p).parent.parent/'labels'/f"{Path(p).stem}.txt")
+            for cls, x, y, w, h in gt:
+                best = 0.0
+                for b, c, conf in zip(res.boxes.xywh.cpu().numpy(),
+                                      res.boxes.cls.cpu().numpy(),
+                                      res.boxes.conf.cpu().numpy()):
+                    if conf < cfg.conf_thr or int(c) != cls:
                         continue
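_rel_iou above works on normalised (x_center, y_center, w, h) boxes. A worked check of the two cases that matter for the iou_thr comparison, reusing the function as added:

```python
def _rel_iou(b1, b2):  # copied from the diff above
    x1, y1, w1, h1 = b1
    x2, y2, w2, h2 = b2
    xa1, ya1 = x1-w1/2, y1-h1/2
    xa2, ya2 = x1+w1/2, y1+h1/2
    xb1, yb1 = x2-w2/2, y2-h2/2
    xb2, yb2 = x2+w2/2, y2+h2/2
    ix1 = max(xa1, xb1); iy1 = max(ya1, yb1)
    ix2 = min(xa2, xb2); iy2 = min(ya2, yb2)
    inter = max(ix2-ix1,0)*max(iy2-iy1,0)
    union = w1*h1 + w2*h2 - inter
    return inter/union if union else 0.0

# Identical boxes -> 1.0; a half-width shift -> overlap 0.1*0.2 = 0.02,
# union 0.04 + 0.04 - 0.02 = 0.06, so IoU = 1/3.
print(_rel_iou((0.5, 0.5, 0.2, 0.2), (0.5, 0.5, 0.2, 0.2)))  # 1.0
print(_rel_iou((0.5, 0.5, 0.2, 0.2), (0.6, 0.5, 0.2, 0.2)))  # 0.3333...
```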
-…
-    ( …
-…
-    for img in selected_images:
-        split = "train" if random.random() < 0.9 else "valid"
-        dst_img = out_dir / split / "images" / Path(img).name
-        dst_img.parent.mkdir(parents=True, exist_ok=True)
-        shutil.copy(img, dst_img)
-
-        lp_src = image_to_label[img]
-        dst_label = out_dir / split / "labels" / Path(lp_src).name
-        dst_label.parent.mkdir(parents=True, exist_ok=True)
-        with open(lp_src, "r") as f:
-            lines = f.readlines()
-        new_lines = []
-        for line in lines:
-            parts = line.strip().split()
-            if not parts:
-                continue
-            cid = int(parts[0])
-            dloc_match = next((cl for dloc2, cl, _, _ in dataset_info_list if str(lp_src).startswith(dloc2)), None)
-            if dloc_match is None:
-                continue
-            orig_cls_name = dloc_match[cid] if cid < len(dloc_match) else None
-            if orig_cls_name is None:
-                continue
-            merged_cls_name = class_name_mapping.get(orig_cls_name, orig_cls_name)
-            if merged_cls_name not in active_classes:
-                continue
-            new_id = id_map[merged_cls_name]
-            new_lines.append(" ".join([str(new_id)] + parts[1:]))
-        if new_lines:
-            with open(dst_label, "w") as f:
-                f.write("\n".join(new_lines))
-        else:
-            (out_dir / split / "images" / Path(img).name).unlink(missing_ok=True)
-
-    data_yaml = {
-        "path": str(out_dir.resolve()),
-        "train": "train/images",
-        "val": "valid/images",
-        "nc": len(active_classes),
-        "names": active_classes,
-    }
-    with open(out_dir / "data.yaml", "w") as f:
-        yaml.safe_dump(data_yaml, f)
-
-    return out_dir
-
-
-def zip_directory(folder: Path) -> bytes:
-    buf = io.BytesIO()
-    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
-        for file in folder.rglob("*"):
-            zf.write(file, arcname=file.relative_to(folder))
-    buf.seek(0)
-    return buf.getvalue()
-
+                    best = max(best, _rel_iou((x,y,w,h), tuple(b)))
+                ious.append(best)
+                if best < cfg.iou_thr:
+                    mism.append(str(p))
+    miou = float(np.mean(ious)) if ious else 1.0
+    return {"name":"Model QA","score":miou*100,"details":{"mean_iou":miou,"mismatches":mism[:50]}}
+
+def qc_label_issues(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
+    if get_noise_indices is None:
+        return {"name":"Label issues","score":100,"details":"skipped"}
+    labels, idxs = [], []
+    sample = imgs[:cfg.sample_limit]
+    for i, p in enumerate(sample):
+        bs = parse_label_file(lbls[i]) if lbls[i] else []
+        for cls, *_ in bs:
+            labels.append(int(cls)); idxs.append(i)
+    if not labels:
+        return {"name":"Label issues","score":100,"details":"no GT"}
+    labels_arr = np.array(labels)
+    uniq = sorted(set(labels_arr))
+    probs = np.eye(len(uniq))[np.searchsorted(uniq, labels_arr)]
+    noise = get_noise_indices(labels=labels_arr, probabilities=probs)
+    flags = sorted({idxs[n] for n in noise})
+    files = [str(sample[i]) for i in flags]
+    score = 100 - len(flags)/len(labels)*100
+    return {"name":"Label issues","score":score,"details":{"files":files[:50]}}
+
+def aggregate(results: List[Dict]) -> float:
+    return sum(DEFAULT_W[r["name"]]*r["score"] for r in results)
+
+# ───────────────── Roboflow TXT-loading logic (from v3) ────────────────────
+RF_RE = re.compile(r"https?://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")
+
+def download_rf_dataset(url: str, rf_api: Roboflow, dest: Path) -> Path:
+    m = RF_RE.match(url.strip())
+    if not m:
+        raise ValueError(f"Bad RF URL: {url}")
+    ws, proj, ver = m.groups()
+    ds_dir = dest / f"{ws}_{proj}_v{ver}"
+    if ds_dir.exists():
+        return ds_dir
+    pr = rf_api.workspace(ws).project(proj)
+    pr.version(int(ver)).download("yolov8", location=str(ds_dir))
+    return ds_dir
+
+def run_quality(
+    root: Path,
+    yaml_file: Path | None,
+    weights: Path | None,
+    cfg: QCConfig,
+    run_dup: bool,
+    run_modelqa: bool
+) -> Tuple[str, pd.DataFrame]:
+    imgs, lbls, meta = gather_dataset(root, yaml_file)
+    results = [
+        qc_integrity(imgs, lbls, cfg),
+        qc_class_balance(lbls, cfg),
+        qc_image_quality(imgs, cfg),
+        qc_duplicates(imgs, cfg) if run_dup else {"name":"Duplicates","score":100,"details":"skipped"},
+        qc_model_qa(imgs, lbls, cfg) if run_modelqa else {"name":"Model QA","score":100,"details":"skipped"},
+        qc_label_issues(imgs, lbls, cfg) if run_modelqa else {"name":"Label issues","score":100,"details":"skipped"},
+    ]
+    final = aggregate(results)
+    md = [f"## **{meta.get('name', root.name)}** – Score {final:.1f}/100"]
+    for r in results:
+        md.append(f"### {r['name']} {r['score']:.1f}")
+        md.append("<details><summary>details</summary>\n```json")
+        md.append(json.dumps(r["details"], indent=2))
+        md.append("```\n</details>\n")
+    df = pd.DataFrame.from_dict(
+        next(r for r in results if r["name"]=="Class balance")["details"]["class_counts"],
+        orient="index", columns=["count"]
+    )
+    df.index.name = "class"
+    return "\n".join(md), df
 
 # ────────────────────────────────────────────────────────────────────────────
 # UI LAYER
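aggregate() folds the six check scores into one number using the DEFAULT_W weights defined near the top of app.py (not shown in this diff). A sketch of the arithmetic with placeholder weights that sum to 1:

```python
# Placeholder weights: the real DEFAULT_W values live earlier in app.py.
DEFAULT_W = {"Integrity": 0.3, "Class balance": 0.15, "Image quality": 0.15,
             "Duplicates": 0.1, "Model QA": 0.2, "Label issues": 0.1}
results = [
    {"name": "Integrity",     "score": 90.0},
    {"name": "Class balance", "score": 70.0},
    {"name": "Image quality", "score": 80.0},
    {"name": "Duplicates",    "score": 100.0},
    {"name": "Model QA",      "score": 60.0},
    {"name": "Label issues",  "score": 100.0},
]
print(sum(DEFAULT_W[r["name"]] * r["score"] for r in results))  # 81.5
```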
@@ -393,7 +356,6 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
     _Evaluate • Merge • Edit • Download_
     """)
 
-    # ------------------------------ EVALUATE TAB --------------------------
     with gr.Tab("Evaluate"):
         with gr.Row():
             api_in = gr.Textbox(label="Roboflow API key", type="password")
@@ -404,27 +366,74 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
         with gr.Row():
             yaml_in    = gr.File(label="Custom YAML", file_types=['.yaml'])
             weights_in = gr.File(label="YOLO weights (.pt)")
-            blur_sl …
-            iou_sl …
-            conf_sl …
-            run_dup …
-
-        run_eval …
-        out_md …
-        out_df …
-
-        def _evaluate_cb( …
-…
+            blur_sl    = gr.Slider(0.0, 500.0, value=100.0, label="Blur threshold")
+            iou_sl     = gr.Slider(0.0, 1.0, value=0.5, label="IOU threshold")
+            conf_sl    = gr.Slider(0.0, 1.0, value=0.25, label="Min detection confidence")
+            run_dup    = gr.Checkbox(label="Check duplicates (fastdup)", value=False)
+            run_modelqa= gr.Checkbox(label="Run Model QA & cleanlab", value=False)
+        run_eval = gr.Button("Run Evaluation")
+        out_md   = gr.Markdown()
+        out_df   = gr.Dataframe()
+
+        def _evaluate_cb(
+            api_key, url_txt, zip_file, server_path, yaml_file, weights,
+            blur_thr, iou_thr, conf_thr, run_dup, run_modelqa
+        ):
+            reports, dfs = [], []
+            cfg = QCConfig(blur_thr, iou_thr, conf_thr, weights.name if weights else None)
+            rf = Roboflow(api_key) if api_key and Roboflow else None
+
+            # Roboflow URLs via .txt
+            if url_txt:
+                for line in Path(url_txt.name).read_text().splitlines():
+                    if not line.strip():
+                        continue
+                    try:
+                        ds = download_rf_dataset(line, rf, TMP_ROOT)
+                        md, df = run_quality(
+                            ds, None,
+                            Path(weights.name) if weights else None,
+                            cfg, run_dup, run_modelqa
+                        )
+                        reports.append(md); dfs.append(df)
+                    except Exception as e:
+                        reports.append(f"### {line}\n⚠️ {e}")
+
+            # ZIP upload
+            if zip_file:
+                tmp = Path(tempfile.mkdtemp())
+                shutil.unpack_archive(zip_file.name, tmp)
+                md, df = run_quality(
+                    tmp,
+                    Path(yaml_file.name) if yaml_file else None,
+                    Path(weights.name) if weights else None,
+                    cfg, run_dup, run_modelqa
+                )
+                reports.append(md); dfs.append(df)
+                shutil.rmtree(tmp, ignore_errors=True)
+
+            # Server path
+            if server_path:
+                ds = Path(server_path)
+                md, df = run_quality(
+                    ds,
+                    Path(yaml_file.name) if yaml_file else None,
+                    Path(weights.name) if weights else None,
+                    cfg, run_dup, run_modelqa
+                )
+                reports.append(md); dfs.append(df)
+
+            summary  = "\n---\n".join(reports) if reports else ""
+            combined = pd.concat(dfs).groupby(level=0).sum() if dfs else pd.DataFrame()
+            return summary, combined
 
         run_eval.click(
             _evaluate_cb,
-            [api_in, url_txt, zip_in, path_in, yaml_in, weights_in,
-…
-            [out_md, out_df]
+            inputs=[api_in, url_txt, zip_in, path_in, yaml_in, weights_in,
+                    blur_sl, iou_sl, conf_sl, run_dup, run_modelqa],
+            outputs=[out_md, out_df]
         )
 
-    # ------------------------------ MERGE TAB -----------------------------
     with gr.Tab("Merge / Edit"):
         gr.Markdown("### 1️⃣ Load one or more datasets")
         rf_key = gr.Textbox(label="Roboflow API key", type="password")
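The run_eval.click wiring above follows the standard Gradio pattern: the values of the inputs list are passed positionally to the callback, and its return tuple fills the outputs list. A minimal self-contained sketch of the same pattern (assumes gradio is installed):

```python
import gradio as gr

with gr.Blocks() as demo:
    n1, n2 = gr.Number(value=2), gr.Number(value=3)
    total = gr.Number(label="sum")
    btn = gr.Button("Add")
    # Input components map positionally onto the callback's parameters.
    btn.click(lambda a, b: a + b, inputs=[n1, n2], outputs=[total])

# demo.launch()  # uncomment to serve locally
```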
@@ -436,15 +445,15 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
 
         def _load_cb(rf_key, rf_urls_file, zip_files):
             global autoinc
-            info_list = []
-            log_lines = []
+            info_list, log_lines = [], []
 
-            if rf_urls_file …
+            if rf_urls_file:
                 for url in Path(rf_urls_file.name).read_text().splitlines():
                     if not url.strip():
                         continue
                     try:
-                        ds …
+                        ds = download_rf_dataset(url, Roboflow(rf_key), TMP_ROOT)
+                        names, splits = load_yaml(ds/"data.yaml").get("names", []), [s for s in ("train","valid","test") if (ds/s).exists()]
                         info_list.append((str(ds), names, splits, Path(ds).name))
                         log_lines.append(f"✔️ RF dataset **{Path(ds).name}** loaded ({len(names)} classes)")
                     except Exception as e:
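_load_cb feeds each non-blank line of the uploaded TXT to download_rf_dataset, so only URLs matching the strict RF_RE added in this diff pass (an explicit /dataset/<n> version); the tolerant parse_roboflow_url removed above is gone. A quick check with a made-up workspace and project:

```python
import re

# The pattern as added in this diff.
RF_RE = re.compile(r"https?://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")

ok  = RF_RE.match("https://universe.roboflow.com/acme/widgets/dataset/4")
bad = RF_RE.match("https://universe.roboflow.com/acme/widgets")
print(ok.groups() if ok else None)  # ('acme', 'widgets', '4')
print(bad)                          # None: version-less URLs are rejected
```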
@@ -455,11 +464,11 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
                     tmp = TMP_ROOT / f"zip_{autoinc}"
                     tmp.mkdir(parents=True, exist_ok=True)
                     shutil.unpack_archive(f.name, tmp)
-                    …
-                    if …
+                    yaml_p = next(tmp.rglob("*.yaml"), None)
+                    if not yaml_p:
                         continue
-                    names = load_yaml( …
-                    splits …
+                    names = load_yaml(yaml_p).get("names", [])
+                    splits= [s for s in ("train","valid","test") if (tmp/s).exists()]
                     info_list.append((str(tmp), names, splits, tmp.name))
                     log_lines.append(f"✔️ ZIP **{tmp.name}** loaded")
 
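The yaml_p = next(tmp.rglob("*.yaml"), None) line added above avoids a StopIteration when an uploaded ZIP contains no dataset YAML. A tiny demonstration on a throwaway temp directory:

```python
from pathlib import Path
import tempfile

tmp = Path(tempfile.mkdtemp())
print(next(tmp.rglob("*.yaml"), None))  # None: nothing extracted yet
(tmp / "data.yaml").write_text("names: [car]\n")
print(next(tmp.rglob("*.yaml"), None))  # .../data.yaml
```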
@@ -469,22 +478,19 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
 
         gr.Markdown("### 2️⃣ Edit class mapping / limits / removal")
         class_df = gr.Dataframe(
-            headers=["original_class", …
-            datatype=["str", …
+            headers=["original_class","new_name","max_images","remove"],
+            datatype=["str","str","number","bool"],
             interactive=True, elem_id="classdf"
         )
         refresh_btn = gr.Button("Build class table from loaded datasets")
 
         def _build_class_df(ds_info):
-            class_names_all = []
-            for _dloc, names, _spl, _ in ds_info:
-                class_names_all.extend(names)
-            class_names_all = sorted(set(class_names_all))
+            all_names = sorted({n for _, names, _, _ in ds_info for n in names})
             return pd.DataFrame({
-                "original_class": …
-                "new_name": …
-                "max_images": [99999] …
-                "remove": [False] …
+                "original_class": all_names,
+                "new_name": all_names,
+                "max_images": [99999]*len(all_names),
+                "remove": [False]*len(all_names),
             })
 
         refresh_btn.click(_build_class_df, [ds_state], [class_df])
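_build_class_df now derives the table from a single set comprehension. What it returns, sketched with pandas directly (class names invented):

```python
import pandas as pd

all_names = ["car", "person"]
df = pd.DataFrame({
    "original_class": all_names,
    "new_name": all_names,                  # editable remap target
    "max_images": [99999] * len(all_names),
    "remove": [False] * len(all_names),
})
print(df)
```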
@@ -498,13 +504,9 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
                 return None, "⚠️ Load datasets first."
             out_dir = merge_datasets(ds_info, class_df)
             zip_path = shutil.make_archive(str(out_dir), "zip", out_dir)
-            return zip_path, (
-                f"✅ Merged dataset created at **{out_dir}** with "
-                f"{len(list(Path(out_dir).rglob('*.jpg')))} images."
-            )
+            return zip_path, f"✅ Merged dataset at **{out_dir}** with {len(list(Path(out_dir).rglob('*.jpg')))} images."
 
         merge_btn.click(_merge_cb, [ds_state, class_df], [zip_out, merge_log])
 
-
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))
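_merge_cb hands the merged tree to shutil.make_archive, which writes <out_dir>.zip next to it and returns the archive path; that string is what the download output receives. A round-trip sketch on a throwaway directory:

```python
import shutil, tempfile
from pathlib import Path

out_dir = Path(tempfile.mkdtemp()) / "merged"
(out_dir / "train" / "images").mkdir(parents=True)
zip_path = shutil.make_archive(str(out_dir), "zip", out_dir)
print(zip_path)  # .../merged.zip
```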