Update app.py
app.py
CHANGED
```diff
@@ -72,7 +72,7 @@ _model_cache: dict[str, YOLO] = {}
 autoinc = 0  # helper for tmp-dir names
 
 # ────────────────────────────────────────────────────────────────────────────
-# Data-class & helpers
+# Data-class & basic helpers
 # ────────────────────────────────────────────────────────────────────────────
 @dataclass
 class QCConfig:
@@ -88,6 +88,13 @@ def load_yaml(path: Path) -> Dict:
     with path.open('r', encoding='utf-8') as f:
         return yaml.safe_load(f)
 
+def load_class_names(yaml_path: Path) -> List[str]:
+    data = load_yaml(yaml_path)
+    names = data.get("names", [])
+    if isinstance(names, dict):
+        return [names[k] for k in sorted(names, key=lambda x: int(x))]
+    return list(names)
+
 def parse_label_file(path: Path) -> list[tuple[int, float, float, float, float]]:
     if not path or not path.exists() or path.stat().st_size == 0:
         return []
@@ -134,7 +141,7 @@ def get_model(weights: str) -> YOLO | None:
         _model_cache[weights] = YOLO(weights)
     return _model_cache[weights]
 
-# ─────────
+# ───────── Concurrency helpers & QC functions ───────────────────────────────
 def _quality_stat_args(args: Tuple[Path, float]) -> Tuple[Path, bool, bool, bool]:
     path, thr = args
     if cv2 is None:
@@ -155,7 +162,6 @@ def _is_corrupt(path: Path) -> bool:
     except Exception:
         return True
 
-# ───────────────── Quality Checks ──────────────────────────────────────────
 def qc_integrity(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
     missing = [i for i, l in zip(imgs, lbls) if l is None]
     corrupt = []
@@ -171,7 +177,7 @@ def qc_integrity(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
         "score": max(score, 0),
         "details": {
             "missing_label_files": [str(p) for p in missing],
-            "corrupt_images":
+            "corrupt_images": [str(p) for p in corrupt],
         }
     }
 
@@ -182,12 +188,12 @@ def qc_class_balance(lbls: List[Path], cfg: QCConfig) -> Dict:
         boxes.append(len(bs))
         counts.update(b[0] for b in bs)
     if not counts:
-        return {"name":"Class balance","score":0,"details":"No labels"}
+        return {"name": "Class balance", "score": 0, "details": "No labels"}
     bal = min(counts.values()) / max(counts.values()) * 100
     return {
-        "name":"Class balance",
-        "score":bal,
-        "details":{
+        "name": "Class balance",
+        "score": bal,
+        "details": {
             "class_counts": dict(counts),
             "boxes_per_image": {
                 "min": min(boxes),
@@ -199,7 +205,7 @@ def qc_class_balance(lbls: List[Path], cfg: QCConfig) -> Dict:
 
 def qc_image_quality(imgs: List[Path], cfg: QCConfig) -> Dict:
     if cv2 is None:
-        return {"name":"Image quality","score":100,"details":"cv2 missing"}
+        return {"name": "Image quality", "score": 100, "details": "cv2 missing"}
     blurry, dark, bright = [], [], []
     sample = imgs[:cfg.sample_limit]
     with ThreadPoolExecutor(max_workers=cfg.cpu_count) as ex:
@@ -211,11 +217,11 @@ def qc_image_quality(imgs: List[Path], cfg: QCConfig) -> Dict:
     bad = len({*blurry, *dark, *bright})
     score = 100 - bad / max(len(sample), 1) * 100
     return {
-        "name":"Image quality",
-        "score":score,
-        "details":{
+        "name": "Image quality",
+        "score": score,
+        "details": {
             "blurry": [str(p) for p in blurry],
-            "dark":
+            "dark": [str(p) for p in dark],
             "bright": [str(p) for p in bright]
         }
     }
@@ -230,33 +236,34 @@ def qc_duplicates(imgs: List[Path], cfg: QCConfig) -> Dict:
         fd.run()
         try:
             cc = fd.connected_components_grouped(sort_by="comp_size", ascending=False)
-            clusters = cc["files"].tolist() if "files" in cc.columns
+            clusters = (cc["files"].tolist() if "files" in cc.columns
+                        else cc.groupby("component")["filename"].apply(list).tolist())
         except Exception:
             clusters = fd.connected_components()
         dup = sum(len(c) - 1 for c in clusters)
         score = max(0.0, 100 - dup / len(imgs) * 100)
-        return {"name":"Duplicates","score":score,"details":{"groups":clusters[:50]}}
+        return {"name": "Duplicates", "score": score, "details": {"groups": clusters[:50]}}
     except Exception as e:
-        return {"name":"Duplicates","score":100.0,"details":{"fastdup_error":str(e)}}
-    return {"name":"Duplicates","score":100.0,"details":{"note":"skipped"}}
+        return {"name": "Duplicates", "score": 100.0, "details": {"fastdup_error": str(e)}}
+    return {"name": "Duplicates", "score": 100.0, "details": {"note": "skipped"}}
 
 def _rel_iou(b1, b2):
     x1, y1, w1, h1 = b1
     x2, y2, w2, h2 = b2
-    xa1, ya1 = x1-w1/2, y1-h1/2
-    xa2, ya2 = x1+w1/2, y1+h1/2
-    xb1, yb1 = x2-w2/2, y2-h2/2
-    xb2, yb2 = x2+w2/2, y2+h2/2
+    xa1, ya1 = x1 - w1/2, y1 - h1/2
+    xa2, ya2 = x1 + w1/2, y1 + h1/2
+    xb1, yb1 = x2 - w2/2, y2 - h2/2
+    xb2, yb2 = x2 + w2/2, y2 + h2/2
     ix1 = max(xa1, xb1); iy1 = max(ya1, yb1)
     ix2 = min(xa2, xb2); iy2 = min(ya2, yb2)
-    inter = max(ix2-ix1,0)*max(iy2-iy1,0)
+    inter = max(ix2 - ix1, 0) * max(iy2 - iy1, 0)
     union = w1*h1 + w2*h2 - inter
     return inter/union if union else 0.0
 
 def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
     model = get_model(cfg.weights)
     if model is None:
-        return {"name":"Model QA","score":100,"details":"skipped"}
+        return {"name": "Model QA", "score": 100, "details": "skipped"}
     ious, mism = [], []
     sample = imgs[:cfg.sample_limit]
     for i in range(0, len(sample), cfg.batch_size):
@@ -266,21 +273,23 @@ def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
             gt = parse_label_file(Path(p).parent.parent/'labels'/f"{Path(p).stem}.txt")
             for cls, x, y, w, h in gt:
                 best = 0.0
-                for b, c, conf in zip(
-
-
+                for b, c, conf in zip(
+                    res.boxes.xywh.cpu().numpy(),
+                    res.boxes.cls.cpu().numpy(),
+                    res.boxes.conf.cpu().numpy()
+                ):
                     if conf < cfg.conf_thr or int(c) != cls:
                         continue
-                    best = max(best, _rel_iou((x,y,w,h), tuple(b)))
+                    best = max(best, _rel_iou((x, y, w, h), tuple(b)))
                 ious.append(best)
                 if best < cfg.iou_thr:
                     mism.append(str(p))
     miou = float(np.mean(ious)) if ious else 1.0
-    return {"name":"Model QA","score":miou*100,"details":{"mean_iou":miou,"mismatches":mism[:50]}}
+    return {"name": "Model QA", "score": miou*100, "details": {"mean_iou": miou, "mismatches": mism[:50]}}
 
 def qc_label_issues(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
     if get_noise_indices is None:
-        return {"name":"Label issues","score":100,"details":"skipped"}
+        return {"name": "Label issues", "score": 100, "details": "skipped"}
     labels, idxs = [], []
     sample = imgs[:cfg.sample_limit]
     for i, p in enumerate(sample):
@@ -288,20 +297,20 @@ def qc_label_issues(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
         for cls, *_ in bs:
             labels.append(int(cls)); idxs.append(i)
     if not labels:
-        return {"name":"Label issues","score":100,"details":"no GT"}
+        return {"name": "Label issues", "score": 100, "details": "no GT"}
     labels_arr = np.array(labels)
-    uniq
-    probs
-    noise
-    flags
-    files
-    score
-    return {"name":"Label issues","score":score,"details":{"files":files[:50]}}
+    uniq = sorted(set(labels_arr))
+    probs = np.eye(len(uniq))[np.searchsorted(uniq, labels_arr)]
+    noise = get_noise_indices(labels=labels_arr, probabilities=probs)
+    flags = sorted({idxs[n] for n in noise})
+    files = [str(sample[i]) for i in flags]
+    score = 100 - len(flags)/len(labels)*100
+    return {"name": "Label issues", "score": score, "details": {"files": files[:50]}}
 
 def aggregate(results: List[Dict]) -> float:
     return sum(DEFAULT_W[r["name"]]*r["score"] for r in results)
 
-# ───────────────── Roboflow TXT-loading logic
+# ───────────────── Roboflow TXT-loading logic for both tabs ─────────────────
 RF_RE = re.compile(r"https?://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")
 
 def download_rf_dataset(url: str, rf_api: Roboflow, dest: Path) -> Path:
@@ -347,6 +356,102 @@ def run_quality(
     df.index.name = "class"
     return "\n".join(md), df
 
+def merge_datasets(
+    dataset_info_list: List[Tuple[str, List[str], List[str], str]],
+    class_map_df: pd.DataFrame,
+    out_dir: Path = Path("merged_dataset"),
+    seed: int = 1234,
+) -> Path:
+    random.seed(seed)
+    if out_dir.exists():
+        shutil.rmtree(out_dir, onerror=lambda f, p, _: (os.chmod(p, stat.S_IWRITE), f(p)))
+    for sub in ("train/images","train/labels","valid/images","valid/labels"):
+        (out_dir / sub).mkdir(parents=True, exist_ok=True)
+
+    class_name_mapping = {
+        row["original_class"]: row["new_name"] if not row["remove"] else "__REMOVED__"
+        for _, row in class_map_df.iterrows()
+    }
+    limits_per_merged = {
+        row["new_name"]: int(row["max_images"])
+        for _, row in class_map_df.iterrows()
+        if not row["remove"]
+    }
+    active_classes = [c for c in sorted(set(class_name_mapping.values())) if c != "__REMOVED__"]
+    id_map = {cls: idx for idx, cls in enumerate(active_classes)}
+
+    image_to_classes: dict[str, set[str]] = {}
+    image_to_label: dict[str, Path] = {}
+    class_to_images: dict[str, set[str]] = {c: set() for c in active_classes}
+
+    for dloc, class_names_dataset, splits, _ in dataset_info_list:
+        for split in splits:
+            labels_root = Path(dloc) / split / "labels"
+            if not labels_root.exists():
+                continue
+            for lp in labels_root.rglob("*.txt"):
+                im_name, cls_set = lp.stem + ".jpg", set()
+                for cls_id, *rest in parse_label_file(lp):
+                    orig = class_names_dataset[int(cls_id)] if int(cls_id) < len(class_names_dataset) else None
+                    if orig:
+                        new = class_name_mapping.get(orig, orig)
+                        if new in active_classes:
+                            cls_set.add(new)
+                if not cls_set:
+                    continue
+                img_path = str(lp.parent.parent / "images" / f"{lp.stem}.jpg")
+                image_to_classes[img_path] = cls_set
+                image_to_label[img_path] = lp
+                for c in cls_set:
+                    class_to_images[c].add(img_path)
+
+    selected_images = set()
+    counters = {c: 0 for c in active_classes}
+    pool = [img for imgs in class_to_images.values() for img in imgs]
+    random.shuffle(pool)
+
+    for img in pool:
+        cs = image_to_classes[img]
+        if any(counters[c] >= limits_per_merged.get(c, 0) for c in cs):
+            continue
+        selected_images.add(img)
+        for c in cs:
+            counters[c] += 1
+
+    for img in selected_images:
+        split = "train" if random.random() < 0.9 else "valid"
+        dst_img = out_dir / split / "images" / Path(img).name
+        dst_img.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy(img, dst_img)
+
+        lp_src = image_to_label[img]
+        dst_lbl = out_dir / split / "labels" / lp_src.name
+        dst_lbl.parent.mkdir(parents=True, exist_ok=True)
+        lines = lp_src.read_text().splitlines()
+        new_lines = []
+        for line in lines:
+            parts = line.split()
+            cid = int(parts[0])
+            orig = class_names_dataset[cid] if cid < len(class_names_dataset) else None
+            merged = class_name_mapping.get(orig, orig) if orig else None
+            if merged and merged in active_classes:
+                new_id = id_map[merged]
+                new_lines.append(" ".join([str(new_id)] + parts[1:]))
+        if new_lines:
+            dst_lbl.write_text("\n".join(new_lines))
+        else:
+            dst_img.unlink(missing_ok=True)
+
+    data_yaml = {
+        "path": str(out_dir.resolve()),
+        "train": "train/images",
+        "val": "valid/images",
+        "nc": len(active_classes),
+        "names": active_classes,
+    }
+    (out_dir / "data.yaml").write_text(yaml.safe_dump(data_yaml))
+    return out_dir
+
 # ────────────────────────────────────────────────────────────────────────────
 # UI LAYER
 # ────────────────────────────────────────────────────────────────────────────
@@ -357,20 +462,20 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
     """)
 
     with gr.Tab("Evaluate"):
-
-
-
-
-
-
-
-        yaml_in = gr.File(label="Custom YAML", file_types=['.yaml'])
-        weights_in = gr.File(label="YOLO weights (.pt)")
+        api_in = gr.Textbox(label="Roboflow API key", type="password")
+        url_txt = gr.File(label=".txt of RF dataset URLs", file_types=['.txt'])
+        zip_in = gr.File(label="Dataset ZIP")
+        path_in = gr.Textbox(label="Server path")
+        yaml_in = gr.File(label="Custom YAML", file_types=['.yaml'])
+        weights_in = gr.File(label="YOLO weights (.pt)")
+
         blur_sl = gr.Slider(0.0, 500.0, value=100.0, label="Blur threshold")
         iou_sl = gr.Slider(0.0, 1.0, value=0.5, label="IOU threshold")
        conf_sl = gr.Slider(0.0, 1.0, value=0.25, label="Min detection confidence")
+
         run_dup = gr.Checkbox(label="Check duplicates (fastdup)", value=False)
         run_modelqa= gr.Checkbox(label="Run Model QA & cleanlab", value=False)
+
         run_eval = gr.Button("Run Evaluation")
         out_md = gr.Markdown()
         out_df = gr.Dataframe()
@@ -383,11 +488,9 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
         cfg = QCConfig(blur_thr, iou_thr, conf_thr, weights.name if weights else None)
         rf = Roboflow(api_key) if api_key and Roboflow else None
 
-
-        if url_txt:
+        if url_txt and rf:
             for line in Path(url_txt.name).read_text().splitlines():
-                if not line.strip():
-                    continue
+                if not line.strip(): continue
                 try:
                     ds = download_rf_dataset(line, rf, TMP_ROOT)
                     md, df = run_quality(
@@ -399,7 +502,6 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
                 except Exception as e:
                     reports.append(f"### {line}\n⚠️ {e}")
 
-        # ZIP upload
         if zip_file:
             tmp = Path(tempfile.mkdtemp())
             shutil.unpack_archive(zip_file.name, tmp)
@@ -412,7 +514,6 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
             reports.append(md); dfs.append(df)
             shutil.rmtree(tmp, ignore_errors=True)
 
-        # Server path
         if server_path:
             ds = Path(server_path)
             md, df = run_quality(
@@ -446,14 +547,16 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
     def _load_cb(rf_key, rf_urls_file, zip_files):
         global autoinc
        info_list, log_lines = [], []
+        rf = Roboflow(rf_key) if rf_key and Roboflow else None
 
-        if rf_urls_file:
+        if rf_urls_file and rf:
             for url in Path(rf_urls_file.name).read_text().splitlines():
-
-
+                url = url.strip()
+                if not url: continue
                 try:
-                    ds = download_rf_dataset(url,
-                    names
+                    ds = download_rf_dataset(url, rf, TMP_ROOT)
+                    names = load_class_names(ds/"data.yaml")
+                    splits = [s for s in ("train","valid","test") if (ds/s).exists()]
                     info_list.append((str(ds), names, splits, Path(ds).name))
                     log_lines.append(f"✔️ RF dataset **{Path(ds).name}** loaded ({len(names)} classes)")
                 except Exception as e:
@@ -465,14 +568,13 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
             tmp.mkdir(parents=True, exist_ok=True)
             shutil.unpack_archive(f.name, tmp)
             yaml_p = next(tmp.rglob("*.yaml"), None)
-            if
-
-
-
-
-            log_lines.append(f"✔️ ZIP **{tmp.name}** loaded")
+            if yaml_p:
+                names = load_class_names(yaml_p)
+                splits = [s for s in ("train","valid","test") if (tmp/s).exists()]
+                info_list.append((str(tmp), names, splits, tmp.name))
+                log_lines.append(f"✔️ ZIP **{tmp.name}** loaded")
 
-        return info_list, "\n".join(log_lines)
+        return info_list, "\n".join(log_lines) or "No datasets loaded."
 
     load_btn.click(_load_cb, [rf_key, rf_urls, zips_in], [ds_state, load_log])
 
@@ -504,7 +606,8 @@ with gr.Blocks(css="#classdf td{min-width:120px}") as demo:
             return None, "⚠️ Load datasets first."
         out_dir = merge_datasets(ds_info, class_df)
         zip_path = shutil.make_archive(str(out_dir), "zip", out_dir)
-
+        count = len(list(Path(out_dir).rglob("*.jpg")))
+        return zip_path, f"✅ Merged dataset at **{out_dir}** with {count} images."
 
     merge_btn.click(_merge_cb, [ds_state, class_df], [zip_out, merge_log])
 
```
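A few of the new pieces are worth a closer look.

The new `load_class_names` helper normalizes the two shapes that `names` can take in an Ultralytics-style `data.yaml`: a plain list, or a dict keyed by class id. A minimal standalone sketch of that behavior (the class names below are invented):

```python
import yaml

# The two YAML shapes the helper accepts; both decode to the same class list.
list_style = yaml.safe_load("names: [person, car, dog]")
dict_style = yaml.safe_load("names: {0: person, 1: car, 2: dog}")

def normalize(names):
    # Mirrors load_class_names: dict keys are class ids, so sort numerically.
    if isinstance(names, dict):
        return [names[k] for k in sorted(names, key=lambda x: int(x))]
    return list(names)

assert normalize(list_style["names"]) == ["person", "car", "dog"]
assert normalize(dict_style["names"]) == ["person", "car", "dog"]
```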
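The reformatted `_rel_iou` computes IoU directly on YOLO-normalized `(cx, cy, w, h)` boxes by converting centers to corner coordinates. A quick sanity check with made-up boxes:

```python
def _rel_iou(b1, b2):
    # Same code as in the diff: center/size to corners, then intersect and union.
    x1, y1, w1, h1 = b1
    x2, y2, w2, h2 = b2
    xa1, ya1 = x1 - w1/2, y1 - h1/2
    xa2, ya2 = x1 + w1/2, y1 + h1/2
    xb1, yb1 = x2 - w2/2, y2 - h2/2
    xb2, yb2 = x2 + w2/2, y2 + h2/2
    ix1 = max(xa1, xb1); iy1 = max(ya1, yb1)
    ix2 = min(xa2, xb2); iy2 = min(ya2, yb2)
    inter = max(ix2 - ix1, 0) * max(iy2 - iy1, 0)
    union = w1*h1 + w2*h2 - inter
    return inter/union if union else 0.0

assert abs(_rel_iou((0.5, 0.5, 0.2, 0.2), (0.5, 0.5, 0.2, 0.2)) - 1.0) < 1e-9  # identical boxes
assert _rel_iou((0.2, 0.2, 0.1, 0.1), (0.8, 0.8, 0.1, 0.1)) == 0.0             # disjoint boxes
```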
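Finally, `merge_datasets` reads its class mapping from a DataFrame with `original_class`, `new_name`, `remove`, and `max_images` columns, and takes one `(path, class_names, splits, name)` tuple per source dataset. A hypothetical call, with invented paths and classes (the commented-out call needs real datasets on disk):

```python
import pandas as pd

# Mapping two source classes onto one merged class; "sign" is dropped entirely.
class_map_df = pd.DataFrame([
    {"original_class": "car",   "new_name": "vehicle", "remove": False, "max_images": 500},
    {"original_class": "truck", "new_name": "vehicle", "remove": False, "max_images": 500},
    {"original_class": "sign",  "new_name": "sign",    "remove": True,  "max_images": 0},
])

# One tuple per loaded dataset: (root path, class names, available splits, display name).
dataset_info_list = [
    ("/tmp/ds_a", ["car", "sign"], ["train", "valid"], "ds_a"),
    ("/tmp/ds_b", ["truck"],       ["train"],          "ds_b"),
]

# out_dir = merge_datasets(dataset_info_list, class_map_df)
```

Renaming both `car` and `truck` to `vehicle` collapses them into a single class id in the merged `data.yaml`; rows with `remove=True` are mapped to `__REMOVED__` and filtered out of the copied label files, and `max_images` caps how many selected images each merged class contributes.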