Update app.py
Browse files
app.py
CHANGED
@@ -70,7 +70,7 @@ class QCConfig:
|
|
70 |
weights: str | None
|
71 |
cpu_count: int = CPU_COUNT
|
72 |
batch_size: int = BATCH_SIZE
|
73 |
-
sample_limit:
|
74 |
|
75 |
# ─────────── Helpers & Caching ─────────────────────────────────────────────
|
76 |
def load_yaml(path: Path) -> Dict:
|
@@ -171,12 +171,12 @@ def qc_class_balance(lbls: List[Path], cfg: QCConfig) -> Dict:
|
|
171 |
boxes.append(len(bs))
|
172 |
counts.update(b[0] for b in bs)
|
173 |
if not counts:
|
174 |
-
return {"name":
|
175 |
bal = min(counts.values()) / max(counts.values()) * 100
|
176 |
return {
|
177 |
-
"name":
|
178 |
-
"score":
|
179 |
-
"details":
|
180 |
"class_counts": dict(counts),
|
181 |
"boxes_per_image": {
|
182 |
"min": min(boxes),
|
@@ -188,7 +188,7 @@ def qc_class_balance(lbls: List[Path], cfg: QCConfig) -> Dict:
|
|
188 |
|
189 |
def qc_image_quality(imgs: List[Path], cfg: QCConfig) -> Dict:
|
190 |
if cv2 is None:
|
191 |
-
return {"name":
|
192 |
blurry, dark, bright = [], [], []
|
193 |
sample = imgs[:cfg.sample_limit]
|
194 |
with ThreadPoolExecutor(max_workers=cfg.cpu_count) as ex:
|
@@ -200,9 +200,9 @@ def qc_image_quality(imgs: List[Path], cfg: QCConfig) -> Dict:
|
|
200 |
bad = len({*blurry, *dark, *bright})
|
201 |
score = 100 - bad / max(len(sample), 1) * 100
|
202 |
return {
|
203 |
-
"name":
|
204 |
-
"score":
|
205 |
-
"details":
|
206 |
"blurry": [str(p) for p in blurry],
|
207 |
"dark": [str(p) for p in dark],
|
208 |
"bright": [str(p) for p in bright]
|
@@ -217,12 +217,28 @@ def qc_duplicates(imgs: List[Path], cfg: QCConfig) -> Dict:
|
|
217 |
work_dir=str(TMP_ROOT / "fastdup")
|
218 |
)
|
219 |
fd.run()
|
220 |
-
|
221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
222 |
dup = sum(len(c) - 1 for c in clusters)
|
|
|
223 |
return {
|
224 |
"name": "Duplicates",
|
225 |
-
"score":
|
226 |
"details": {"groups": clusters[:50]}
|
227 |
}
|
228 |
except Exception as e:
|
@@ -236,14 +252,14 @@ def qc_duplicates(imgs: List[Path], cfg: QCConfig) -> Dict:
|
|
236 |
def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
237 |
model = get_model(cfg.weights)
|
238 |
if model is None:
|
239 |
-
return {"name":
|
240 |
ious, mism = [], []
|
241 |
sample = imgs[:cfg.sample_limit]
|
242 |
for i in range(0, len(sample), cfg.batch_size):
|
243 |
-
batch = sample[i:i
|
244 |
results = model.predict(batch, verbose=False, half=True, dynamic=True)
|
245 |
for p, res in zip(batch, results):
|
246 |
-
gt = parse_label_file(Path(p).parent.parent
|
247 |
for cls, x, y, w, h in gt:
|
248 |
best = 0.0
|
249 |
for b, c, conf in zip(
|
@@ -259,14 +275,14 @@ def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
|
259 |
mism.append(str(p))
|
260 |
miou = float(np.mean(ious)) if ious else 1.0
|
261 |
return {
|
262 |
-
"name":
|
263 |
-
"score":
|
264 |
-
"details":
|
265 |
}
|
266 |
|
267 |
def qc_label_issues(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
268 |
if get_noise_indices is None:
|
269 |
-
return {"name":
|
270 |
labels, idxs = [], []
|
271 |
sample = imgs[:cfg.sample_limit]
|
272 |
for i, p in enumerate(sample):
|
@@ -275,35 +291,35 @@ def qc_label_issues(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
|
275 |
labels.append(int(cls))
|
276 |
idxs.append(i)
|
277 |
if not labels:
|
278 |
-
return {"name":
|
279 |
labels_arr = np.array(labels)
|
280 |
uniq = sorted(set(labels_arr))
|
281 |
probs = np.eye(len(uniq))[np.searchsorted(uniq, labels_arr)]
|
282 |
noise = get_noise_indices(labels=labels_arr, probabilities=probs)
|
283 |
flags = sorted({idxs[n] for n in noise})
|
284 |
files = [str(sample[i]) for i in flags]
|
285 |
-
score = 100 - len(flags)
|
286 |
return {
|
287 |
-
"name":
|
288 |
-
"score":
|
289 |
-
"details":
|
290 |
}
|
291 |
|
292 |
def _rel_iou(b1, b2):
|
293 |
x1, y1, w1, h1 = b1
|
294 |
x2, y2, w2, h2 = b2
|
295 |
-
xa1, ya1 = x1
|
296 |
-
xa2, ya2 = x1
|
297 |
-
xb1, yb1 = x2
|
298 |
-
xb2, yb2 = x2
|
299 |
ix1 = max(xa1, xb1); iy1 = max(ya1, yb1)
|
300 |
ix2 = min(xa2, xb2); iy2 = min(ya2, yb2)
|
301 |
-
inter = max(ix2
|
302 |
-
union = w1
|
303 |
-
return inter
|
304 |
|
305 |
def aggregate(results: List[Dict]) -> float:
|
306 |
-
return sum(DEFAULT_W[r["name"]]
|
307 |
|
308 |
RF_RE = re.compile(r"https?://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")
|
309 |
|
@@ -312,7 +328,7 @@ def download_rf_dataset(url: str, rf_api: Roboflow, dest: Path) -> Path:
|
|
312 |
if not m:
|
313 |
raise ValueError(f"Bad RF URL: {url}")
|
314 |
ws, proj, ver = m.groups()
|
315 |
-
ds_dir = dest
|
316 |
if ds_dir.exists():
|
317 |
return ds_dir
|
318 |
pr = rf_api.workspace(ws).project(proj)
|
@@ -332,9 +348,9 @@ def run_quality(
|
|
332 |
qc_integrity(imgs, lbls, cfg),
|
333 |
qc_class_balance(lbls, cfg),
|
334 |
qc_image_quality(imgs, cfg),
|
335 |
-
qc_duplicates(imgs, cfg) if run_dup else {"name":
|
336 |
-
qc_model_qa(imgs, lbls, cfg) if run_modelqa else {"name":
|
337 |
-
qc_label_issues(imgs, lbls, cfg) if run_modelqa else {"name":
|
338 |
]
|
339 |
final = aggregate(results)
|
340 |
|
@@ -398,9 +414,9 @@ with gr.Blocks(title="YOLO Dataset Quality Evaluator v3") as demo:
|
|
398 |
if not line.strip():
|
399 |
continue
|
400 |
try:
|
401 |
-
ds
|
402 |
-
|
403 |
-
None,
|
404 |
Path(weights.name) if weights else None,
|
405 |
cfg, run_dup, run_modelqa
|
406 |
)
|
|
|
70 |
weights: str | None
|
71 |
cpu_count: int = CPU_COUNT
|
72 |
batch_size: int = BATCH_SIZE
|
73 |
+
sample_limit:int = SAMPLE_LIMIT
|
74 |
|
75 |
# ─────────── Helpers & Caching ─────────────────────────────────────────────
|
76 |
def load_yaml(path: Path) -> Dict:
|
|
|
171 |
boxes.append(len(bs))
|
172 |
counts.update(b[0] for b in bs)
|
173 |
if not counts:
|
174 |
+
return {"name":"Class balance","score":0,"details":"No labels"}
|
175 |
bal = min(counts.values()) / max(counts.values()) * 100
|
176 |
return {
|
177 |
+
"name":"Class balance",
|
178 |
+
"score":bal,
|
179 |
+
"details":{
|
180 |
"class_counts": dict(counts),
|
181 |
"boxes_per_image": {
|
182 |
"min": min(boxes),
|
|
|
188 |
|
189 |
def qc_image_quality(imgs: List[Path], cfg: QCConfig) -> Dict:
|
190 |
if cv2 is None:
|
191 |
+
return {"name":"Image quality","score":100,"details":"cv2 missing"}
|
192 |
blurry, dark, bright = [], [], []
|
193 |
sample = imgs[:cfg.sample_limit]
|
194 |
with ThreadPoolExecutor(max_workers=cfg.cpu_count) as ex:
|
|
|
200 |
bad = len({*blurry, *dark, *bright})
|
201 |
score = 100 - bad / max(len(sample), 1) * 100
|
202 |
return {
|
203 |
+
"name":"Image quality",
|
204 |
+
"score":score,
|
205 |
+
"details":{
|
206 |
"blurry": [str(p) for p in blurry],
|
207 |
"dark": [str(p) for p in dark],
|
208 |
"bright": [str(p) for p in bright]
|
|
|
217 |
work_dir=str(TMP_ROOT / "fastdup")
|
218 |
)
|
219 |
fd.run()
|
220 |
+
|
221 |
+
# Try the grouped-DataFrame API first:
|
222 |
+
try:
|
223 |
+
cc = fd.connected_components_grouped(sort_by="comp_size", ascending=False)
|
224 |
+
if "files" in cc.columns:
|
225 |
+
clusters = cc["files"].tolist()
|
226 |
+
else:
|
227 |
+
# fallback: group by component ID, collect filenames
|
228 |
+
clusters = (
|
229 |
+
cc.groupby("component")["filename"]
|
230 |
+
.apply(list)
|
231 |
+
.tolist()
|
232 |
+
)
|
233 |
+
except Exception:
|
234 |
+
# final fallback to the old list-based API
|
235 |
+
clusters = fd.connected_components()
|
236 |
+
|
237 |
dup = sum(len(c) - 1 for c in clusters)
|
238 |
+
score = max(0.0, 100 - dup / len(imgs) * 100)
|
239 |
return {
|
240 |
"name": "Duplicates",
|
241 |
+
"score": score,
|
242 |
"details": {"groups": clusters[:50]}
|
243 |
}
|
244 |
except Exception as e:
|
|
|
252 |
def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
253 |
model = get_model(cfg.weights)
|
254 |
if model is None:
|
255 |
+
return {"name":"Model QA","score":100,"details":"skipped"}
|
256 |
ious, mism = [], []
|
257 |
sample = imgs[:cfg.sample_limit]
|
258 |
for i in range(0, len(sample), cfg.batch_size):
|
259 |
+
batch = sample[i:i+cfg.batch_size]
|
260 |
results = model.predict(batch, verbose=False, half=True, dynamic=True)
|
261 |
for p, res in zip(batch, results):
|
262 |
+
gt = parse_label_file(Path(p).parent.parent/'labels'/f"{Path(p).stem}.txt")
|
263 |
for cls, x, y, w, h in gt:
|
264 |
best = 0.0
|
265 |
for b, c, conf in zip(
|
|
|
275 |
mism.append(str(p))
|
276 |
miou = float(np.mean(ious)) if ious else 1.0
|
277 |
return {
|
278 |
+
"name":"Model QA",
|
279 |
+
"score":miou*100,
|
280 |
+
"details":{"mean_iou":miou, "mismatches":mism[:50]}
|
281 |
}
|
282 |
|
283 |
def qc_label_issues(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
284 |
if get_noise_indices is None:
|
285 |
+
return {"name":"Label issues","score":100,"details":"skipped"}
|
286 |
labels, idxs = [], []
|
287 |
sample = imgs[:cfg.sample_limit]
|
288 |
for i, p in enumerate(sample):
|
|
|
291 |
labels.append(int(cls))
|
292 |
idxs.append(i)
|
293 |
if not labels:
|
294 |
+
return {"name":"Label issues","score":100,"details":"no GT"}
|
295 |
labels_arr = np.array(labels)
|
296 |
uniq = sorted(set(labels_arr))
|
297 |
probs = np.eye(len(uniq))[np.searchsorted(uniq, labels_arr)]
|
298 |
noise = get_noise_indices(labels=labels_arr, probabilities=probs)
|
299 |
flags = sorted({idxs[n] for n in noise})
|
300 |
files = [str(sample[i]) for i in flags]
|
301 |
+
score = 100 - len(flags)/len(labels)*100
|
302 |
return {
|
303 |
+
"name":"Label issues",
|
304 |
+
"score":score,
|
305 |
+
"details":{"files":files[:50]}
|
306 |
}
|
307 |
|
308 |
def _rel_iou(b1, b2):
|
309 |
x1, y1, w1, h1 = b1
|
310 |
x2, y2, w2, h2 = b2
|
311 |
+
xa1, ya1 = x1-w1/2, y1-h1/2
|
312 |
+
xa2, ya2 = x1+w1/2, y1+h1/2
|
313 |
+
xb1, yb1 = x2-w2/2, y2-h2/2
|
314 |
+
xb2, yb2 = x2+w2/2, y2+h2/2
|
315 |
ix1 = max(xa1, xb1); iy1 = max(ya1, yb1)
|
316 |
ix2 = min(xa2, xb2); iy2 = min(ya2, yb2)
|
317 |
+
inter = max(ix2-ix1, 0) * max(iy2-iy1, 0)
|
318 |
+
union = w1*h1 + w2*h2 - inter
|
319 |
+
return inter/union if union else 0.0
|
320 |
|
321 |
def aggregate(results: List[Dict]) -> float:
|
322 |
+
return sum(DEFAULT_W[r["name"]]*r["score"] for r in results)
|
323 |
|
324 |
RF_RE = re.compile(r"https?://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")
|
325 |
|
|
|
328 |
if not m:
|
329 |
raise ValueError(f"Bad RF URL: {url}")
|
330 |
ws, proj, ver = m.groups()
|
331 |
+
ds_dir = dest/f"{ws}_{proj}_v{ver}"
|
332 |
if ds_dir.exists():
|
333 |
return ds_dir
|
334 |
pr = rf_api.workspace(ws).project(proj)
|
|
|
348 |
qc_integrity(imgs, lbls, cfg),
|
349 |
qc_class_balance(lbls, cfg),
|
350 |
qc_image_quality(imgs, cfg),
|
351 |
+
qc_duplicates(imgs, cfg) if run_dup else {"name":"Duplicates","score":100,"details":"skipped"},
|
352 |
+
qc_model_qa(imgs, lbls, cfg) if run_modelqa else {"name":"Model QA","score":100,"details":"skipped"},
|
353 |
+
qc_label_issues(imgs, lbls, cfg) if run_modelqa else {"name":"Label issues","score":100,"details":"skipped"},
|
354 |
]
|
355 |
final = aggregate(results)
|
356 |
|
|
|
414 |
if not line.strip():
|
415 |
continue
|
416 |
try:
|
417 |
+
ds = download_rf_dataset(line, rf, TMP_ROOT)
|
418 |
+
md, df = run_quality(
|
419 |
+
ds, None,
|
420 |
Path(weights.name) if weights else None,
|
421 |
cfg, run_dup, run_modelqa
|
422 |
)
|