Update app.py
app.py
CHANGED
@@ -1,15 +1,14 @@
-"""
-app.py - Roboflow-aware YOLOv8 Dataset Quality Evaluator
-
-Changelog (2025-04-17)
-──────────────────────
-• Tunable `CPU_COUNT` + env-var guard for HF Spaces quota.
-"""
 
 from __future__ import annotations
@@ -33,62 +32,65 @@ import yaml
 from PIL import Image
 from tqdm import tqdm
 
 try:
-    import cv2
 except ImportError:
     cv2 = None
 try:
-    import imagehash
 except ImportError:
     imagehash = None
 try:
-    import fastdup
 except ImportError:
     fastdup = None
 try:
-    from ultralytics import YOLO
 except ImportError:
-    YOLO = None
 try:
-    from roboflow import Roboflow
 except ImportError:
-    Roboflow = None
 
 TMP_ROOT = Path(tempfile.gettempdir()) / "rf_datasets"
 TMP_ROOT.mkdir(parents=True, exist_ok=True)
 
-# Limit CPU workers on HF Spaces (feel free to raise locally)
 CPU_COUNT = int(os.getenv("QC_CPU", max(1, (os.cpu_count() or 4) // 2)))
 
 DEFAULT_W = {
-    "Integrity":
-    "Class balance":
-    "Image quality":
-    "Duplicates":
-    "Model QA":
 }
 
-class DuplicateGroup:
-    hash_val: str
-    paths: List[Path]
 
-# ───────────────────────────────── Generic helpers ─────
 def load_yaml(path: Path) -> Dict:
-    with path.open(encoding="utf-8") as f:
         return yaml.safe_load(f)
 
 def parse_label_file(path: Path) -> list[tuple[int, float, float, float, float]]:
-    if not path.exists() or path.stat().st_size == 0:
         return []
     try:
         arr = np.loadtxt(path, dtype=float)
@@ -98,364 +100,285 @@ def parse_label_file(path: Path) -> list[tuple[int, float, float, float, float]]
     except Exception:
         return []
 
 def guess_image_dirs(root: Path) -> List[Path]:
-    subs = [
-        root / "train" / "images",
-        root / "valid" / "images",
-        root / "val" / "images",
-        root / "test" / "images",
-    ]
     return [d for d in subs if d.exists()]
 
-def gather_dataset(root: Path, yaml_path: Path | None = None):
     if yaml_path is None:
-        yamls = list(root.glob("*.yaml"))
         if not yamls:
             raise FileNotFoundError("Dataset YAML not found")
         yaml_path = yamls[0]
     meta = load_yaml(yaml_path)
     img_dirs = guess_image_dirs(root)
     if not img_dirs:
-        raise FileNotFoundError("images/ directory missing")
     return imgs, lbls, meta
 
 def _is_corrupt(path: Path) -> bool:
     try:
         with Image.open(path) as im:
             im.verify()
         return False
-    except Exception:
         return True
 
-        "details": {
-            "missing_label_files": [str(p) for p in miss_lbl],
-            "corrupt_images": [str(p) for p in corrupt],
-        },
-    }
 
-# Class balance -------------------------------------------------
-def qc_class_balance(lbls: List[Path]):
-    cls_counts = Counter()
-    boxes_per_img = []
     for l in lbls:
 
-# Image quality -------------------------------------------------
-def _quality_stat(path: Path, blur_thr: float):
-    im = cv2.imread(str(path)) if cv2 else None
-    if im is None:
-        return path, False, False, False
-    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
-    lap = cv2.Laplacian(gray, cv2.CV_64F).var()
-    br = gray.mean()
-    return path, lap < blur_thr, br < 25, br > 230
 
-def qc_image_quality(imgs: List[Path], blur_thr: float = 100.0):
     if cv2 is None:
-        if is_bright:
-            bright.append(p)
-    bad = len(set(blurry + dark + bright))
-    score = 100 - bad / max(len(imgs), 1) * 100
-    return {
-        "name": "Image quality",
-        "score": score,
-        "details": {
-            "blurry": [str(p) for p in blurry],
-            "dark": [str(p) for p in dark],
-            "bright": [str(p) for p in bright],
-        },
-    }
 
-# Duplicate images ---------------------------------------------
-def qc_duplicates(imgs: List[Path]):
-    # Fast path - use fastdup if installed & enough images
-    if fastdup is not None and len(imgs) > 50:
         try:
-            "score": score,
-            "details": {"groups": clusters[:50]},
-        }
-        except Exception:
-            pass  # fallback to hash
     if imagehash is None:
-    hashes: Dict[str, List[Path]] = defaultdict(list)
-    with ProcessPoolExecutor(max_workers=CPU_COUNT) as ex:
-        for h, p in tqdm(
-            zip(ex.map(_hash, imgs), imgs),
-            total=len(imgs),
-            desc="hashing",
-            leave=False,
-        ):
             hashes[h].append(p)
 
-    inter = max(ix2 - ix1, 0) * max(iy2 - iy1, 0)
-    union = w1 * h1 + w2 * h2 - inter
-    return inter / union if union else 0.0
 
-def qc_model_qa(imgs: List[Path], weights: str | None, lbls: List[Path], iou_thr: float = 0.5):
-    if weights is None or YOLO is None:
-        return {"name": "Model QA", "score": 100, "details": "skipped (no weights)"}
-    model = YOLO(weights)
     ious, mism = [], []
         ious.append(best)
-        if best < iou_thr:
             mism.append(str(p))
 
 def aggregate(scores):
 
-# ───────────────────────────────── Roboflow helpers ────
 RF_RE = re.compile(r"https://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")
 
-    imgs, lbls, meta = gather_dataset(root, yaml_override)
-    res = [
-        qc_integrity(imgs, lbls),
-        qc_class_balance(lbls),
-        qc_image_quality(imgs),
-        qc_duplicates(imgs),
-        qc_model_qa(imgs, str(weights) if weights else None, lbls),
     ]
-    md = [f"## **{meta.get('name', root.name)}** - Score {final:.1f}/100"]
     for r in res:
         md.append("```\n</details>\n")
-    df = pd.DataFrame.from_dict(cls_counts, orient="index", columns=["count"])
-    df.index.name = "class"
-    return md_str, df
 
-# ───────────────────────────────── Gradio UI ───────────
-def evaluate(
-    api_key: str,
-    url_txt: gr.File | None,
-    zip_file: gr.File | None,
-    server_path: str,
-    yaml_file: gr.File | None,
-    weights: gr.File | None,
-):
-    if not any([url_txt, zip_file, server_path]):
-        return "Upload a .txt of URLs or dataset ZIP/path", pd.DataFrame()
-    reports, dfs = [], []
-    # Roboflow batch ------------------------------------------
-    if url_txt:
-        if Roboflow is None:
-            return "`roboflow` not installed", pd.DataFrame()
-        if not api_key:
-            return "Enter Roboflow API key", pd.DataFrame()
-        rf = Roboflow(api_key=api_key.strip())
-        for line in Path(url_txt.name).read_text().splitlines():
-            if not line.strip():
-                continue
-            try:
-                ds_root = download_rf_dataset(line, rf, TMP_ROOT)
-                md, df = run_quality(ds_root, None, Path(weights.name) if weights else None)
-                reports.append(md)
-                dfs.append(df)
-            except Exception as e:
-                reports.append(f"### {line}\n\n⚠️ {e}")
-    # Manual ZIP ----------------------------------------------
-    if zip_file:
-        tmp_dir = Path(tempfile.mkdtemp())
-        shutil.unpack_archive(zip_file.name, tmp_dir)
-        md, df = run_quality(tmp_dir, Path(yaml_file.name) if yaml_file else None, Path(weights.name) if weights else None)
-        reports.append(md)
-        dfs.append(df)
-        shutil.rmtree(tmp_dir, ignore_errors=True)
-    # Manual path ---------------------------------------------
-    if server_path:
-        md, df = run_quality(Path(server_path), Path(yaml_file.name) if yaml_file else None, Path(weights.name) if weights else None)
-        reports.append(md)
-        dfs.append(df)
-    summary_md = "\n\n---\n\n".join(reports)
-    combined_df = pd.concat(dfs).groupby(level=0).sum() if dfs else pd.DataFrame()
-    return summary_md, combined_df
 
-# ───────────────────────────────── Launch ──────────────
-with gr.Blocks(title="YOLO Dataset Quality Evaluator") as demo:
-    gr.Markdown(
-        """
-        # YOLOv8 Dataset Quality Evaluator
-
-        ### Roboflow batch
-        1. Paste your **Roboflow API key**
-        2. Upload a **.txt** file - one `https://universe.roboflow.com/.../dataset/x` per line
-
-        ### Manual
-        * Upload a dataset **ZIP** or type a dataset **path** on the server
-        * Optionally supply a custom **data.yaml** and/or a **YOLO .pt** weights file for model-assisted QA
-        """
     )
 
     with gr.Row():
     with gr.Row():
         weights_in = gr.File(label="YOLO weights (.pt)")
     run_btn = gr.Button("Evaluate")
+"""
+app.py - Roboflow-aware YOLOv8 Dataset Quality Evaluator (v3)
+
+Changelog (2025-04-17)
+──────────────────────
+• **YOLO model caching** to avoid repeated loads
+• **Exposed thresholds** (blur, IOU, confidence) as Gradio sliders
+• **Config dataclass** for unified settings
+• **Cleanlab integration** for label-issue detection
+• **Parallel label parsing** and expanded caching
+• **Adjusted QC weights** to include Label Issues stage
 """
 
 from __future__ import annotations
 
 from PIL import Image
 from tqdm import tqdm
+from functools import partial  # ProcessPoolExecutor cannot pickle lambdas
 
+# Optional heavy deps
 try:
+    import cv2
 except ImportError:
     cv2 = None
 try:
+    import imagehash
 except ImportError:
     imagehash = None
 try:
+    import fastdup
 except ImportError:
     fastdup = None
 try:
+    from ultralytics import YOLO
 except ImportError:
+    YOLO = None
 try:
+    from roboflow import Roboflow
 except ImportError:
+    Roboflow = None
+try:
+    from cleanlab.pruning import get_noise_indices  # cleanlab 1.x API
+except ImportError:
+    get_noise_indices = None
 
+# ───────────────── Config & Constants ─────────────────
 TMP_ROOT = Path(tempfile.gettempdir()) / "rf_datasets"
 TMP_ROOT.mkdir(parents=True, exist_ok=True)
 CPU_COUNT = int(os.getenv("QC_CPU", max(1, (os.cpu_count() or 4) // 2)))
+BATCH_SIZE = int(os.getenv("QC_BATCH", 16))
 
 DEFAULT_W = {
+    "Integrity":     0.25,
+    "Class balance": 0.10,
+    "Image quality": 0.15,
+    "Duplicates":    0.10,
+    "Model QA":      0.30,
+    "Label issues":  0.10,
 }
 
+_model_cache: dict[str, YOLO] = {}
 
+@dataclass
+class QCConfig:
+    blur_thr: float
+    iou_thr: float
+    conf_thr: float
+    weights: str | None
+    cpu_count: int = CPU_COUNT
+    batch_size: int = BATCH_SIZE
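+    # Illustrative construction (values mirror the slider defaults in the UI
+    # below; the weights path is a placeholder):
+    #   cfg = QCConfig(blur_thr=100.0, iou_thr=0.5, conf_thr=0.25, weights="model.pt")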
+
+# ─────────────────── Helpers & Caching ───────────────────
 def load_yaml(path: Path) -> Dict:
+    with path.open('r', encoding='utf-8') as f:
         return yaml.safe_load(f)
 
 def parse_label_file(path: Path) -> list[tuple[int, float, float, float, float]]:
+    if not path or not path.exists() or path.stat().st_size == 0:
         return []
     try:
         arr = np.loadtxt(path, dtype=float)
 
     except Exception:
         return []
 
 def guess_image_dirs(root: Path) -> List[Path]:
+    subs = [root / 'images', root / 'train' / 'images', root / 'valid' / 'images',
+            root / 'val' / 'images', root / 'test' / 'images']
     return [d for d in subs if d.exists()]
 
+def gather_dataset(root: Path, yaml_path: Path | None):
     if yaml_path is None:
+        yamls = list(root.glob('*.yaml'))
         if not yamls:
             raise FileNotFoundError("Dataset YAML not found")
         yaml_path = yamls[0]
     meta = load_yaml(yaml_path)
     img_dirs = guess_image_dirs(root)
     if not img_dirs:
+        raise FileNotFoundError("images/ directory missing")
+    imgs = [p for d in img_dirs for p in d.rglob('*.*') if imghdr.what(p)]
+    labels_root = {d.parent / 'labels' for d in img_dirs}
+    lbls = [next((lr / f"{p.stem}.txt" for lr in labels_root if (lr / f"{p.stem}.txt").exists()), None)
+            for p in imgs]
     return imgs, lbls, meta
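+# For reference, this expects the standard YOLO export layout:
+#   <root>/<split>/images/foo.jpg  paired with  <root>/<split>/labels/foo.txt,
+#   each label line being "cls x_center y_center width height" (normalised 0-1)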
 
+# YOLO model caching
+def get_model(weights: str) -> YOLO | None:
+    if weights is None or YOLO is None:
+        return None
+    if weights not in _model_cache:
+        _model_cache[weights] = YOLO(weights)
+    return _model_cache[weights]
 
+# ───────────────────── Quality Checks ─────────────────────
 
 def _is_corrupt(path: Path) -> bool:
     try:
         with Image.open(path) as im:
             im.verify()
         return False
+    except Exception:
         return True
 
+def qc_integrity(imgs: List[Path], lbls: List[Path], cfg: QCConfig):
+    miss = [i for i, l in zip(imgs, lbls) if l is None]
+    corrupt = []
+    with ProcessPoolExecutor(max_workers=cfg.cpu_count) as ex:
+        fut = {ex.submit(_is_corrupt, p): p for p in imgs}
+        for f in as_completed(fut):
+            if f.result():
+                corrupt.append(fut[f])
+    score = 100 - (len(miss) + len(corrupt)) / max(len(imgs), 1) * 100
+    return {"name": "Integrity", "score": max(score, 0),
+            "details": {"missing_label_files": [str(p) for p in miss],
+                        "corrupt_images": [str(p) for p in corrupt]}}
 
+def qc_class_balance(lbls: List[Path], cfg: QCConfig):
+    counts = Counter()
+    boxes = []
     for l in lbls:
+        bs = parse_label_file(l) if l else []
+        boxes.append(len(bs))
+        counts.update(b[0] for b in bs)
+    if not counts:
+        return {"name": "Class balance", "score": 0, "details": "No labels"}
+    bal = (min(counts.values()) / max(counts.values())) * 100
+    return {"name": "Class balance", "score": bal,
+            "details": {"class_counts": dict(counts),
+                        "boxes_per_image": {"min": int(np.min(boxes)),
+                                            "max": int(np.max(boxes)),
+                                            "mean": float(np.mean(boxes))}}}
+
+def _quality_stat(path: Path, blur_thr: float):
+    if cv2 is None:
+        return path, False, False, False
+    im = cv2.imread(str(path))
+    if im is None:  # unreadable file: report no flags rather than crash
+        return path, False, False, False
+    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
+    lap = cv2.Laplacian(gray, cv2.CV_64F).var()  # variance of Laplacian ≈ sharpness
+    br = gray.mean()
+    return path, lap < blur_thr, br < 25, br > 230
+
+def qc_image_quality(imgs: List[Path], cfg: QCConfig):
     if cv2 is None:
+        return {"name": "Image quality", "score": 100, "details": "cv2 missing"}
+    blurry, dark, bright = [], [], []
+    with ProcessPoolExecutor(max_workers=cfg.cpu_count) as ex:
+        # partial() instead of a lambda: worker callables must be picklable
+        stat = partial(_quality_stat, blur_thr=cfg.blur_thr)
+        for p, isb, isd, isB in tqdm(ex.map(stat, imgs),
+                                     total=len(imgs), desc='img-quality', leave=False):
+            if isb: blurry.append(p)
+            if isd: dark.append(p)
+            if isB: bright.append(p)
+    bad = len({*blurry, *dark, *bright})
+    score = 100 - bad / max(len(imgs), 1) * 100
+    return {"name": "Image quality", "score": score,
+            "details": {"blurry": [str(p) for p in blurry],
+                        "dark": [str(p) for p in dark],
+                        "bright": [str(p) for p in bright]}}
+
+def _hash(p: Path) -> str:
+    # module-level helper so ProcessPoolExecutor can pickle it
+    return str(imagehash.average_hash(Image.open(p)))
+
+def qc_duplicates(imgs: List[Path], cfg: QCConfig):
+    # fast path: fastdup, if installed and the dataset is big enough
+    if fastdup is not None and len(imgs) > 50:
         try:
+            fd = fastdup.create(input_dir=str(Path(imgs[0]).parent.parent),
+                                work_dir=str(TMP_ROOT / 'fastdup'))
+            fd.run()
+            clusters = fd.get_clusters()
+            dup = sum(len(c) - 1 for c in clusters)
+            return {"name": "Duplicates", "score": 100 - dup / max(len(imgs), 1) * 100,
+                    "details": {"groups": clusters[:50]}}
+        except Exception:
+            pass  # fall back to perceptual hashing
     if imagehash is None:
+        return {"name": "Duplicates", "score": 100, "details": "deps missing"}
+    hashes = defaultdict(list)
+    with ProcessPoolExecutor(max_workers=cfg.cpu_count) as ex:
+        for h, p in zip(ex.map(_hash, imgs), imgs):
             hashes[h].append(p)
+    groups = [g for g in hashes.values() if len(g) > 1]
+    dup = sum(len(g) - 1 for g in groups)
+    return {"name": "Duplicates", "score": 100 - dup / max(len(imgs), 1) * 100,
+            "details": {"groups": [[str(p) for p in g] for g in groups[:50]]}}
+
+def _rel_iou(b1, b2):
+    # IoU of two (x_c, y_c, w, h) boxes in relative coordinates
+    x1, y1, w1, h1 = b1
+    x2, y2, w2, h2 = b2
+    xa1, ya1, xa2, ya2 = x1 - w1/2, y1 - h1/2, x1 + w1/2, y1 + h1/2
+    xb1, yb1, xb2, yb2 = x2 - w2/2, y2 - h2/2, x2 + w2/2, y2 + h2/2
+    ix1, iy1, ix2, iy2 = max(xa1, xb1), max(ya1, yb1), min(xa2, xb2), min(ya2, yb2)
+    inter = max(ix2 - ix1, 0) * max(iy2 - iy1, 0)
+    union = w1 * h1 + w2 * h2 - inter
+    return inter / union if union else 0.0
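+# e.g. boxes (0.5, 0.5, 0.2, 0.2) and (0.6, 0.5, 0.2, 0.2):
+# inter = 0.1 * 0.2 = 0.02, union = 0.04 + 0.04 - 0.02 = 0.06 → IoU ≈ 0.33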
+
+def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig):
+    model = get_model(cfg.weights)
+    if model is None:
+        return {"name": "Model QA", "score": 100, "details": "skipped"}
     ious, mism = [], []
+    for i in range(0, len(imgs), cfg.batch_size):
+        batch = imgs[i:i + cfg.batch_size]
+        results = model.predict(batch, verbose=False)
+        for p, res in zip(batch, results):
+            gt = parse_label_file(lbls[imgs.index(p)])
+            if not gt:
+                continue
+            preds = res.boxes.xywhn.cpu().numpy()  # normalised xywh, same frame as the labels
+            confs = res.boxes.conf.cpu().numpy()
+            classes = res.boxes.cls.cpu().numpy()
+            mask = confs >= cfg.conf_thr
+            preds, classes = preds[mask], classes[mask]
+            for cls, x, y, w, h in gt:
+                best = 0.0
+                for b, c in zip(preds, classes):
+                    if int(c) != cls:
+                        continue
+                    best = max(best, _rel_iou((x, y, w, h), tuple(b)))
                 ious.append(best)
+                if best < cfg.iou_thr:
                     mism.append(str(p))
+    miou = float(np.mean(ious)) if ious else 1.0
+    return {"name": "Model QA", "score": miou * 100,
+            "details": {"mean_iou": miou, "mismatches": mism[:50]}}
 
+def qc_label_issues(imgs: List[Path], lbls: List[Path], cfg: QCConfig):
+    if get_noise_indices is None or cfg.weights is None:
+        return {"name": "Label issues", "score": 100, "details": "skipped"}
+    model = get_model(cfg.weights)
+    if model is None:
+        return {"name": "Label issues", "score": 100, "details": "skipped"}
+    labels, preds, samps = [], [], []
+    for i in range(0, len(imgs), cfg.batch_size):
+        batch = imgs[i:i + cfg.batch_size]
+        results = model.predict(batch, verbose=False)
+        for p, res in zip(batch, results):
+            gt = parse_label_file(lbls[imgs.index(p)])
+            for cls, x, y, w, h in gt:
+                labels.append(int(cls))
+                # predicted class of the box with highest IoU against this GT box
+                best_i, best_c = 0.0, -1
+                for b, c in zip(res.boxes.xywhn.cpu().numpy(), res.boxes.cls.cpu().numpy()):
+                    iou = _rel_iou((x, y, w, h), tuple(b))
+                    if iou > best_i:
+                        best_i, best_c = iou, int(c)
+                preds.append(best_c)
+                samps.append(p)
+    if not labels:
+        return {"name": "Label issues", "score": 100, "details": "no GT"}
+    labels_arr, preds_arr = np.array(labels), np.array(preds)
+    # cleanlab 1.x expects per-class probabilities, not hard predictions; use a
+    # smoothed one-hot approximation (uniform row when nothing matched, pred == -1)
+    n_cls = int(max(labels_arr.max(), preds_arr.max())) + 1
+    psx = np.full((len(labels_arr), n_cls), 1.0 / n_cls)
+    hit = preds_arr >= 0
+    psx[hit] = np.eye(n_cls)[preds_arr[hit]] * 0.9 + 0.1 / n_cls
+    noise_idx = np.where(get_noise_indices(labels_arr, psx))[0]
+    sus = list({str(samps[i]) for i in noise_idx})[:50]
+    score = 100 - len(noise_idx) / len(labels) * 100
+    return {"name": "Label issues", "score": score,
+            "details": {"suspect_images": sus}}
+
+# ─────────────────────── Aggregate & Run ───────────────────────
 def aggregate(scores):
+    return sum(DEFAULT_W.get(r['name'], 0) * r['score'] for r in scores)
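+# e.g. stage scores 90/80/100/100/70/100 (in DEFAULT_W order) aggregate to
+# 0.25*90 + 0.10*80 + 0.15*100 + 0.10*100 + 0.30*70 + 0.10*100 = 86.5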
 
 RF_RE = re.compile(r"https://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")
+
+def download_rf_dataset(url: str, rf_api: Roboflow, dest: Path) -> Path:
+    m = RF_RE.match(url.strip())
+    if not m:
+        raise ValueError(f"Bad RF URL: {url}")
+    ws, proj, ver = m.groups()
+    ds = dest / f"{ws}_{proj}_v{ver}"
+    if ds.exists():
+        return ds
+    proj_obj = rf_api.workspace(ws).project(proj)
+    proj_obj.version(int(ver)).download('yolov8', location=str(ds))
+    return ds
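+# e.g. a URL of the form https://universe.roboflow.com/<ws>/<proj>/dataset/<ver>
+# (placeholders, per RF_RE above) is cached under TMP_ROOT as "<ws>_<proj>_v<ver>"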
 
+def run_quality(root: Path, yaml_override: Path | None, lbls: List[Path], imgs: List[Path], cfg: QCConfig):
+    res = [
+        qc_integrity(imgs, lbls, cfg),
+        qc_class_balance(lbls, cfg),
+        qc_image_quality(imgs, cfg),
+        qc_duplicates(imgs, cfg),
+        qc_model_qa(imgs, lbls, cfg),
+        qc_label_issues(imgs, lbls, cfg),
     ]
+    final = aggregate(res)
+    md = [f"## **{root.name}** - Score {final:.1f}/100"]
     for r in res:
+        md.append(f"### {r['name']} {r['score']:.1f}")
+        md.append("<details><summary>details</summary>\n```json")
+        md.append(json.dumps(r['details'], indent=2))
         md.append("```\n</details>\n")
+    cb = next(r for r in res if r['name'] == 'Class balance')['details']
+    counts = cb.get('class_counts', {}) if isinstance(cb, dict) else {}  # guard: details may be "No labels"
+    df = pd.DataFrame.from_dict(counts, orient='index', columns=['count'])
+    df.index.name = 'class'
+    return "\n".join(md), df
 
+# ─────────────────────── Gradio UI ───────────────────────
+with gr.Blocks(title="YOLO Dataset Quality Evaluator v3") as demo:
+    gr.Markdown("""
+    # YOLOv8 Dataset Quality Evaluator v3
+
+    * Tweaks: blur, IOU & confidence sliders; Cleanlab label issues; model caching
+    """)
     with gr.Row():
+        api_in = gr.Textbox(label="Roboflow API key", type="password")
+        url_txt = gr.File(label=".txt of RF dataset URLs", file_types=['.txt'])
+    with gr.Row():
+        zip_in = gr.File(label="Dataset ZIP")
+        path_in = gr.Textbox(label="Server path")
     with gr.Row():
+        yaml_in = gr.File(label="Custom YAML", file_types=['.yaml'])
         weights_in = gr.File(label="YOLO weights (.pt)")
+    with gr.Row():
+        blur_sl = gr.Slider(0, 500, value=100, label="Blur threshold")
+        iou_sl = gr.Slider(0.0, 1.0, value=0.5, label="IOU threshold")
+        conf_sl = gr.Slider(0.0, 1.0, value=0.25, label="Min detection confidence")
     run_btn = gr.Button("Evaluate")
+    out_md = gr.Markdown()
+    out_df = gr.Dataframe()
+
+    def evaluate(api_key, url_txt, zip_file, server_path, yaml_file, weights,
+                 blur_thr, iou_thr, conf_thr):
+        if not any([url_txt, zip_file, server_path]):
+            return "Upload a .txt of URLs or a dataset ZIP/path", pd.DataFrame()
+        reports, dfs = [], []
+        cfg = QCConfig(blur_thr, iou_thr, conf_thr,
+                       weights.name if weights else None)
+        rf = Roboflow(api_key.strip()) if api_key and Roboflow else None
+        # Roboflow batch
+        if url_txt:
+            if rf is None:
+                return "Enter a Roboflow API key (and install `roboflow`)", pd.DataFrame()
+            for line in Path(url_txt.name).read_text().splitlines():
+                if not line.strip():
+                    continue
+                try:
+                    ds = download_rf_dataset(line, rf, TMP_ROOT)
+                    imgs, lbls, _ = gather_dataset(ds, None)
+                    md, df = run_quality(ds, None, lbls, imgs, cfg)
+                    reports.append(md)
+                    dfs.append(df)
+                except Exception as e:
+                    reports.append(f"### {line}\n⚠️ {e}")
+        # Manual ZIP
+        if zip_file:
+            tmp = Path(tempfile.mkdtemp())
+            shutil.unpack_archive(zip_file.name, tmp)
+            imgs, lbls, _ = gather_dataset(tmp, Path(yaml_file.name) if yaml_file else None)
+            md, df = run_quality(tmp, None, lbls, imgs, cfg)
+            reports.append(md)
+            dfs.append(df)
+            shutil.rmtree(tmp, ignore_errors=True)
+        # Manual server path
+        if server_path:
+            ds = Path(server_path)
+            imgs, lbls, _ = gather_dataset(ds, Path(yaml_file.name) if yaml_file else None)
+            md, df = run_quality(ds, None, lbls, imgs, cfg)
+            reports.append(md)
+            dfs.append(df)
+        summary = "\n\n---\n\n".join(reports)
+        combined = pd.concat(dfs).groupby(level=0).sum() if dfs else pd.DataFrame()
+        return summary, combined
+
+    run_btn.click(evaluate,
+                  inputs=[api_in, url_txt, zip_in, path_in, yaml_in, weights_in,
+                          blur_sl, iou_sl, conf_sl],
+                  outputs=[out_md, out_df])
+
+if __name__ == '__main__':
+    demo.launch(server_name='0.0.0.0', server_port=int(os.getenv('PORT', 7860)))
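+# Runtime knobs, for reference: QC_CPU caps worker processes, QC_BATCH sets the
+# model-inference batch size, PORT overrides the default 7860 (see constants above)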