Update app.py
Browse files
app.py
CHANGED
@@ -4,11 +4,11 @@ app.py — Roboflow-aware YOLOv8 Dataset Quality Evaluator (v3)
|
|
4 |
Changelog (2025-04-17)
|
5 |
──────────────────────
|
6 |
• Fix RF URL regex to accept http/https
|
7 |
-
β’
|
8 |
-
β’
|
9 |
• YOLO model caching
|
10 |
• Config dataclass & Gradio sliders for blur, IOU, confidence
|
11 |
-
β’ Cleanlab integration for label
|
12 |
"""
|
13 |
|
14 |
from __future__ import annotations
|
@@ -97,13 +97,13 @@ def parse_label_file(path: Path) -> list[tuple[int, float, float, float, float]]
|
|
97 |
if arr.ndim == 1:
|
98 |
arr = arr.reshape(1, -1)
|
99 |
return [tuple(row) for row in arr]
|
100 |
-
except
|
101 |
return []
|
102 |
|
103 |
def guess_image_dirs(root: Path) -> List[Path]:
|
104 |
-
|
105 |
-
|
106 |
-
return [d for d in
|
107 |
|
108 |
def gather_dataset(root: Path, yaml_path: Path | None):
|
109 |
if yaml_path is None:
|
@@ -116,13 +116,13 @@ def gather_dataset(root: Path, yaml_path: Path | None):
|
|
116 |
if not img_dirs:
|
117 |
raise FileNotFoundError("images/ directory missing")
|
118 |
imgs = [p for d in img_dirs for p in d.rglob('*.*') if imghdr.what(p)]
|
119 |
-
|
120 |
-
lbls = [next((lr/f"{p.stem}.txt" for lr in
|
121 |
for p in imgs]
|
122 |
return imgs, lbls, meta
|
123 |
|
124 |
def get_model(weights: str) -> YOLO | None:
|
125 |
-
if weights
|
126 |
return None
|
127 |
if weights not in _model_cache:
|
128 |
_model_cache[weights] = YOLO(weights)
|
@@ -130,7 +130,7 @@ def get_model(weights: str) -> YOLO | None:
|
|
130 |
|
131 |
# ───────── Functions for parallel mapping ──────────────────────────────────
|
132 |
def _quality_stat_args(args: Tuple[Path, float]) -> Tuple[Path, bool, bool, bool]:
|
133 |
-
path,
|
134 |
if cv2 is None:
|
135 |
return path, False, False, False
|
136 |
im = cv2.imread(str(path))
|
@@ -138,11 +138,8 @@ def _quality_stat_args(args: Tuple[Path, float]) -> Tuple[Path, bool, bool, bool
|
|
138 |
return path, False, False, False
|
139 |
gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
|
140 |
lap = cv2.Laplacian(gray, cv2.CV_64F).var()
|
141 |
-
|
142 |
-
return path, lap <
|
143 |
-
|
144 |
-
def _compute_hash(path: Path) -> str:
|
145 |
-
return str(imagehash.average_hash(Image.open(path)))
|
146 |
|
147 |
def _is_corrupt(path: Path) -> bool:
|
148 |
try:
|
@@ -154,19 +151,19 @@ def _is_corrupt(path: Path) -> bool:
|
|
154 |
|
155 |
# ───────────────── Quality Checks ──────────────────────────────────────────
|
156 |
def qc_integrity(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
157 |
-
missing = [i for i,
|
158 |
corrupt = []
|
159 |
with ProcessPoolExecutor(max_workers=cfg.cpu_count) as ex:
|
160 |
fut = {ex.submit(_is_corrupt, p): p for p in imgs}
|
161 |
for f in as_completed(fut):
|
162 |
if f.result(): corrupt.append(fut[f])
|
163 |
-
score = 100 - (len(missing)
|
164 |
return {"name":"Integrity","score":max(score,0),
|
165 |
"details":{"missing_label_files":[str(p) for p in missing],
|
166 |
"corrupt_images":[str(p) for p in corrupt]}}
|
167 |
|
168 |
def qc_class_balance(lbls: List[Path], cfg: QCConfig) -> Dict:
|
169 |
-
counts = Counter()
|
170 |
for l in lbls:
|
171 |
bs = parse_label_file(l) if l else []
|
172 |
boxes.append(len(bs)); counts.update(b[0] for b in bs)
|
@@ -175,67 +172,69 @@ def qc_class_balance(lbls: List[Path], cfg: QCConfig) -> Dict:
|
|
175 |
bal = min(counts.values())/max(counts.values())*100
|
176 |
return {"name":"Class balance","score":bal,
|
177 |
"details":{"class_counts":dict(counts),
|
178 |
-
"boxes_per_image":{
|
|
|
|
|
179 |
|
180 |
def qc_image_quality(imgs: List[Path], cfg: QCConfig) -> Dict:
|
181 |
if cv2 is None:
|
182 |
return {"name":"Image quality","score":100,"details":"cv2 missing"}
|
183 |
-
blurry,dark,bright = [],[],[]
|
184 |
with ProcessPoolExecutor(max_workers=cfg.cpu_count) as ex:
|
185 |
args = [(p, cfg.blur_thr) for p in imgs]
|
186 |
for p, isb, isd, isB in tqdm(
|
187 |
-
ex.map(_quality_stat_args, args), total=len(imgs),
|
|
|
|
|
188 |
if isb: blurry.append(p)
|
189 |
if isd: dark.append(p)
|
190 |
if isB: bright.append(p)
|
191 |
bad = len({*blurry,*dark,*bright})
|
192 |
-
score = 100 - bad
|
193 |
return {"name":"Image quality","score":score,
|
194 |
"details":{"blurry":[str(p) for p in blurry],
|
195 |
"dark":[str(p) for p in dark],
|
196 |
"bright":[str(p) for p in bright]}}
|
197 |
|
198 |
def qc_duplicates(imgs: List[Path], cfg: QCConfig) -> Dict:
|
199 |
-
|
|
|
200 |
try:
|
201 |
-
fd = fastdup.create(
|
202 |
-
|
|
|
|
|
|
|
|
|
203 |
dup = sum(len(c)-1 for c in clusters)
|
204 |
return {"name":"Duplicates","score":100-dup/len(imgs)*100,
|
205 |
"details":{"groups":clusters[:50]}}
|
206 |
-
except:
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
for h,p in tqdm(zip(ex.map(_compute_hash, imgs), imgs),total=len(imgs),desc="hashing",leave=False):
|
213 |
-
hashes[h].append(p)
|
214 |
-
groups = [g for g in hashes.values() if len(g)>1]
|
215 |
-
dup = sum(len(g)-1 for g in groups)
|
216 |
-
score = 100 - dup / max(len(imgs), 1) * 100
|
217 |
-
return {"name":"Duplicates","score":score,
|
218 |
-
"details":{"groups":[[str(p) for p in g] for g in groups[:50]]}}
|
219 |
|
220 |
def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
221 |
model = get_model(cfg.weights)
|
222 |
if model is None:
|
223 |
return {"name":"Model QA","score":100,"details":"skipped"}
|
224 |
ious, mism = [], []
|
225 |
-
for i in range(0,
|
226 |
batch = imgs[i:i+cfg.batch_size]
|
227 |
results = model.predict(batch, verbose=False, half=True, dynamic=True)
|
228 |
-
for p,res in zip(batch,
|
229 |
gt = parse_label_file(p.parent.parent/'labels'/f"{p.stem}.txt")
|
230 |
for cls,x,y,w,h in gt:
|
231 |
best=0.0
|
232 |
for b,c,conf in zip(res.boxes.xywh.cpu().numpy(),
|
233 |
res.boxes.cls.cpu().numpy(),
|
234 |
res.boxes.conf.cpu().numpy()):
|
235 |
-
if conf
|
236 |
-
best = max(best,
|
237 |
ious.append(best)
|
238 |
-
if best
|
239 |
miou = float(np.mean(ious)) if ious else 1.0
|
240 |
return {"name":"Model QA","score":miou*100,
|
241 |
"details":{"mean_iou":miou,"mismatches":mism[:50]}}
|
@@ -243,52 +242,53 @@ def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
|
243 |
def qc_label_issues(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
244 |
if get_noise_indices is None:
|
245 |
return {"name":"Label issues","score":100,"details":"cleanlab missing"}
|
246 |
-
labels,preds,idxs = [],[],[]
|
247 |
-
|
|
|
248 |
bs = parse_label_file(lbl) if lbl else []
|
249 |
for cls,*_ in bs:
|
250 |
labels.append(int(cls)); idxs.append(i)
|
251 |
-
|
252 |
-
|
253 |
-
preds.append(
|
254 |
if not labels:
|
255 |
return {"name":"Label issues","score":100,"details":"no GT"}
|
256 |
labels_arr = np.array(labels)
|
257 |
-
#
|
258 |
-
|
|
|
259 |
noise = get_noise_indices(labels=labels_arr, probabilities=probs)
|
260 |
-
|
261 |
-
files = [str(imgs[i]) for i in
|
262 |
-
score = 100 - len(
|
263 |
-
return {"name":"Label issues","score":score,
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
|
|
272 |
return inter/union if union else 0.0
|
273 |
|
274 |
def aggregate(results: List[Dict]) -> float:
|
275 |
-
return sum(DEFAULT_W[r['name']]
|
276 |
|
277 |
-
# βββββββββββββββββββ RF URL & Download ββββββββββββββββββββββββββββββββββββ
|
278 |
RF_RE = re.compile(r"https?://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")
|
|
|
279 |
def download_rf_dataset(url: str, rf_api: Roboflow, dest: Path) -> Path:
|
280 |
m = RF_RE.match(url.strip())
|
281 |
if not m:
|
282 |
raise ValueError(f"Bad RF URL: {url}")
|
283 |
ws, proj, ver = m.groups()
|
284 |
ds_dir = dest/f"{ws}_{proj}_v{ver}"
|
285 |
-
if ds_dir.exists():
|
286 |
-
|
287 |
-
|
288 |
-
project.version(int(ver)).download("yolov8", location=str(ds_dir))
|
289 |
return ds_dir
|
290 |
|
291 |
-
# βββββββββββββββββββ Main runner & Gradio UI βββββββββββββββββββββββββββββ
|
292 |
def run_quality(root: Path, yaml_file: Path | None, weights: Path | None, cfg: QCConfig) -> Tuple[str,pd.DataFrame]:
|
293 |
imgs,lbls,meta = gather_dataset(root, yaml_file)
|
294 |
results = [
|
@@ -300,11 +300,11 @@ def run_quality(root: Path, yaml_file: Path | None, weights: Path | None, cfg: Q
|
|
300 |
qc_label_issues(imgs,lbls,cfg),
|
301 |
]
|
302 |
final = aggregate(results)
|
303 |
-
md = [f"## **{meta.get('name',
|
304 |
for r in results:
|
305 |
md.append(f"### {r['name']}Β Β {r['score']:.1f}")
|
306 |
md.append("<details><summary>details</summary>\n```json")
|
307 |
-
md.append(json.dumps(r['details'],
|
308 |
md.append("```\n</details>\n")
|
309 |
df = pd.DataFrame.from_dict(
|
310 |
next(r for r in results if r['name']=='Class balance')['details']['class_counts'],
|
@@ -319,6 +319,7 @@ with gr.Blocks(title="YOLO Dataset Quality Evaluator v3") as demo:
|
|
319 |
|
320 |
* Configurable blur, IOU & confidence thresholds
|
321 |
* Cleanlab label-issue detection
|
|
|
322 |
* Model caching for speed
|
323 |
""")
|
324 |
with gr.Row():
|
@@ -343,7 +344,6 @@ with gr.Blocks(title="YOLO Dataset Quality Evaluator v3") as demo:
|
|
343 |
reports, dfs = [], []
|
344 |
cfg = QCConfig(blur_thr, iou_thr, conf_thr, weights.name if weights else None)
|
345 |
rf = Roboflow(api_key) if api_key and Roboflow else None
|
346 |
-
# Roboflow batch
|
347 |
if url_txt:
|
348 |
for line in Path(url_txt.name).read_text().splitlines():
|
349 |
if not line.strip(): continue
|
@@ -353,7 +353,6 @@ with gr.Blocks(title="YOLO Dataset Quality Evaluator v3") as demo:
|
|
353 |
reports.append(md); dfs.append(df)
|
354 |
except Exception as e:
|
355 |
reports.append(f"### {line}\nβ οΈΒ {e}")
|
356 |
-
# Manual ZIP
|
357 |
if zip_file:
|
358 |
tmp = Path(tempfile.mkdtemp())
|
359 |
shutil.unpack_archive(zip_file.name, tmp)
|
@@ -361,7 +360,6 @@ with gr.Blocks(title="YOLO Dataset Quality Evaluator v3") as demo:
|
|
361 |
Path(weights.name) if weights else None, cfg)
|
362 |
reports.append(md); dfs.append(df)
|
363 |
shutil.rmtree(tmp, ignore_errors=True)
|
364 |
-
# Server path
|
365 |
if server_path:
|
366 |
ds = Path(server_path)
|
367 |
md, df = run_quality(ds, Path(yaml_file.name) if yaml_file else None,
|
|
|
4 |
Changelog (2025-04-17)
|
5 |
──────────────────────
|
6 |
• Fix RF URL regex to accept http/https
|
7 |
+
• Top-level functions for parallel mapping (picklable)
|
8 |
+
• Fastdup-only path in qc_duplicates (skips hashing fallback)
|
9 |
• YOLO model caching
|
10 |
• Config dataclass & Gradio sliders for blur, IOU, confidence
|
11 |
+
• Cleanlab integration for label-issue detection
|
12 |
"""
|
13 |
|
14 |
from __future__ import annotations
|
|
|
97 |
if arr.ndim == 1:
|
98 |
arr = arr.reshape(1, -1)
|
99 |
return [tuple(row) for row in arr]
|
100 |
+
except:
|
101 |
return []
|
102 |
|
103 |
def guess_image_dirs(root: Path) -> List[Path]:
    """Return the conventional YOLO image folders that exist under *root*.

    Probes ``images`` directly under *root* and then under the ``train``,
    ``valid``, ``val`` and ``test`` split folders, in that order, keeping
    only the paths that exist on disk.
    """
    split_prefixes = ((), ('train',), ('valid',), ('val',), ('test',))
    found: List[Path] = []
    for prefix in split_prefixes:
        probe = root.joinpath(*prefix, 'images')
        if probe.exists():
            found.append(probe)
    return found
|
107 |
|
108 |
def gather_dataset(root: Path, yaml_path: Path | None):
|
109 |
if yaml_path is None:
|
|
|
116 |
if not img_dirs:
|
117 |
raise FileNotFoundError("images/ directory missing")
|
118 |
imgs = [p for d in img_dirs for p in d.rglob('*.*') if imghdr.what(p)]
|
119 |
+
labels_roots = {d.parent/'labels' for d in img_dirs}
|
120 |
+
lbls = [next((lr/f"{p.stem}.txt" for lr in labels_roots if (lr/f"{p.stem}.txt").exists()), None)
|
121 |
for p in imgs]
|
122 |
return imgs, lbls, meta
|
123 |
|
124 |
def get_model(weights: str) -> YOLO | None:
|
125 |
+
if not weights or YOLO is None:
|
126 |
return None
|
127 |
if weights not in _model_cache:
|
128 |
_model_cache[weights] = YOLO(weights)
|
|
|
130 |
|
131 |
# ───────── Functions for parallel mapping ──────────────────────────────────
|
132 |
def _quality_stat_args(args: Tuple[Path, float]) -> Tuple[Path, bool, bool, bool]:
|
133 |
+
path, thr = args
|
134 |
if cv2 is None:
|
135 |
return path, False, False, False
|
136 |
im = cv2.imread(str(path))
|
|
|
138 |
return path, False, False, False
|
139 |
gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
|
140 |
lap = cv2.Laplacian(gray, cv2.CV_64F).var()
|
141 |
+
mean = gray.mean()
|
142 |
+
return path, lap < thr, mean < 25, mean > 230
|
|
|
|
|
|
|
143 |
|
144 |
def _is_corrupt(path: Path) -> bool:
|
145 |
try:
|
|
|
151 |
|
152 |
# ───────────────── Quality Checks ──────────────────────────────────────────
|
153 |
def qc_integrity(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
    """Score the dataset on missing label files and corrupt images.

    Args:
        imgs: Image paths to check.
        lbls: Label path for each image, positionally aligned with *imgs*;
            ``None`` marks an image without a label file.
        cfg: Quality-check configuration; ``cpu_count`` sizes the worker pool.

    Returns:
        A result dict with ``name``, ``score`` (0-100) and ``details``
        listing the offending files.
    """
    unlabeled = [img for img, lbl in zip(imgs, lbls) if lbl is None]

    broken = []
    with ProcessPoolExecutor(max_workers=cfg.cpu_count) as pool:
        pending = {pool.submit(_is_corrupt, img): img for img in imgs}
        # Collected in completion order, not submission order.
        for done in as_completed(pending):
            if done.result():
                broken.append(pending[done])

    total = max(len(imgs), 1)  # avoid dividing by zero on an empty dataset
    score = 100 - (len(unlabeled) + len(broken)) / total * 100
    details = {
        "missing_label_files": [str(p) for p in unlabeled],
        "corrupt_images": [str(p) for p in broken],
    }
    return {"name": "Integrity", "score": max(score, 0), "details": details}
|
164 |
|
165 |
def qc_class_balance(lbls: List[Path], cfg: QCConfig) -> Dict:
|
166 |
+
counts, boxes = Counter(), []
|
167 |
for l in lbls:
|
168 |
bs = parse_label_file(l) if l else []
|
169 |
boxes.append(len(bs)); counts.update(b[0] for b in bs)
|
|
|
172 |
bal = min(counts.values())/max(counts.values())*100
|
173 |
return {"name":"Class balance","score":bal,
|
174 |
"details":{"class_counts":dict(counts),
|
175 |
+
"boxes_per_image":{
|
176 |
+
"min":min(boxes),"max":max(boxes),
|
177 |
+
"mean":float(np.mean(boxes))}}}
|
178 |
|
179 |
def qc_image_quality(imgs: List[Path], cfg: QCConfig) -> Dict:
    """Score the dataset on blurry, dark and over-bright images.

    Each image is evaluated by ``_quality_stat_args`` in a process pool.
    The score is the percentage of images that raised none of the three
    flags.  When OpenCV is unavailable the check is reported as passed.

    Args:
        imgs: Image paths to evaluate.
        cfg: Quality-check configuration; reads ``blur_thr`` and ``cpu_count``.

    Returns:
        A result dict with ``name``, ``score`` and per-flag file lists.
    """
    if cv2 is None:
        return {"name":"Image quality","score":100,"details":"cv2 missing"}

    flagged = {"blurry": [], "dark": [], "bright": []}
    jobs = [(img, cfg.blur_thr) for img in imgs]
    with ProcessPoolExecutor(max_workers=cfg.cpu_count) as pool:
        stats = tqdm(pool.map(_quality_stat_args, jobs), total=len(imgs),
                     desc="img-quality", leave=False)
        for img, too_blurry, too_dark, too_bright in stats:
            if too_blurry:
                flagged["blurry"].append(img)
            if too_dark:
                flagged["dark"].append(img)
            if too_bright:
                flagged["bright"].append(img)

    # An image that trips several flags still counts as one bad image.
    bad_images = set(flagged["blurry"]) | set(flagged["dark"]) | set(flagged["bright"])
    score = 100 - len(bad_images) / max(len(imgs), 1) * 100
    details = {kind: [str(p) for p in paths] for kind, paths in flagged.items()}
    return {"name":"Image quality","score":score,"details":details}
|
198 |
|
199 |
def qc_duplicates(imgs: List[Path], cfg: QCConfig) -> Dict:
    """Score the dataset on duplicate images using fastdup only.

    fastdup is attempted only when the module is available and the dataset
    has more than 50 images; otherwise the check is reported as passed with
    an explanatory note.  A fastdup failure is reported in the details
    instead of being raised, so one broken check does not abort the report.

    Args:
        imgs: Image paths; fastdup is pointed at the grandparent directory
            of the first one.
        cfg: Quality-check configuration (not read by this check).

    Returns:
        A result dict with ``name``, ``score`` (0-100) and ``details``.
    """
    if fastdup is None or len(imgs) <= 50:
        return {"name":"Duplicates","score":100,
                "details":{"note":"fastdup not available or small dataset"}}
    try:
        fd = fastdup.create(
            input_dir=str(Path(imgs[0]).parent.parent),
            work_dir=str(TMP_ROOT/'fastdup')
        )
        fd.run()
        # NOTE(review): assumes get_clusters() returns a sliceable sequence of
        # JSON-serialisable groups — confirm against the installed fastdup.
        clusters = fd.get_clusters()
        dup = sum(len(c)-1 for c in clusters)
        # fastdup scans the whole grandparent directory, which may hold more
        # images than `imgs`; clamp so the score never drops below zero.
        score = max(100 - dup/len(imgs)*100, 0)
        return {"name":"Duplicates","score":score,
                "details":{"groups":clusters[:50]}}
    except Exception as e:
        return {"name":"Duplicates","score":100,
                "details":{"fastdup_error":str(e)}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
|
219 |
def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
    """Score label quality by comparing ground truth to model predictions.

    For every ground-truth box, the best IoU against same-class predictions
    with confidence of at least ``cfg.conf_thr`` is recorded.  The score is
    the mean of those IoUs times 100.  Images owning a box whose best IoU is
    below ``cfg.iou_thr`` are listed as mismatches (once per such box).

    Args:
        imgs: Image paths to run the model on.
        lbls: Label path for each image, positionally aligned with *imgs*;
            ``None`` marks an image without a label file.
        cfg: Quality-check configuration; reads ``weights``, ``batch_size``,
            ``conf_thr`` and ``iou_thr``.

    Returns:
        A result dict with ``name``, ``score`` and ``details``.  The check
        is reported as skipped when no model is available.
    """
    model = get_model(cfg.weights)
    if model is None:
        return {"name":"Model QA","score":100,"details":"skipped"}
    ious, mism = [], []
    for start in range(0, len(imgs), cfg.batch_size):
        batch = imgs[start:start + cfg.batch_size]
        batch_lbls = lbls[start:start + cfg.batch_size]
        results = model.predict(batch, verbose=False, half=True, dynamic=True)
        for p, lbl, res in zip(batch, batch_lbls, results):
            # Use the label file already resolved for this image instead of
            # re-deriving its location from the image path.
            gt = parse_label_file(lbl) if lbl else []
            if not gt:
                continue
            # YOLO label files store normalised centre/size boxes, so compare
            # against the normalised predictions (xywhn), not pixel-space xywh.
            # Converted once per image rather than once per ground-truth box.
            pred_boxes = res.boxes.xywhn.cpu().numpy()
            pred_cls = res.boxes.cls.cpu().numpy()
            pred_conf = res.boxes.conf.cpu().numpy()
            for cls, x, y, w, h in gt:
                best = 0.0
                for b, c, conf in zip(pred_boxes, pred_cls, pred_conf):
                    if conf < cfg.conf_thr or int(c) != cls:
                        continue
                    best = max(best, _rel_iou((x, y, w, h), tuple(b)))
                ious.append(best)
                if best < cfg.iou_thr:
                    mism.append(str(p))
    miou = float(np.mean(ious)) if ious else 1.0
    return {"name":"Model QA","score":miou*100,
            "details":{"mean_iou":miou,"mismatches":mism[:50]}}
|
|
|
242 |
def qc_label_issues(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
    """Flag images whose labels cleanlab considers noisy.

    Args:
        imgs: Image paths.
        lbls: Label path for each image, positionally aligned with *imgs*;
            ``None`` marks an image without a label file.
        cfg: Quality-check configuration; reads ``weights``.

    Returns:
        A result dict with ``name``, ``score`` and ``details``.  The check
        is reported as passed when cleanlab is missing or there are no
        ground-truth boxes.
    """
    if get_noise_indices is None:
        return {"name":"Label issues","score":100,"details":"cleanlab missing"}
    labels, preds, idxs = [], [], []
    model = get_model(cfg.weights)
    for i, (img, lbl) in enumerate(zip(imgs, lbls)):
        bs = parse_label_file(lbl) if lbl else []
        if not bs:
            continue
        # get_model() returns None when no weights are configured; skip
        # inference then instead of crashing.  When a model exists, predict
        # once per image rather than once per box.
        pred_cls = -1
        if model is not None:
            res = model.predict([img], verbose=False)[0]
            if len(res.boxes) > 0:
                pred_cls = int(res.boxes.cls.cpu().numpy()[0])
        for cls, *_ in bs:
            labels.append(int(cls)); idxs.append(i)
            # NOTE(review): preds are collected but not consumed below; the
            # probabilities are a one-hot dummy built from the labels.
            preds.append(pred_cls)
    if not labels:
        return {"name":"Label issues","score":100,"details":"no GT"}
    labels_arr = np.array(labels)
    # one-hot dummy: each label mapped to its rank among the distinct classes
    uniq, inverse = np.unique(labels_arr, return_inverse=True)
    probs = np.eye(len(uniq))[inverse]
    # NOTE(review): keyword names assumed to match the installed cleanlab
    # version — confirm.
    noise = get_noise_indices(labels=labels_arr, probabilities=probs)
    flags = sorted({idxs[n] for n in noise})
    files = [str(imgs[i]) for i in flags]
    score = 100 - len(flags)/len(labels)*100
    return {"name":"Label issues","score":score,
            "details":{"files":files[:50]}}
|
266 |
+
|
267 |
+
def _rel_iou(b1,b2):
|
268 |
+
x1,y1,w1,h1=b1; x2,y2,w2,h2=b2
|
269 |
+
xa1,ya1,xa2,ya2=x1-w1/2,y1-h1/2,x1+w1/2,y1+h1/2
|
270 |
+
xb1,yb1,xb2,yb2=x2-w2/2,y2-h2/2,x2+w2/2,y2+h2/2
|
271 |
+
ix1,iy1,ix2,iy2=max(xa1,xb1),max(ya1,yb1),min(xa2,xb2),min(ya2,yb2)
|
272 |
+
inter=max(ix2-ix1,0)*max(iy2-iy1,0)
|
273 |
+
union=w1*h1+w2*h2-inter
|
274 |
return inter/union if union else 0.0
|
275 |
|
276 |
def aggregate(results: List[Dict]) -> float:
    """Combine per-check scores into one weighted total.

    Each result's ``score`` is multiplied by the weight stored in
    ``DEFAULT_W`` under the result's ``name``, and the products are summed.
    """
    total = 0
    for entry in results:
        total += DEFAULT_W[entry['name']] * entry['score']
    return total
|
278 |
|
|
|
279 |
# Matches Roboflow Universe dataset URLs; captures workspace, project, version.
RF_RE = re.compile(r"https?://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")


def download_rf_dataset(url: str, rf_api: Roboflow, dest: Path) -> Path:
    """Download a Roboflow Universe dataset in YOLOv8 format, with caching.

    Args:
        url: Dataset URL of the form
            ``http(s)://universe.roboflow.com/<workspace>/<project>/dataset/<n>``.
            Surrounding whitespace is ignored.
        rf_api: Authenticated Roboflow client.  May be ``None`` only when
            the dataset directory already exists under *dest*.
        dest: Directory under which the dataset folder is created.

    Returns:
        Path to ``<dest>/<workspace>_<project>_v<n>``.

    Raises:
        ValueError: If *url* is not a recognised dataset URL, or a download
            is required but no Roboflow client was supplied.
    """
    m = RF_RE.match(url.strip())
    if not m:
        raise ValueError(f"Bad RF URL: {url}")
    ws, proj, ver = m.groups()
    ds_dir = dest/f"{ws}_{proj}_v{ver}"
    # An existing directory is treated as a cached download.
    if ds_dir.exists():
        return ds_dir
    # Fail with a clear message instead of an AttributeError on None.
    if rf_api is None:
        raise ValueError(f"Roboflow API key required to download: {url}")
    pr = rf_api.workspace(ws).project(proj)
    pr.version(int(ver)).download("yolov8", location=str(ds_dir))
    return ds_dir
|
291 |
|
|
|
292 |
def run_quality(root: Path, yaml_file: Path | None, weights: Path | None, cfg: QCConfig) -> Tuple[str,pd.DataFrame]:
|
293 |
imgs,lbls,meta = gather_dataset(root, yaml_file)
|
294 |
results = [
|
|
|
300 |
qc_label_issues(imgs,lbls,cfg),
|
301 |
]
|
302 |
final = aggregate(results)
|
303 |
+
md = [f"## **{meta.get('name',root.name)}** β ScoreΒ {final:.1f}/100"]
|
304 |
for r in results:
|
305 |
md.append(f"### {r['name']}Β Β {r['score']:.1f}")
|
306 |
md.append("<details><summary>details</summary>\n```json")
|
307 |
+
md.append(json.dumps(r['details'],indent=2))
|
308 |
md.append("```\n</details>\n")
|
309 |
df = pd.DataFrame.from_dict(
|
310 |
next(r for r in results if r['name']=='Class balance')['details']['class_counts'],
|
|
|
319 |
|
320 |
* Configurable blur, IOU & confidence thresholds
|
321 |
* Cleanlab label-issue detection
|
322 |
+
* Fastdup-only duplicates (no hashing fallback)
|
323 |
* Model caching for speed
|
324 |
""")
|
325 |
with gr.Row():
|
|
|
344 |
reports, dfs = [], []
|
345 |
cfg = QCConfig(blur_thr, iou_thr, conf_thr, weights.name if weights else None)
|
346 |
rf = Roboflow(api_key) if api_key and Roboflow else None
|
|
|
347 |
if url_txt:
|
348 |
for line in Path(url_txt.name).read_text().splitlines():
|
349 |
if not line.strip(): continue
|
|
|
353 |
reports.append(md); dfs.append(df)
|
354 |
except Exception as e:
|
355 |
reports.append(f"### {line}\nβ οΈΒ {e}")
|
|
|
356 |
if zip_file:
|
357 |
tmp = Path(tempfile.mkdtemp())
|
358 |
shutil.unpack_archive(zip_file.name, tmp)
|
|
|
360 |
Path(weights.name) if weights else None, cfg)
|
361 |
reports.append(md); dfs.append(df)
|
362 |
shutil.rmtree(tmp, ignore_errors=True)
|
|
|
363 |
if server_path:
|
364 |
ds = Path(server_path)
|
365 |
md, df = run_quality(ds, Path(yaml_file.name) if yaml_file else None,
|