Update app.py
Browse files
app.py
CHANGED
@@ -47,8 +47,8 @@ except ImportError:
|
|
47 |
# ───────────────── Config & Constants ───────────────────────────────────────
|
48 |
TMP_ROOT = Path(tempfile.gettempdir()) / "rf_datasets"
|
49 |
TMP_ROOT.mkdir(parents=True, exist_ok=True)
|
50 |
-
CPU_COUNT = int(os.getenv("QC_CPU", 1))
|
51 |
-
BATCH_SIZE = int(os.getenv("QC_BATCH", 4))
|
52 |
SAMPLE_LIMIT = int(os.getenv("QC_SAMPLE", 200))
|
53 |
|
54 |
DEFAULT_W = {
|
@@ -64,21 +64,20 @@ _model_cache: dict[str, YOLO] = {}
|
|
64 |
|
65 |
@dataclass
|
66 |
class QCConfig:
|
67 |
-
blur_thr:
|
68 |
-
iou_thr:
|
69 |
-
conf_thr:
|
70 |
-
weights:
|
71 |
-
cpu_count:
|
72 |
-
batch_size:
|
73 |
-
sample_limit:
|
74 |
-
|
75 |
|
76 |
# ─────────── Helpers & Caching ─────────────────────────────────────────────
|
77 |
def load_yaml(path: Path) -> Dict:
|
78 |
with path.open('r', encoding='utf-8') as f:
|
79 |
return yaml.safe_load(f)
|
80 |
|
81 |
-
def parse_label_file(path: Path) ->
|
82 |
if not path or not path.exists() or path.stat().st_size == 0:
|
83 |
return []
|
84 |
try:
|
@@ -124,7 +123,6 @@ def get_model(weights: str) -> YOLO | None:
|
|
124 |
_model_cache[weights] = YOLO(weights)
|
125 |
return _model_cache[weights]
|
126 |
|
127 |
-
|
128 |
# ───────── Functions for I/O-bound concurrency ─────────────────────────────
|
129 |
def _quality_stat_args(args: Tuple[Path, float]) -> Tuple[Path, bool, bool, bool]:
|
130 |
path, thr = args
|
@@ -146,12 +144,11 @@ def _is_corrupt(path: Path) -> bool:
|
|
146 |
except:
|
147 |
return True
|
148 |
|
149 |
-
|
150 |
# ───────────────── Quality Checks ──────────────────────────────────────────
|
151 |
def qc_integrity(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
152 |
missing = [i for i, l in zip(imgs, lbls) if l is None]
|
153 |
corrupt = []
|
154 |
-
sample = imgs
|
155 |
with ThreadPoolExecutor(max_workers=cfg.cpu_count) as ex:
|
156 |
fut = {ex.submit(_is_corrupt, p): p for p in sample}
|
157 |
for f in as_completed(fut):
|
@@ -193,7 +190,7 @@ def qc_image_quality(imgs: List[Path], cfg: QCConfig) -> Dict:
|
|
193 |
if cv2 is None:
|
194 |
return {"name":"Image quality","score":100,"details":"cv2 missing"}
|
195 |
blurry, dark, bright = [], [], []
|
196 |
-
sample = imgs
|
197 |
with ThreadPoolExecutor(max_workers=cfg.cpu_count) as ex:
|
198 |
args = [(p, cfg.blur_thr) for p in sample]
|
199 |
for p, isb, isd, isB in ex.map(_quality_stat_args, args):
|
@@ -220,15 +217,23 @@ def qc_duplicates(imgs: List[Path], cfg: QCConfig) -> Dict:
|
|
220 |
work_dir=str(TMP_ROOT / "fastdup")
|
221 |
)
|
222 |
fd.run()
|
223 |
-
|
|
|
224 |
try:
|
225 |
cc = fd.connected_components_grouped(sort_by="comp_size", ascending=False)
|
226 |
if "files" in cc.columns:
|
227 |
clusters = cc["files"].tolist()
|
228 |
else:
|
229 |
-
|
|
|
|
|
|
|
|
|
|
|
230 |
except Exception:
|
|
|
231 |
clusters = fd.connected_components()
|
|
|
232 |
dup = sum(len(c) - 1 for c in clusters)
|
233 |
score = max(0.0, 100 - dup / len(imgs) * 100)
|
234 |
return {
|
@@ -242,14 +247,14 @@ def qc_duplicates(imgs: List[Path], cfg: QCConfig) -> Dict:
|
|
242 |
"score": 100.0,
|
243 |
"details": {"fastdup_error": str(e)}
|
244 |
}
|
245 |
-
return {"name":"Duplicates","score":100.0,"details":{"note":"skipped"}}
|
246 |
|
247 |
def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
248 |
model = get_model(cfg.weights)
|
249 |
if model is None:
|
250 |
return {"name":"Model QA","score":100,"details":"skipped"}
|
251 |
ious, mism = [], []
|
252 |
-
sample = imgs
|
253 |
for i in range(0, len(sample), cfg.batch_size):
|
254 |
batch = sample[i:i+cfg.batch_size]
|
255 |
results = model.predict(batch, verbose=False, half=True, dynamic=True)
|
@@ -279,7 +284,7 @@ def qc_label_issues(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
|
279 |
if get_noise_indices is None:
|
280 |
return {"name":"Label issues","score":100,"details":"skipped"}
|
281 |
labels, idxs = [], []
|
282 |
-
sample = imgs
|
283 |
for i, p in enumerate(sample):
|
284 |
bs = parse_label_file(lbls[i]) if lbls[i] else []
|
285 |
for cls, *_ in bs:
|
@@ -382,7 +387,7 @@ with gr.Blocks(title="YOLO Dataset Quality Evaluator v3") as demo:
|
|
382 |
yaml_in = gr.File(label="Custom YAML", file_types=['.yaml'])
|
383 |
weights_in = gr.File(label="YOLO weights (.pt)")
|
384 |
with gr.Row():
|
385 |
-
blur_sl = gr.Slider(0.0, 500.0, value=
|
386 |
iou_sl = gr.Slider(0.0, 1.0, value=0.5, label="IOU threshold")
|
387 |
conf_sl = gr.Slider(0.0, 1.0, value=0.25, label="Min detection confidence")
|
388 |
with gr.Row():
|
@@ -405,14 +410,16 @@ with gr.Blocks(title="YOLO Dataset Quality Evaluator v3") as demo:
|
|
405 |
|
406 |
# Roboflow URLs
|
407 |
if url_txt:
|
408 |
-
|
409 |
-
for line in lines:
|
410 |
if not line.strip():
|
411 |
continue
|
412 |
try:
|
413 |
ds = download_rf_dataset(line, rf, TMP_ROOT)
|
414 |
-
md, df = run_quality(
|
415 |
-
|
|
|
|
|
|
|
416 |
reports.append(md)
|
417 |
dfs.append(df)
|
418 |
except Exception as e:
|
@@ -422,10 +429,12 @@ with gr.Blocks(title="YOLO Dataset Quality Evaluator v3") as demo:
|
|
422 |
if zip_file:
|
423 |
tmp = Path(tempfile.mkdtemp())
|
424 |
shutil.unpack_archive(zip_file.name, tmp)
|
425 |
-
md, df = run_quality(
|
426 |
-
|
427 |
-
|
428 |
-
|
|
|
|
|
429 |
reports.append(md)
|
430 |
dfs.append(df)
|
431 |
shutil.rmtree(tmp, ignore_errors=True)
|
@@ -433,10 +442,12 @@ with gr.Blocks(title="YOLO Dataset Quality Evaluator v3") as demo:
|
|
433 |
# Server path
|
434 |
if server_path:
|
435 |
ds = Path(server_path)
|
436 |
-
md, df = run_quality(
|
437 |
-
|
438 |
-
|
439 |
-
|
|
|
|
|
440 |
reports.append(md)
|
441 |
dfs.append(df)
|
442 |
|
|
|
47 |
# ───────────────── Config & Constants ───────────────────────────────────────
|
48 |
TMP_ROOT = Path(tempfile.gettempdir()) / "rf_datasets"
|
49 |
TMP_ROOT.mkdir(parents=True, exist_ok=True)
|
50 |
+
CPU_COUNT = int(os.getenv("QC_CPU", 1)) # force single-core by default
|
51 |
+
BATCH_SIZE = int(os.getenv("QC_BATCH", 4)) # small batches
|
52 |
SAMPLE_LIMIT = int(os.getenv("QC_SAMPLE", 200))
|
53 |
|
54 |
DEFAULT_W = {
|
|
|
64 |
|
65 |
@dataclass
|
66 |
class QCConfig:
|
67 |
+
blur_thr: float
|
68 |
+
iou_thr: float
|
69 |
+
conf_thr: float
|
70 |
+
weights: str | None
|
71 |
+
cpu_count: int = CPU_COUNT
|
72 |
+
batch_size: int = BATCH_SIZE
|
73 |
+
sample_limit:int = SAMPLE_LIMIT
|
|
|
74 |
|
75 |
# ─────────── Helpers & Caching ─────────────────────────────────────────────
|
76 |
def load_yaml(path: Path) -> Dict:
|
77 |
with path.open('r', encoding='utf-8') as f:
|
78 |
return yaml.safe_load(f)
|
79 |
|
80 |
+
def parse_label_file(path: Path) -> list[tuple[int, float, float, float, float]]:
|
81 |
if not path or not path.exists() or path.stat().st_size == 0:
|
82 |
return []
|
83 |
try:
|
|
|
123 |
_model_cache[weights] = YOLO(weights)
|
124 |
return _model_cache[weights]
|
125 |
|
|
|
126 |
# ───────── Functions for I/O-bound concurrency ─────────────────────────────
|
127 |
def _quality_stat_args(args: Tuple[Path, float]) -> Tuple[Path, bool, bool, bool]:
|
128 |
path, thr = args
|
|
|
144 |
except:
|
145 |
return True
|
146 |
|
|
|
147 |
# ───────────────── Quality Checks ──────────────────────────────────────────
|
148 |
def qc_integrity(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
149 |
missing = [i for i, l in zip(imgs, lbls) if l is None]
|
150 |
corrupt = []
|
151 |
+
sample = imgs[:cfg.sample_limit]
|
152 |
with ThreadPoolExecutor(max_workers=cfg.cpu_count) as ex:
|
153 |
fut = {ex.submit(_is_corrupt, p): p for p in sample}
|
154 |
for f in as_completed(fut):
|
|
|
190 |
if cv2 is None:
|
191 |
return {"name":"Image quality","score":100,"details":"cv2 missing"}
|
192 |
blurry, dark, bright = [], [], []
|
193 |
+
sample = imgs[:cfg.sample_limit]
|
194 |
with ThreadPoolExecutor(max_workers=cfg.cpu_count) as ex:
|
195 |
args = [(p, cfg.blur_thr) for p in sample]
|
196 |
for p, isb, isd, isB in ex.map(_quality_stat_args, args):
|
|
|
217 |
work_dir=str(TMP_ROOT / "fastdup")
|
218 |
)
|
219 |
fd.run()
|
220 |
+
|
221 |
+
# Try the grouped-DataFrame API first:
|
222 |
try:
|
223 |
cc = fd.connected_components_grouped(sort_by="comp_size", ascending=False)
|
224 |
if "files" in cc.columns:
|
225 |
clusters = cc["files"].tolist()
|
226 |
else:
|
227 |
+
# fallback: group by component ID, collect filenames
|
228 |
+
clusters = (
|
229 |
+
cc.groupby("component")["filename"]
|
230 |
+
.apply(list)
|
231 |
+
.tolist()
|
232 |
+
)
|
233 |
except Exception:
|
234 |
+
# final fallback to the old list-based API
|
235 |
clusters = fd.connected_components()
|
236 |
+
|
237 |
dup = sum(len(c) - 1 for c in clusters)
|
238 |
score = max(0.0, 100 - dup / len(imgs) * 100)
|
239 |
return {
|
|
|
247 |
"score": 100.0,
|
248 |
"details": {"fastdup_error": str(e)}
|
249 |
}
|
250 |
+
return {"name": "Duplicates", "score": 100.0, "details": {"note": "skipped"}}
|
251 |
|
252 |
def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
253 |
model = get_model(cfg.weights)
|
254 |
if model is None:
|
255 |
return {"name":"Model QA","score":100,"details":"skipped"}
|
256 |
ious, mism = [], []
|
257 |
+
sample = imgs[:cfg.sample_limit]
|
258 |
for i in range(0, len(sample), cfg.batch_size):
|
259 |
batch = sample[i:i+cfg.batch_size]
|
260 |
results = model.predict(batch, verbose=False, half=True, dynamic=True)
|
|
|
284 |
if get_noise_indices is None:
|
285 |
return {"name":"Label issues","score":100,"details":"skipped"}
|
286 |
labels, idxs = [], []
|
287 |
+
sample = imgs[:cfg.sample_limit]
|
288 |
for i, p in enumerate(sample):
|
289 |
bs = parse_label_file(lbls[i]) if lbls[i] else []
|
290 |
for cls, *_ in bs:
|
|
|
387 |
yaml_in = gr.File(label="Custom YAML", file_types=['.yaml'])
|
388 |
weights_in = gr.File(label="YOLO weights (.pt)")
|
389 |
with gr.Row():
|
390 |
+
blur_sl = gr.Slider(0.0, 500.0, value=100.0, label="Blur threshold")
|
391 |
iou_sl = gr.Slider(0.0, 1.0, value=0.5, label="IOU threshold")
|
392 |
conf_sl = gr.Slider(0.0, 1.0, value=0.25, label="Min detection confidence")
|
393 |
with gr.Row():
|
|
|
410 |
|
411 |
# Roboflow URLs
|
412 |
if url_txt:
|
413 |
+
for line in Path(url_txt.name).read_text().splitlines():
|
|
|
414 |
if not line.strip():
|
415 |
continue
|
416 |
try:
|
417 |
ds = download_rf_dataset(line, rf, TMP_ROOT)
|
418 |
+
md, df = run_quality(
|
419 |
+
ds, None,
|
420 |
+
Path(weights.name) if weights else None,
|
421 |
+
cfg, run_dup, run_modelqa
|
422 |
+
)
|
423 |
reports.append(md)
|
424 |
dfs.append(df)
|
425 |
except Exception as e:
|
|
|
429 |
if zip_file:
|
430 |
tmp = Path(tempfile.mkdtemp())
|
431 |
shutil.unpack_archive(zip_file.name, tmp)
|
432 |
+
md, df = run_quality(
|
433 |
+
tmp,
|
434 |
+
Path(yaml_file.name) if yaml_file else None,
|
435 |
+
Path(weights.name) if weights else None,
|
436 |
+
cfg, run_dup, run_modelqa
|
437 |
+
)
|
438 |
reports.append(md)
|
439 |
dfs.append(df)
|
440 |
shutil.rmtree(tmp, ignore_errors=True)
|
|
|
442 |
# Server path
|
443 |
if server_path:
|
444 |
ds = Path(server_path)
|
445 |
+
md, df = run_quality(
|
446 |
+
ds,
|
447 |
+
Path(yaml_file.name) if yaml_file else None,
|
448 |
+
Path(weights.name) if weights else None,
|
449 |
+
cfg, run_dup, run_modelqa
|
450 |
+
)
|
451 |
reports.append(md)
|
452 |
dfs.append(df)
|
453 |
|