developer0hye committed on
Commit e85fecb · verified · 1 Parent(s): 8765dbd

Upload 76 files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. src/__init__.py +6 -0
  2. src/core/__init__.py +9 -0
  3. src/core/_config.py +299 -0
  4. src/core/workspace.py +178 -0
  5. src/core/yaml_config.py +187 -0
  6. src/core/yaml_utils.py +126 -0
  7. src/data/__init__.py +20 -0
  8. src/data/_misc.py +62 -0
  9. src/data/dataloader.py +122 -0
  10. src/data/dataset/__init__.py +17 -0
  11. src/data/dataset/_dataset.py +27 -0
  12. src/data/dataset/cifar_dataset.py +25 -0
  13. src/data/dataset/coco_dataset.py +282 -0
  14. src/data/dataset/coco_eval.py +214 -0
  15. src/data/dataset/coco_utils.py +191 -0
  16. src/data/dataset/voc_detection.py +86 -0
  17. src/data/dataset/voc_eval.py +12 -0
  18. src/data/transforms/__init__.py +21 -0
  19. src/data/transforms/_transforms.py +161 -0
  20. src/data/transforms/container.py +99 -0
  21. src/data/transforms/functional.py +172 -0
  22. src/data/transforms/mosaic.py +83 -0
  23. src/data/transforms/presets.py +4 -0
  24. src/misc/__init__.py +9 -0
  25. src/misc/box_ops.py +106 -0
  26. src/misc/dist_utils.py +281 -0
  27. src/misc/lazy_loader.py +70 -0
  28. src/misc/logger.py +255 -0
  29. src/misc/profiler_utils.py +30 -0
  30. src/misc/visualizer.py +121 -0
  31. src/nn/__init__.py +16 -0
  32. src/nn/arch/__init__.py +7 -0
  33. src/nn/arch/classification.py +45 -0
  34. src/nn/arch/yolo.py +42 -0
  35. src/nn/backbone/__init__.py +17 -0
  36. src/nn/backbone/common.py +117 -0
  37. src/nn/backbone/csp_darknet.py +203 -0
  38. src/nn/backbone/csp_resnet.py +302 -0
  39. src/nn/backbone/hgnetv2.py +581 -0
  40. src/nn/backbone/presnet.py +263 -0
  41. src/nn/backbone/test_resnet.py +83 -0
  42. src/nn/backbone/timm_model.py +66 -0
  43. src/nn/backbone/torchvision_model.py +50 -0
  44. src/nn/backbone/utils.py +56 -0
  45. src/nn/criterion/__init__.py +11 -0
  46. src/nn/criterion/det_criterion.py +188 -0
  47. src/nn/postprocessor/__init__.py +6 -0
  48. src/nn/postprocessor/box_revert.py +66 -0
  49. src/nn/postprocessor/detr_postprocessor.py +86 -0
  50. src/nn/postprocessor/nms_postprocessor.py +86 -0
src/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """
2
+ Copyright (c) 2024 The D-FINE Authors. All Rights Reserved.
3
+ """
4
+
5
+ # for register purpose
6
+ from . import data, nn, optim, zoo
src/core/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ from ._config import BaseConfig
7
+ from .workspace import GLOBAL_CONFIG, create, register
8
+ from .yaml_config import YAMLConfig
9
+ from .yaml_utils import *
src/core/_config.py ADDED
@@ -0,0 +1,299 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ from pathlib import Path
7
+ from typing import Callable, Dict, List
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from torch.cuda.amp.grad_scaler import GradScaler
12
+ from torch.optim import Optimizer
13
+ from torch.optim.lr_scheduler import LRScheduler
14
+ from torch.utils.data import DataLoader, Dataset
15
+ from torch.utils.tensorboard import SummaryWriter
16
+
17
+ __all__ = [
18
+ "BaseConfig",
19
+ ]
20
+
21
+
22
+ class BaseConfig(object):
23
+ # TODO property
24
+
25
+ def __init__(self) -> None:
26
+ super().__init__()
27
+
28
+ self.task: str = None
29
+
30
+ # instance / function
31
+ self._model: nn.Module = None
32
+ self._postprocessor: nn.Module = None
33
+ self._criterion: nn.Module = None
34
+ self._optimizer: Optimizer = None
35
+ self._lr_scheduler: LRScheduler = None
36
+ self._lr_warmup_scheduler: LRScheduler = None
37
+ self._train_dataloader: DataLoader = None
38
+ self._val_dataloader: DataLoader = None
39
+ self._ema: nn.Module = None
40
+ self._scaler: GradScaler = None
41
+ self._train_dataset: Dataset = None
42
+ self._val_dataset: Dataset = None
43
+ self._collate_fn: Callable = None
44
+ self._evaluator: Callable[[nn.Module, DataLoader, str],] = None
45
+ self._writer: SummaryWriter = None
46
+
47
+ # dataset
48
+ self.num_workers: int = 0
49
+ self.batch_size: int = None
50
+ self._train_batch_size: int = None
51
+ self._val_batch_size: int = None
52
+ self._train_shuffle: bool = None
53
+ self._val_shuffle: bool = None
54
+
55
+ # runtime
56
+ self.resume: str = None
57
+ self.tuning: str = None
58
+
59
+ self.epochs: int = None
60
+ self.last_epoch: int = -1
61
+
62
+ self.use_amp: bool = False
63
+ self.use_ema: bool = False
64
+ self.ema_decay: float = 0.9999
65
+ self.ema_warmups: int = 2000
66
+ self.sync_bn: bool = False
67
+ self.clip_max_norm: float = 0.0
68
+ self.find_unused_parameters: bool = None
69
+
70
+ self.seed: int = None
71
+ self.print_freq: int = None
72
+ self.checkpoint_freq: int = 1
73
+ self.output_dir: str = None
74
+ self.summary_dir: str = None
75
+ self.device: str = ""
76
+
77
+ @property
78
+ def model(self) -> nn.Module:
79
+ return self._model
80
+
81
+ @model.setter
82
+ def model(self, m):
83
+ assert isinstance(m, nn.Module), f"{type(m)} != nn.Module, please check your model class"
84
+ self._model = m
85
+
86
+ @property
87
+ def postprocessor(self) -> nn.Module:
88
+ return self._postprocessor
89
+
90
+ @postprocessor.setter
91
+ def postprocessor(self, m):
92
+ assert isinstance(m, nn.Module), f"{type(m)} != nn.Module, please check your model class"
93
+ self._postprocessor = m
94
+
95
+ @property
96
+ def criterion(self) -> nn.Module:
97
+ return self._criterion
98
+
99
+ @criterion.setter
100
+ def criterion(self, m):
101
+ assert isinstance(m, nn.Module), f"{type(m)} != nn.Module, please check your model class"
102
+ self._criterion = m
103
+
104
+ @property
105
+ def optimizer(self) -> Optimizer:
106
+ return self._optimizer
107
+
108
+ @optimizer.setter
109
+ def optimizer(self, m):
110
+ assert isinstance(
111
+ m, Optimizer
112
+ ), f"{type(m)} != optim.Optimizer, please check your model class"
113
+ self._optimizer = m
114
+
115
+ @property
116
+ def lr_scheduler(self) -> LRScheduler:
117
+ return self._lr_scheduler
118
+
119
+ @lr_scheduler.setter
120
+ def lr_scheduler(self, m):
121
+ assert isinstance(
122
+ m, LRScheduler
123
+ ), f"{type(m)} != LRScheduler, please check your model class"
124
+ self._lr_scheduler = m
125
+
126
+ @property
127
+ def lr_warmup_scheduler(self) -> LRScheduler:
128
+ return self._lr_warmup_scheduler
129
+
130
+ @lr_warmup_scheduler.setter
131
+ def lr_warmup_scheduler(self, m):
132
+ self._lr_warmup_scheduler = m
133
+
134
+ @property
135
+ def train_dataloader(self) -> DataLoader:
136
+ if self._train_dataloader is None and self.train_dataset is not None:
137
+ loader = DataLoader(
138
+ self.train_dataset,
139
+ batch_size=self.train_batch_size,
140
+ num_workers=self.num_workers,
141
+ collate_fn=self.collate_fn,
142
+ shuffle=self.train_shuffle,
143
+ )
144
+ loader.shuffle = self.train_shuffle
145
+ self._train_dataloader = loader
146
+
147
+ return self._train_dataloader
148
+
149
+ @train_dataloader.setter
150
+ def train_dataloader(self, loader):
151
+ self._train_dataloader = loader
152
+
153
+ @property
154
+ def val_dataloader(self) -> DataLoader:
155
+ if self._val_dataloader is None and self.val_dataset is not None:
156
+ loader = DataLoader(
157
+ self.val_dataset,
158
+ batch_size=self.val_batch_size,
159
+ num_workers=self.num_workers,
160
+ drop_last=False,
161
+ collate_fn=self.collate_fn,
162
+ shuffle=self.val_shuffle,
163
+ persistent_workers=True,
164
+ )
165
+ loader.shuffle = self.val_shuffle
166
+ self._val_dataloader = loader
167
+
168
+ return self._val_dataloader
169
+
170
+ @val_dataloader.setter
171
+ def val_dataloader(self, loader):
172
+ self._val_dataloader = loader
173
+
174
+ @property
175
+ def ema(self) -> nn.Module:
176
+ if self._ema is None and self.use_ema and self.model is not None:
177
+ from ..optim import ModelEMA
178
+
179
+ self._ema = ModelEMA(self.model, self.ema_decay, self.ema_warmups)
180
+ return self._ema
181
+
182
+ @ema.setter
183
+ def ema(self, obj):
184
+ self._ema = obj
185
+
186
+ @property
187
+ def scaler(self) -> GradScaler:
188
+ if self._scaler is None and self.use_amp and torch.cuda.is_available():
189
+ self._scaler = GradScaler()
190
+ return self._scaler
191
+
192
+ @scaler.setter
193
+ def scaler(self, obj: GradScaler):
194
+ self._scaler = obj
195
+
196
+ @property
197
+ def val_shuffle(self) -> bool:
198
+ if self._val_shuffle is None:
199
+ print("warning: set default val_shuffle=False")
200
+ return False
201
+ return self._val_shuffle
202
+
203
+ @val_shuffle.setter
204
+ def val_shuffle(self, shuffle):
205
+ assert isinstance(shuffle, bool), "shuffle must be bool"
206
+ self._val_shuffle = shuffle
207
+
208
+ @property
209
+ def train_shuffle(self) -> bool:
210
+ if self._train_shuffle is None:
211
+ print("warning: set default train_shuffle=True")
212
+ return True
213
+ return self._train_shuffle
214
+
215
+ @train_shuffle.setter
216
+ def train_shuffle(self, shuffle):
217
+ assert isinstance(shuffle, bool), "shuffle must be bool"
218
+ self._train_shuffle = shuffle
219
+
220
+ @property
221
+ def train_batch_size(self) -> int:
222
+ if self._train_batch_size is None and isinstance(self.batch_size, int):
223
+ print(f"warning: set train_batch_size=batch_size={self.batch_size}")
224
+ return self.batch_size
225
+ return self._train_batch_size
226
+
227
+ @train_batch_size.setter
228
+ def train_batch_size(self, batch_size):
229
+ assert isinstance(batch_size, int), "batch_size must be int"
230
+ self._train_batch_size = batch_size
231
+
232
+ @property
233
+ def val_batch_size(self) -> int:
234
+ if self._val_batch_size is None:
235
+ print(f"warning: set val_batch_size=batch_size={self.batch_size}")
236
+ return self.batch_size
237
+ return self._val_batch_size
238
+
239
+ @val_batch_size.setter
240
+ def val_batch_size(self, batch_size):
241
+ assert isinstance(batch_size, int), "batch_size must be int"
242
+ self._val_batch_size = batch_size
243
+
244
+ @property
245
+ def train_dataset(self) -> Dataset:
246
+ return self._train_dataset
247
+
248
+ @train_dataset.setter
249
+ def train_dataset(self, dataset):
250
+ assert isinstance(dataset, Dataset), f"{type(dataset)} must be Dataset"
251
+ self._train_dataset = dataset
252
+
253
+ @property
254
+ def val_dataset(self) -> Dataset:
255
+ return self._val_dataset
256
+
257
+ @val_dataset.setter
258
+ def val_dataset(self, dataset):
259
+ assert isinstance(dataset, Dataset), f"{type(dataset)} must be Dataset"
260
+ self._val_dataset = dataset
261
+
262
+ @property
263
+ def collate_fn(self) -> Callable:
264
+ return self._collate_fn
265
+
266
+ @collate_fn.setter
267
+ def collate_fn(self, fn):
268
+ assert isinstance(fn, Callable), f"{type(fn)} must be Callable"
269
+ self._collate_fn = fn
270
+
271
+ @property
272
+ def evaluator(self) -> Callable:
273
+ return self._evaluator
274
+
275
+ @evaluator.setter
276
+ def evaluator(self, fn):
277
+ assert isinstance(fn, Callable), f"{type(fn)} must be Callable"
278
+ self._evaluator = fn
279
+
280
+ @property
281
+ def writer(self) -> SummaryWriter:
282
+ if self._writer is None:
283
+ if self.summary_dir:
284
+ self._writer = SummaryWriter(self.summary_dir)
285
+ elif self.output_dir:
286
+ self._writer = SummaryWriter(Path(self.output_dir) / "summary")
287
+ return self._writer
288
+
289
+ @writer.setter
290
+ def writer(self, m):
291
+ assert isinstance(m, SummaryWriter), f"{type(m)} must be SummaryWriter"
292
+ self._writer = m
293
+
294
+ def __repr__(self):
295
+ s = ""
296
+ for k, v in self.__dict__.items():
297
+ if not k.startswith("_"):
298
+ s += f"{k}: {v}\n"
299
+ return s
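For illustration, a minimal sketch of how the lazy properties of BaseConfig behave. The toy dataset and values below are placeholders and not part of this commit; the package is assumed to be importable as src:

import torch
from torch.utils.data import TensorDataset, default_collate
from src.core import BaseConfig

cfg = BaseConfig()
cfg.train_dataset = TensorDataset(torch.rand(8, 3, 32, 32), torch.zeros(8, dtype=torch.long))
cfg.batch_size = 4                    # train_batch_size falls back to this (with a printed warning)
cfg.collate_fn = default_collate

loader = cfg.train_dataloader         # built lazily on first access; train_shuffle defaults to True
print(loader.batch_size)              # 4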
src/core/workspace.py ADDED
@@ -0,0 +1,178 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ import functools
7
+ import importlib
8
+ import inspect
9
+ from collections import defaultdict
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ GLOBAL_CONFIG = defaultdict(dict)
13
+
14
+
15
+ def register(dct: Any = GLOBAL_CONFIG, name=None, force=False):
16
+ """
17
+ dct:
18
+ if dct is Dict, register foo into dct as key-value pair
19
+ if dct is Clas, register as modules attibute
20
+ force
21
+ whether force register.
22
+ """
23
+
24
+ def decorator(foo):
25
+ register_name = foo.__name__ if name is None else name
26
+ if not force:
27
+ if inspect.isclass(dct):
28
+ assert not hasattr(dct, foo.__name__), f"module {dct.__name__} has {foo.__name__}"
29
+ else:
30
+ assert foo.__name__ not in dct, f"{foo.__name__} has been already registered"
31
+
32
+ if inspect.isfunction(foo):
33
+
34
+ @functools.wraps(foo)
35
+ def wrap_func(*args, **kwargs):
36
+ return foo(*args, **kwargs)
37
+
38
+ if isinstance(dct, dict):
39
+ dct[foo.__name__] = wrap_func
40
+ elif inspect.isclass(dct):
41
+ setattr(dct, foo.__name__, wrap_func)
42
+ else:
43
+ raise AttributeError("")
44
+ return wrap_func
45
+
46
+ elif inspect.isclass(foo):
47
+ dct[register_name] = extract_schema(foo)
48
+
49
+ else:
50
+ raise ValueError(f"Do not support {type(foo)} register")
51
+
52
+ return foo
53
+
54
+ return decorator
55
+
56
+
57
+ def extract_schema(module: type):
58
+ """
59
+ Args:
60
+ module (type),
61
+ Return:
62
+ Dict,
63
+ """
64
+ argspec = inspect.getfullargspec(module.__init__)
65
+ arg_names = [arg for arg in argspec.args if arg != "self"]
66
+ num_defualts = len(argspec.defaults) if argspec.defaults is not None else 0
67
+ num_requires = len(arg_names) - num_defualts
68
+
69
+ schame = dict()
70
+ schame["_name"] = module.__name__
71
+ schame["_pymodule"] = importlib.import_module(module.__module__)
72
+ schame["_inject"] = getattr(module, "__inject__", [])
73
+ schame["_share"] = getattr(module, "__share__", [])
74
+ schame["_kwargs"] = {}
75
+ for i, name in enumerate(arg_names):
76
+ if name in schame["_share"]:
77
+ assert i >= num_requires, "share config must have default value."
78
+ value = argspec.defaults[i - num_requires]
79
+
80
+ elif i >= num_requires:
81
+ value = argspec.defaults[i - num_requires]
82
+
83
+ else:
84
+ value = None
85
+
86
+ schame[name] = value
87
+ schame["_kwargs"][name] = value
88
+
89
+ return schame
90
+
91
+
92
+ def create(type_or_name, global_cfg=GLOBAL_CONFIG, **kwargs):
93
+ """ """
94
+ assert type(type_or_name) in (type, str), "create should be modules or name."
95
+
96
+ name = type_or_name if isinstance(type_or_name, str) else type_or_name.__name__
97
+
98
+ if name in global_cfg:
99
+ if hasattr(global_cfg[name], "__dict__"):
100
+ return global_cfg[name]
101
+ else:
102
+ raise ValueError("The module {} is not registered".format(name))
103
+
104
+ cfg = global_cfg[name]
105
+
106
+ if isinstance(cfg, dict) and "type" in cfg:
107
+ _cfg: dict = global_cfg[cfg["type"]]
108
+ # clean args
109
+ _keys = [k for k in _cfg.keys() if not k.startswith("_")]
110
+ for _arg in _keys:
111
+ del _cfg[_arg]
112
+ _cfg.update(_cfg["_kwargs"]) # restore default args
113
+ _cfg.update(cfg) # load config args
114
+ _cfg.update(kwargs) # TODO recive extra kwargs
115
+ name = _cfg.pop("type") # pop extra key `type` (from cfg)
116
+
117
+ return create(name, global_cfg)
118
+
119
+ module = getattr(cfg["_pymodule"], name)
120
+ module_kwargs = {}
121
+ module_kwargs.update(cfg)
122
+
123
+ # shared var
124
+ for k in cfg["_share"]:
125
+ if k in global_cfg:
126
+ module_kwargs[k] = global_cfg[k]
127
+ else:
128
+ module_kwargs[k] = cfg[k]
129
+
130
+ # inject
131
+ for k in cfg["_inject"]:
132
+ _k = cfg[k]
133
+
134
+ if _k is None:
135
+ continue
136
+
137
+ if isinstance(_k, str):
138
+ if _k not in global_cfg:
139
+ raise ValueError(f"Missing inject config of {_k}.")
140
+
141
+ _cfg = global_cfg[_k]
142
+
143
+ if isinstance(_cfg, dict):
144
+ module_kwargs[k] = create(_cfg["_name"], global_cfg)
145
+ else:
146
+ module_kwargs[k] = _cfg
147
+
148
+ elif isinstance(_k, dict):
149
+ if "type" not in _k.keys():
150
+ raise ValueError("Missing inject for `type` style.")
151
+
152
+ _type = str(_k["type"])
153
+ if _type not in global_cfg:
154
+ raise ValueError(f"Missing {_type} in inspect stage.")
155
+
156
+ # TODO
157
+ _cfg: dict = global_cfg[_type]
158
+ # clean args
159
+ _keys = [k for k in _cfg.keys() if not k.startswith("_")]
160
+ for _arg in _keys:
161
+ del _cfg[_arg]
162
+ _cfg.update(_cfg["_kwargs"]) # restore default values
163
+ _cfg.update(_k) # load config args
164
+ name = _cfg.pop("type") # pop extra key (`type` from _k)
165
+ module_kwargs[k] = create(name, global_cfg)
166
+
167
+ else:
168
+ raise ValueError(f"Inject does not support {_k}")
169
+
170
+ # TODO hard code
171
+ module_kwargs = {k: v for k, v in module_kwargs.items() if not k.startswith("_")}
172
+
173
+ # TODO for **kwargs
174
+ # extra_args = set(module_kwargs.keys()) - set(arg_names)
175
+ # if len(extra_args) > 0:
176
+ # raise RuntimeError(f'Error: unknown args {extra_args} for {module}')
177
+
178
+ return module(**module_kwargs)
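As a usage sketch of the register/create pair above: registering a class records a schema of its constructor defaults in GLOBAL_CONFIG, and create() instantiates it from that schema. The TinyHead class below is hypothetical and only for illustration:

import torch.nn as nn
from src.core import GLOBAL_CONFIG, create, register

@register()
class TinyHead(nn.Module):                      # hypothetical module, not part of this commit
    def __init__(self, hidden_dim=256, num_classes=80):
        super().__init__()
        self.proj = nn.Linear(hidden_dim, num_classes)

print(GLOBAL_CONFIG["TinyHead"]["_kwargs"])     # {'hidden_dim': 256, 'num_classes': 80}
head = create("TinyHead")                       # instantiated with the recorded defaults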
src/core/yaml_config.py ADDED
@@ -0,0 +1,187 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ import copy
7
+ import re
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.optim as optim
12
+ from torch.utils.data import DataLoader
13
+
14
+ from ._config import BaseConfig
15
+ from .workspace import create
16
+ from .yaml_utils import load_config, merge_config, merge_dict
17
+
18
+
19
+ class YAMLConfig(BaseConfig):
20
+ def __init__(self, cfg_path: str, **kwargs) -> None:
21
+ super().__init__()
22
+
23
+ cfg = load_config(cfg_path)
24
+ cfg = merge_dict(cfg, kwargs)
25
+
26
+ self.yaml_cfg = copy.deepcopy(cfg)
27
+
28
+ for k in super().__dict__:
29
+ if not k.startswith("_") and k in cfg:
30
+ self.__dict__[k] = cfg[k]
31
+
32
+ @property
33
+ def global_cfg(self):
34
+ return merge_config(self.yaml_cfg, inplace=False, overwrite=False)
35
+
36
+ @property
37
+ def model(self) -> torch.nn.Module:
38
+ if self._model is None and "model" in self.yaml_cfg:
39
+ self._model = create(self.yaml_cfg["model"], self.global_cfg)
40
+ return super().model
41
+
42
+ @property
43
+ def postprocessor(self) -> torch.nn.Module:
44
+ if self._postprocessor is None and "postprocessor" in self.yaml_cfg:
45
+ self._postprocessor = create(self.yaml_cfg["postprocessor"], self.global_cfg)
46
+ return super().postprocessor
47
+
48
+ @property
49
+ def criterion(self) -> torch.nn.Module:
50
+ if self._criterion is None and "criterion" in self.yaml_cfg:
51
+ self._criterion = create(self.yaml_cfg["criterion"], self.global_cfg)
52
+ return super().criterion
53
+
54
+ @property
55
+ def optimizer(self) -> optim.Optimizer:
56
+ if self._optimizer is None and "optimizer" in self.yaml_cfg:
57
+ params = self.get_optim_params(self.yaml_cfg["optimizer"], self.model)
58
+ self._optimizer = create("optimizer", self.global_cfg, params=params)
59
+ return super().optimizer
60
+
61
+ @property
62
+ def lr_scheduler(self) -> optim.lr_scheduler.LRScheduler:
63
+ if self._lr_scheduler is None and "lr_scheduler" in self.yaml_cfg:
64
+ self._lr_scheduler = create("lr_scheduler", self.global_cfg, optimizer=self.optimizer)
65
+ print(f"Initial lr: {self._lr_scheduler.get_last_lr()}")
66
+ return super().lr_scheduler
67
+
68
+ @property
69
+ def lr_warmup_scheduler(self) -> optim.lr_scheduler.LRScheduler:
70
+ if self._lr_warmup_scheduler is None and "lr_warmup_scheduler" in self.yaml_cfg:
71
+ self._lr_warmup_scheduler = create(
72
+ "lr_warmup_scheduler", self.global_cfg, lr_scheduler=self.lr_scheduler
73
+ )
74
+ return super().lr_warmup_scheduler
75
+
76
+ @property
77
+ def train_dataloader(self) -> DataLoader:
78
+ if self._train_dataloader is None and "train_dataloader" in self.yaml_cfg:
79
+ self._train_dataloader = self.build_dataloader("train_dataloader")
80
+ return super().train_dataloader
81
+
82
+ @property
83
+ def val_dataloader(self) -> DataLoader:
84
+ if self._val_dataloader is None and "val_dataloader" in self.yaml_cfg:
85
+ self._val_dataloader = self.build_dataloader("val_dataloader")
86
+ return super().val_dataloader
87
+
88
+ @property
89
+ def ema(self) -> torch.nn.Module:
90
+ if self._ema is None and self.yaml_cfg.get("use_ema", False):
91
+ self._ema = create("ema", self.global_cfg, model=self.model)
92
+ return super().ema
93
+
94
+ @property
95
+ def scaler(self):
96
+ if self._scaler is None and self.yaml_cfg.get("use_amp", False):
97
+ self._scaler = create("scaler", self.global_cfg)
98
+ return super().scaler
99
+
100
+ @property
101
+ def evaluator(self):
102
+ if self._evaluator is None and "evaluator" in self.yaml_cfg:
103
+ if self.yaml_cfg["evaluator"]["type"] == "CocoEvaluator":
104
+ from ..data import get_coco_api_from_dataset
105
+
106
+ base_ds = get_coco_api_from_dataset(self.val_dataloader.dataset)
107
+ self._evaluator = create("evaluator", self.global_cfg, coco_gt=base_ds)
108
+ else:
109
+ raise NotImplementedError(f"{self.yaml_cfg['evaluator']['type']}")
110
+ return super().evaluator
111
+
112
+ @property
113
+ def use_wandb(self) -> bool:
114
+ return self.yaml_cfg.get("use_wandb", False)
115
+
116
+ @staticmethod
117
+ def get_optim_params(cfg: dict, model: nn.Module):
118
+ """
119
+ E.g.:
120
+ ^(?=.*a)(?=.*b).*$ means including a and b
121
+ ^(?=.*(?:a|b)).*$ means including a or b
122
+ ^(?=.*a)(?!.*b).*$ means including a, but not b
123
+ """
124
+ assert "type" in cfg, ""
125
+ cfg = copy.deepcopy(cfg)
126
+
127
+ if "params" not in cfg:
128
+ return model.parameters()
129
+
130
+ assert isinstance(cfg["params"], list), ""
131
+
132
+ param_groups = []
133
+ visited = []
134
+ for pg in cfg["params"]:
135
+ pattern = pg["params"]
136
+ params = {
137
+ k: v
138
+ for k, v in model.named_parameters()
139
+ if v.requires_grad and len(re.findall(pattern, k)) > 0
140
+ }
141
+ pg["params"] = params.values()
142
+ param_groups.append(pg)
143
+ visited.extend(list(params.keys()))
144
+ # print(params.keys())
145
+
146
+ names = [k for k, v in model.named_parameters() if v.requires_grad]
147
+
148
+ if len(visited) < len(names):
149
+ unseen = set(names) - set(visited)
150
+ params = {k: v for k, v in model.named_parameters() if v.requires_grad and k in unseen}
151
+ param_groups.append({"params": params.values()})
152
+ visited.extend(list(params.keys()))
153
+ # print(params.keys())
154
+
155
+ assert len(visited) == len(names), ""
156
+
157
+ return param_groups
158
+
159
+ @staticmethod
160
+ def get_rank_batch_size(cfg):
161
+ """compute batch size for per rank if total_batch_size is provided."""
162
+ assert ("total_batch_size" in cfg or "batch_size" in cfg) and not (
163
+ "total_batch_size" in cfg and "batch_size" in cfg
164
+ ), "`batch_size` or `total_batch_size` should be choosed one"
165
+
166
+ total_batch_size = cfg.get("total_batch_size", None)
167
+ if total_batch_size is None:
168
+ bs = cfg.get("batch_size")
169
+ else:
170
+ from ..misc import dist_utils
171
+
172
+ assert (
173
+ total_batch_size % dist_utils.get_world_size() == 0
174
+ ), "total_batch_size should be divisible by world size"
175
+ bs = total_batch_size // dist_utils.get_world_size()
176
+ return bs
177
+
178
+ def build_dataloader(self, name: str):
179
+ bs = self.get_rank_batch_size(self.yaml_cfg[name])
180
+ global_cfg = self.global_cfg
181
+ if "total_batch_size" in global_cfg[name]:
182
+ # pop unexpected key for dataloader init
183
+ _ = global_cfg[name].pop("total_batch_size")
184
+ print(f"building {name} with batch_size={bs}...")
185
+ loader = create(name, global_cfg, batch_size=bs)
186
+ loader.shuffle = self.yaml_cfg[name].get("shuffle", False)
187
+ return loader
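A typical entry point is to build everything lazily from a YAML file through the properties above; the config path and the override below are illustrative placeholders:

from src.core import YAMLConfig

cfg = YAMLConfig("configs/dfine/dfine_hgnetv2_l_coco.yml", use_amp=True)   # placeholder path

model = cfg.model                    # created from yaml_cfg["model"] via the workspace registry
optimizer = cfg.optimizer            # parameter groups resolved by get_optim_params
train_loader = cfg.train_dataloader  # per-rank batch size resolved by get_rank_batch_size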
src/core/yaml_utils.py ADDED
@@ -0,0 +1,126 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ import copy
7
+ import os
8
+ from typing import Any, Dict, List, Optional
9
+
10
+ import yaml
11
+
12
+ from .workspace import GLOBAL_CONFIG
13
+
14
+ __all__ = [
15
+ "load_config",
16
+ "merge_config",
17
+ "merge_dict",
18
+ "parse_cli",
19
+ ]
20
+
21
+
22
+ INCLUDE_KEY = "__include__"
23
+
24
+
25
+ def load_config(file_path, cfg=dict()):
26
+ """load config"""
27
+ _, ext = os.path.splitext(file_path)
28
+ assert ext in [".yml", ".yaml"], "only support yaml files"
29
+
30
+ with open(file_path) as f:
31
+ file_cfg = yaml.load(f, Loader=yaml.Loader)
32
+ if file_cfg is None:
33
+ return {}
34
+
35
+ if INCLUDE_KEY in file_cfg:
36
+ base_yamls = list(file_cfg[INCLUDE_KEY])
37
+ for base_yaml in base_yamls:
38
+ if base_yaml.startswith("~"):
39
+ base_yaml = os.path.expanduser(base_yaml)
40
+
41
+ if not base_yaml.startswith("/"):
42
+ base_yaml = os.path.join(os.path.dirname(file_path), base_yaml)
43
+
44
+ with open(base_yaml) as f:
45
+ base_cfg = load_config(base_yaml, cfg)
46
+ merge_dict(cfg, base_cfg)
47
+
48
+ return merge_dict(cfg, file_cfg)
49
+
50
+
51
+ def merge_dict(dct, another_dct, inplace=True) -> Dict:
52
+ """merge another_dct into dct"""
53
+
54
+ def _merge(dct, another) -> Dict:
55
+ for k in another:
56
+ if k in dct and isinstance(dct[k], dict) and isinstance(another[k], dict):
57
+ _merge(dct[k], another[k])
58
+ else:
59
+ dct[k] = another[k]
60
+
61
+ return dct
62
+
63
+ if not inplace:
64
+ dct = copy.deepcopy(dct)
65
+
66
+ return _merge(dct, another_dct)
67
+
68
+
69
+ def dictify(s: str, v: Any) -> Dict:
70
+ if "." not in s:
71
+ return {s: v}
72
+ key, rest = s.split(".", 1)
73
+ return {key: dictify(rest, v)}
74
+
75
+
76
+ def parse_cli(nargs: List[str]) -> Dict:
77
+ """
78
+ parse command-line arguments
79
+ convert `a.c=3 b=10` to `{'a': {'c': 3}, 'b': 10}`
80
+ """
81
+ cfg = {}
82
+ if nargs is None or len(nargs) == 0:
83
+ return cfg
84
+
85
+ for s in nargs:
86
+ s = s.strip()
87
+ k, v = s.split("=", 1)
88
+ d = dictify(k, yaml.load(v, Loader=yaml.Loader))
89
+ cfg = merge_dict(cfg, d)
90
+
91
+ return cfg
92
+
93
+
94
+ def merge_config(cfg, another_cfg=GLOBAL_CONFIG, inplace: bool = False, overwrite: bool = False):
95
+ """
96
+ Merge another_cfg into cfg, return the merged config
97
+
98
+ Example:
99
+
100
+ cfg1 = load_config('./dfine_r18vd_6x_coco.yml')
101
+ cfg1 = merge_config(cfg, inplace=True)
102
+
103
+ cfg2 = load_config('./dfine_r50vd_6x_coco.yml')
104
+ cfg2 = merge_config(cfg2, inplace=True)
105
+
106
+ model1 = create(cfg1['model'], cfg1)
107
+ model2 = create(cfg2['model'], cfg2)
108
+ """
109
+
110
+ def _merge(dct, another):
111
+ for k in another:
112
+ if k not in dct:
113
+ dct[k] = another[k]
114
+
115
+ elif isinstance(dct[k], dict) and isinstance(another[k], dict):
116
+ _merge(dct[k], another[k])
117
+
118
+ elif overwrite:
119
+ dct[k] = another[k]
120
+
121
+ return cfg
122
+
123
+ if not inplace:
124
+ cfg = copy.deepcopy(cfg)
125
+
126
+ return _merge(cfg, another_cfg)
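The helpers above compose as the parse_cli docstring suggests; a small sketch (the keys are made up for illustration):

from src.core.yaml_utils import merge_dict, parse_cli

overrides = parse_cli(["optimizer.lr=0.0001", "use_amp=True"])
# {'optimizer': {'lr': 0.0001}, 'use_amp': True}

base = {"optimizer": {"lr": 0.01, "weight_decay": 0.0001}}
merged = merge_dict(base, overrides)
# {'optimizer': {'lr': 0.0001, 'weight_decay': 0.0001}, 'use_amp': True}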
src/data/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ from ._misc import convert_to_tv_tensor
7
+ from .dataloader import *
8
+ from .dataset import *
9
+ from .transforms import *
10
+
11
+
12
+ # def set_epoch(self, epoch) -> None:
13
+ # self.epoch = epoch
14
+ # def _set_epoch_func(datasets):
15
+ # """Add `set_epoch` for datasets
16
+ # """
17
+ # from ..core import register
18
+ # for ds in datasets:
19
+ # register(ds)(set_epoch)
20
+ # _set_epoch_func([CIFAR10, VOCDetection, CocoDetection])
src/data/_misc.py ADDED
@@ -0,0 +1,62 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ import importlib.metadata
7
+
8
+ from torch import Tensor
9
+
10
+ if "0.15.2" in importlib.metadata.version("torchvision"):
11
+ import torchvision
12
+
13
+ torchvision.disable_beta_transforms_warning()
14
+
15
+ from torchvision.datapoints import BoundingBox as BoundingBoxes
16
+ from torchvision.datapoints import BoundingBoxFormat, Image, Mask, Video
17
+ from torchvision.transforms.v2 import SanitizeBoundingBox as SanitizeBoundingBoxes
18
+
19
+ _boxes_keys = ["format", "spatial_size"]
20
+
21
+ elif "0.17" > importlib.metadata.version("torchvision") >= "0.16":
22
+ import torchvision
23
+
24
+ torchvision.disable_beta_transforms_warning()
25
+
26
+ from torchvision.transforms.v2 import SanitizeBoundingBoxes
27
+ from torchvision.tv_tensors import BoundingBoxes, BoundingBoxFormat, Image, Mask, Video
28
+
29
+ _boxes_keys = ["format", "canvas_size"]
30
+
31
+ elif importlib.metadata.version("torchvision") >= "0.17":
32
+ import torchvision
33
+ from torchvision.transforms.v2 import SanitizeBoundingBoxes
34
+ from torchvision.tv_tensors import BoundingBoxes, BoundingBoxFormat, Image, Mask, Video
35
+
36
+ _boxes_keys = ["format", "canvas_size"]
37
+
38
+ else:
39
+ raise RuntimeError("Please make sure torchvision version >= 0.15.2")
40
+
41
+
42
+ def convert_to_tv_tensor(tensor: Tensor, key: str, box_format="xyxy", spatial_size=None) -> Tensor:
43
+ """
44
+ Args:
45
+ tensor (Tensor): input tensor
46
+ key (str): transform to key
47
+
48
+ Return:
49
+ Dict[str, TV_Tensor]
50
+ """
51
+ assert key in (
52
+ "boxes",
53
+ "masks",
54
+ ), "Only support 'boxes' and 'masks'"
55
+
56
+ if key == "boxes":
57
+ box_format = getattr(BoundingBoxFormat, box_format.upper())
58
+ _kwargs = dict(zip(_boxes_keys, [box_format, spatial_size]))
59
+ return BoundingBoxes(tensor, **_kwargs)
60
+
61
+ if key == "masks":
62
+ return Mask(tensor)
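A short sketch of convert_to_tv_tensor; depending on the installed torchvision version, the second box field is named spatial_size or canvas_size, which the module resolves through _boxes_keys:

import torch
from src.data import convert_to_tv_tensor

boxes = torch.tensor([[10.0, 20.0, 50.0, 80.0]])    # a single xyxy box
tv_boxes = convert_to_tv_tensor(boxes, key="boxes", box_format="xyxy", spatial_size=(480, 640))
# tv_boxes is a BoundingBoxes tv_tensor carrying its format and image size alongside the data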
src/data/dataloader.py ADDED
@@ -0,0 +1,122 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ import random
7
+ from functools import partial
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import torch.utils.data as data
12
+ import torchvision
13
+ import torchvision.transforms.v2 as VT
14
+ from torch.utils.data import default_collate
15
+ from torchvision.transforms.v2 import InterpolationMode
16
+ from torchvision.transforms.v2 import functional as VF
17
+
18
+ from ..core import register
19
+
20
+ torchvision.disable_beta_transforms_warning()
21
+
22
+
23
+ __all__ = [
24
+ "DataLoader",
25
+ "BaseCollateFunction",
26
+ "BatchImageCollateFunction",
27
+ "batch_image_collate_fn",
28
+ ]
29
+
30
+
31
+ @register()
32
+ class DataLoader(data.DataLoader):
33
+ __inject__ = ["dataset", "collate_fn"]
34
+
35
+ def __repr__(self) -> str:
36
+ format_string = self.__class__.__name__ + "("
37
+ for n in ["dataset", "batch_size", "num_workers", "drop_last", "collate_fn"]:
38
+ format_string += "\n"
39
+ format_string += " {0}: {1}".format(n, getattr(self, n))
40
+ format_string += "\n)"
41
+ return format_string
42
+
43
+ def set_epoch(self, epoch):
44
+ self._epoch = epoch
45
+ self.dataset.set_epoch(epoch)
46
+ self.collate_fn.set_epoch(epoch)
47
+
48
+ @property
49
+ def epoch(self):
50
+ return self._epoch if hasattr(self, "_epoch") else -1
51
+
52
+ @property
53
+ def shuffle(self):
54
+ return self._shuffle
55
+
56
+ @shuffle.setter
57
+ def shuffle(self, shuffle):
58
+ assert isinstance(shuffle, bool), "shuffle must be a boolean"
59
+ self._shuffle = shuffle
60
+
61
+
62
+ @register()
63
+ def batch_image_collate_fn(items):
64
+ """only batch image"""
65
+ return torch.cat([x[0][None] for x in items], dim=0), [x[1] for x in items]
66
+
67
+
68
+ class BaseCollateFunction(object):
69
+ def set_epoch(self, epoch):
70
+ self._epoch = epoch
71
+
72
+ @property
73
+ def epoch(self):
74
+ return self._epoch if hasattr(self, "_epoch") else -1
75
+
76
+ def __call__(self, items):
77
+ raise NotImplementedError("")
78
+
79
+
80
+ def generate_scales(base_size, base_size_repeat):
81
+ scale_repeat = (base_size - int(base_size * 0.75 / 32) * 32) // 32
82
+ scales = [int(base_size * 0.75 / 32) * 32 + i * 32 for i in range(scale_repeat)]
83
+ scales += [base_size] * base_size_repeat
84
+ scales += [int(base_size * 1.25 / 32) * 32 - i * 32 for i in range(scale_repeat)]
85
+ return scales
86
+
87
+
88
+ @register()
89
+ class BatchImageCollateFunction(BaseCollateFunction):
90
+ def __init__(
91
+ self,
92
+ stop_epoch=None,
93
+ ema_restart_decay=0.9999,
94
+ base_size=640,
95
+ base_size_repeat=None,
96
+ ) -> None:
97
+ super().__init__()
98
+ self.base_size = base_size
99
+ self.scales = (
100
+ generate_scales(base_size, base_size_repeat) if base_size_repeat is not None else None
101
+ )
102
+ self.stop_epoch = stop_epoch if stop_epoch is not None else 100000000
103
+ self.ema_restart_decay = ema_restart_decay
104
+ # self.interpolation = interpolation
105
+
106
+ def __call__(self, items):
107
+ images = torch.cat([x[0][None] for x in items], dim=0)
108
+ targets = [x[1] for x in items]
109
+
110
+ if self.scales is not None and self.epoch < self.stop_epoch:
111
+ # sz = random.choice(self.scales)
112
+ # sz = [sz] if isinstance(sz, int) else list(sz)
113
+ # VF.resize(inpt, sz, interpolation=self.interpolation)
114
+
115
+ sz = random.choice(self.scales)
116
+ images = F.interpolate(images, size=sz)
117
+ if "masks" in targets[0]:
118
+ for tg in targets:
119
+ tg["masks"] = F.interpolate(tg["masks"], size=sz, mode="nearest")
120
+ raise NotImplementedError("")
121
+
122
+ return images, targets
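For illustration, a sketch of multi-scale collation with BatchImageCollateFunction; the toy dataset and the numeric choices are placeholders. generate_scales(640, 3) yields multiples of 32 between 480 and 800, with the base size 640 repeated three times:

import torch
import torch.utils.data as data
from src.data import BatchImageCollateFunction

class ToyDetDataset(data.Dataset):                   # placeholder dataset of (image, target) pairs
    def __len__(self):
        return 16
    def __getitem__(self, i):
        return torch.rand(3, 640, 640), {"boxes": torch.zeros(0, 4), "labels": torch.zeros(0, dtype=torch.long)}

collate = BatchImageCollateFunction(base_size=640, base_size_repeat=3, stop_epoch=72)
collate.set_epoch(0)                                 # below stop_epoch, so each batch is resized to a random scale
loader = data.DataLoader(ToyDetDataset(), batch_size=4, collate_fn=collate)
images, targets = next(iter(loader))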
src/data/dataset/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ # from ._dataset import DetDataset
7
+ from .cifar_dataset import CIFAR10
8
+ from .coco_dataset import (
9
+ CocoDetection,
10
+ mscoco_category2label,
11
+ mscoco_category2name,
12
+ mscoco_label2category,
13
+ )
14
+ from .coco_eval import CocoEvaluator
15
+ from .coco_utils import get_coco_api_from_dataset
16
+ from .voc_detection import VOCDetection
17
+ from .voc_eval import VOCEvaluator
src/data/dataset/_dataset.py ADDED
@@ -0,0 +1,27 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ import torch
7
+ import torch.utils.data as data
8
+
9
+
10
+ class DetDataset(data.Dataset):
11
+ def __getitem__(self, index):
12
+ img, target = self.load_item(index)
13
+ if self.transforms is not None:
14
+ img, target, _ = self.transforms(img, target, self)
15
+ return img, target
16
+
17
+ def load_item(self, index):
18
+ raise NotImplementedError(
19
+ "Please implement this function to return item before `transforms`."
20
+ )
21
+
22
+ def set_epoch(self, epoch) -> None:
23
+ self._epoch = epoch
24
+
25
+ @property
26
+ def epoch(self):
27
+ return self._epoch if hasattr(self, "_epoch") else -1
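The DetDataset base above only fixes the load_item / __getitem__ contract; a hypothetical subclass (for illustration only) could look like this:

class ListDetDataset(DetDataset):
    def __init__(self, samples, transforms=None):
        self.samples = samples              # list of (image, target) pairs prepared elsewhere
        self.transforms = transforms

    def __len__(self):
        return len(self.samples)

    def load_item(self, index):
        # return the raw pair; DetDataset.__getitem__ applies self.transforms afterwards
        return self.samples[index]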
src/data/dataset/cifar_dataset.py ADDED
@@ -0,0 +1,25 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ from typing import Callable, Optional
7
+
8
+ import torchvision
9
+
10
+ from ...core import register
11
+
12
+
13
+ @register()
14
+ class CIFAR10(torchvision.datasets.CIFAR10):
15
+ __inject__ = ["transform", "target_transform"]
16
+
17
+ def __init__(
18
+ self,
19
+ root: str,
20
+ train: bool = True,
21
+ transform: Optional[Callable] = None,
22
+ target_transform: Optional[Callable] = None,
23
+ download: bool = False,
24
+ ) -> None:
25
+ super().__init__(root, train, transform, target_transform, download)
src/data/dataset/coco_dataset.py ADDED
@@ -0,0 +1,282 @@
1
+ """
2
+ Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
3
+ Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
4
+
5
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
6
+ """
7
+
8
+ import faster_coco_eval
9
+ import faster_coco_eval.core.mask as coco_mask
10
+ import torch
11
+ import torch.utils.data
12
+ import torchvision
13
+ import os
14
+ from PIL import Image
15
+
16
+ from ...core import register
17
+ from .._misc import convert_to_tv_tensor
18
+ from ._dataset import DetDataset
19
+
20
+ torchvision.disable_beta_transforms_warning()
21
+ faster_coco_eval.init_as_pycocotools()
22
+ Image.MAX_IMAGE_PIXELS = None
23
+
24
+ __all__ = ["CocoDetection"]
25
+
26
+
27
+ @register()
28
+ class CocoDetection(torchvision.datasets.CocoDetection, DetDataset):
29
+ __inject__ = [
30
+ "transforms",
31
+ ]
32
+ __share__ = ["remap_mscoco_category"]
33
+
34
+ def __init__(
35
+ self, img_folder, ann_file, transforms, return_masks=False, remap_mscoco_category=False
36
+ ):
37
+ super(CocoDetection, self).__init__(img_folder, ann_file)
38
+ self._transforms = transforms
39
+ self.prepare = ConvertCocoPolysToMask(return_masks)
40
+ self.img_folder = img_folder
41
+ self.ann_file = ann_file
42
+ self.return_masks = return_masks
43
+ self.remap_mscoco_category = remap_mscoco_category
44
+
45
+ def __getitem__(self, idx):
46
+ img, target = self.load_item(idx)
47
+ if self._transforms is not None:
48
+ img, target, _ = self._transforms(img, target, self)
49
+ return img, target
50
+
51
+ def load_item(self, idx):
52
+ image, target = super(CocoDetection, self).__getitem__(idx)
53
+ image_id = self.ids[idx]
54
+ image_path = os.path.join(self.img_folder, self.coco.loadImgs(image_id)[0]["file_name"])
55
+ target = {"image_id": image_id, "image_path": image_path, "annotations": target}
56
+
57
+ if self.remap_mscoco_category:
58
+ image, target = self.prepare(image, target, category2label=mscoco_category2label)
59
+ else:
60
+ image, target = self.prepare(image, target)
61
+
62
+ target["idx"] = torch.tensor([idx])
63
+
64
+ if "boxes" in target:
65
+ target["boxes"] = convert_to_tv_tensor(
66
+ target["boxes"], key="boxes", spatial_size=image.size[::-1]
67
+ )
68
+
69
+ if "masks" in target:
70
+ target["masks"] = convert_to_tv_tensor(target["masks"], key="masks")
71
+
72
+ return image, target
73
+
74
+ def extra_repr(self) -> str:
75
+ s = f" img_folder: {self.img_folder}\n ann_file: {self.ann_file}\n"
76
+ s += f" return_masks: {self.return_masks}\n"
77
+ if hasattr(self, "_transforms") and self._transforms is not None:
78
+ s += f" transforms:\n {repr(self._transforms)}"
79
+ if hasattr(self, "_preset") and self._preset is not None:
80
+ s += f" preset:\n {repr(self._preset)}"
81
+ return s
82
+
83
+ @property
84
+ def categories(
85
+ self,
86
+ ):
87
+ return self.coco.dataset["categories"]
88
+
89
+ @property
90
+ def category2name(
91
+ self,
92
+ ):
93
+ return {cat["id"]: cat["name"] for cat in self.categories}
94
+
95
+ @property
96
+ def category2label(
97
+ self,
98
+ ):
99
+ return {cat["id"]: i for i, cat in enumerate(self.categories)}
100
+
101
+ @property
102
+ def label2category(
103
+ self,
104
+ ):
105
+ return {i: cat["id"] for i, cat in enumerate(self.categories)}
106
+
107
+
108
+ def convert_coco_poly_to_mask(segmentations, height, width):
109
+ masks = []
110
+ for polygons in segmentations:
111
+ rles = coco_mask.frPyObjects(polygons, height, width)
112
+ mask = coco_mask.decode(rles)
113
+ if len(mask.shape) < 3:
114
+ mask = mask[..., None]
115
+ mask = torch.as_tensor(mask, dtype=torch.uint8)
116
+ mask = mask.any(dim=2)
117
+ masks.append(mask)
118
+ if masks:
119
+ masks = torch.stack(masks, dim=0)
120
+ else:
121
+ masks = torch.zeros((0, height, width), dtype=torch.uint8)
122
+ return masks
123
+
124
+
125
+ class ConvertCocoPolysToMask(object):
126
+ def __init__(self, return_masks=False):
127
+ self.return_masks = return_masks
128
+
129
+ def __call__(self, image: Image.Image, target, **kwargs):
130
+ w, h = image.size
131
+
132
+ image_id = target["image_id"]
133
+ image_id = torch.tensor([image_id])
134
+
135
+ image_path = target["image_path"]
136
+
137
+ anno = target["annotations"]
138
+
139
+ anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0]
140
+
141
+ boxes = [obj["bbox"] for obj in anno]
142
+ # guard against no boxes via resizing
143
+ boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
144
+ boxes[:, 2:] += boxes[:, :2]
145
+ boxes[:, 0::2].clamp_(min=0, max=w)
146
+ boxes[:, 1::2].clamp_(min=0, max=h)
147
+
148
+ category2label = kwargs.get("category2label", None)
149
+ if category2label is not None:
150
+ labels = [category2label[obj["category_id"]] for obj in anno]
151
+ else:
152
+ labels = [obj["category_id"] for obj in anno]
153
+
154
+ labels = torch.tensor(labels, dtype=torch.int64)
155
+
156
+ if self.return_masks:
157
+ segmentations = [obj["segmentation"] for obj in anno]
158
+ masks = convert_coco_poly_to_mask(segmentations, h, w)
159
+
160
+ keypoints = None
161
+ if anno and "keypoints" in anno[0]:
162
+ keypoints = [obj["keypoints"] for obj in anno]
163
+ keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
164
+ num_keypoints = keypoints.shape[0]
165
+ if num_keypoints:
166
+ keypoints = keypoints.view(num_keypoints, -1, 3)
167
+
168
+ keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
169
+ boxes = boxes[keep]
170
+ labels = labels[keep]
171
+ if self.return_masks:
172
+ masks = masks[keep]
173
+ if keypoints is not None:
174
+ keypoints = keypoints[keep]
175
+
176
+ target = {}
177
+ target["boxes"] = boxes
178
+ target["labels"] = labels
179
+ if self.return_masks:
180
+ target["masks"] = masks
181
+ target["image_id"] = image_id
182
+ target["image_path"] = image_path
183
+ if keypoints is not None:
184
+ target["keypoints"] = keypoints
185
+
186
+ # for conversion to coco api
187
+ area = torch.tensor([obj["area"] for obj in anno])
188
+ iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
189
+ target["area"] = area[keep]
190
+ target["iscrowd"] = iscrowd[keep]
191
+
192
+ target["orig_size"] = torch.as_tensor([int(w), int(h)])
193
+ # target["size"] = torch.as_tensor([int(w), int(h)])
194
+
195
+ return image, target
196
+
197
+
198
+ mscoco_category2name = {
199
+ 1: "person",
200
+ 2: "bicycle",
201
+ 3: "car",
202
+ 4: "motorcycle",
203
+ 5: "airplane",
204
+ 6: "bus",
205
+ 7: "train",
206
+ 8: "truck",
207
+ 9: "boat",
208
+ 10: "traffic light",
209
+ 11: "fire hydrant",
210
+ 13: "stop sign",
211
+ 14: "parking meter",
212
+ 15: "bench",
213
+ 16: "bird",
214
+ 17: "cat",
215
+ 18: "dog",
216
+ 19: "horse",
217
+ 20: "sheep",
218
+ 21: "cow",
219
+ 22: "elephant",
220
+ 23: "bear",
221
+ 24: "zebra",
222
+ 25: "giraffe",
223
+ 27: "backpack",
224
+ 28: "umbrella",
225
+ 31: "handbag",
226
+ 32: "tie",
227
+ 33: "suitcase",
228
+ 34: "frisbee",
229
+ 35: "skis",
230
+ 36: "snowboard",
231
+ 37: "sports ball",
232
+ 38: "kite",
233
+ 39: "baseball bat",
234
+ 40: "baseball glove",
235
+ 41: "skateboard",
236
+ 42: "surfboard",
237
+ 43: "tennis racket",
238
+ 44: "bottle",
239
+ 46: "wine glass",
240
+ 47: "cup",
241
+ 48: "fork",
242
+ 49: "knife",
243
+ 50: "spoon",
244
+ 51: "bowl",
245
+ 52: "banana",
246
+ 53: "apple",
247
+ 54: "sandwich",
248
+ 55: "orange",
249
+ 56: "broccoli",
250
+ 57: "carrot",
251
+ 58: "hot dog",
252
+ 59: "pizza",
253
+ 60: "donut",
254
+ 61: "cake",
255
+ 62: "chair",
256
+ 63: "couch",
257
+ 64: "potted plant",
258
+ 65: "bed",
259
+ 67: "dining table",
260
+ 70: "toilet",
261
+ 72: "tv",
262
+ 73: "laptop",
263
+ 74: "mouse",
264
+ 75: "remote",
265
+ 76: "keyboard",
266
+ 77: "cell phone",
267
+ 78: "microwave",
268
+ 79: "oven",
269
+ 80: "toaster",
270
+ 81: "sink",
271
+ 82: "refrigerator",
272
+ 84: "book",
273
+ 85: "clock",
274
+ 86: "vase",
275
+ 87: "scissors",
276
+ 88: "teddy bear",
277
+ 89: "hair drier",
278
+ 90: "toothbrush",
279
+ }
280
+
281
+ mscoco_category2label = {k: i for i, k in enumerate(mscoco_category2name.keys())}
282
+ mscoco_label2category = {v: k for k, v in mscoco_category2label.items()}
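A usage sketch for CocoDetection; the paths are placeholders, and remap_mscoco_category=True maps the sparse COCO category ids above to contiguous training labels via mscoco_category2label:

from src.data import CocoDetection, mscoco_label2category

dataset = CocoDetection(
    img_folder="data/coco/val2017",                                   # placeholder paths
    ann_file="data/coco/annotations/instances_val2017.json",
    transforms=None,
    remap_mscoco_category=True,
)
image, target = dataset[0]
# target["boxes"] is a BoundingBoxes tv_tensor in xyxy format clamped to the image;
# target["labels"] holds contiguous ids that mscoco_label2category maps back to COCO category ids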
src/data/dataset/coco_eval.py ADDED
@@ -0,0 +1,214 @@
1
+ """
2
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
3
+ COCO evaluator that works in distributed mode.
4
+ Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
5
+ The difference is that there is less copy-pasting from pycocotools
6
+ in the end of the file, as python3 can suppress prints with contextlib
7
+ """
8
+
9
+ import contextlib
10
+ import copy
11
+ import os
12
+
13
+ import faster_coco_eval.core.mask as mask_util
14
+ import numpy as np
15
+ import torch
16
+ from faster_coco_eval import COCO, COCOeval_faster
17
+
18
+ from ...core import register
19
+ from ...misc import dist_utils
20
+
21
+ __all__ = [
22
+ "CocoEvaluator",
23
+ ]
24
+
25
+
26
+ @register()
27
+ class CocoEvaluator(object):
28
+ def __init__(self, coco_gt, iou_types):
29
+ assert isinstance(iou_types, (list, tuple))
30
+ coco_gt = copy.deepcopy(coco_gt)
31
+ self.coco_gt: COCO = coco_gt
32
+ self.iou_types = iou_types
33
+
34
+ self.coco_eval = {}
35
+ for iou_type in iou_types:
36
+ self.coco_eval[iou_type] = COCOeval_faster(
37
+ coco_gt, iouType=iou_type, print_function=print, separate_eval=True
38
+ )
39
+
40
+ self.img_ids = []
41
+ self.eval_imgs = {k: [] for k in iou_types}
42
+
43
+ def cleanup(self):
44
+ self.coco_eval = {}
45
+ for iou_type in self.iou_types:
46
+ self.coco_eval[iou_type] = COCOeval_faster(
47
+ self.coco_gt, iouType=iou_type, print_function=print, separate_eval=True
48
+ )
49
+ self.img_ids = []
50
+ self.eval_imgs = {k: [] for k in self.iou_types}
51
+
52
+ def update(self, predictions):
53
+ img_ids = list(np.unique(list(predictions.keys())))
54
+ self.img_ids.extend(img_ids)
55
+
56
+ for iou_type in self.iou_types:
57
+ results = self.prepare(predictions, iou_type)
58
+ coco_eval = self.coco_eval[iou_type]
59
+
60
+ # suppress pycocotools prints
61
+ with open(os.devnull, "w") as devnull:
62
+ with contextlib.redirect_stdout(devnull):
63
+ coco_dt = self.coco_gt.loadRes(results) if results else COCO()
64
+ coco_eval.cocoDt = coco_dt
65
+ coco_eval.params.imgIds = list(img_ids)
66
+ coco_eval.evaluate()
67
+
68
+ self.eval_imgs[iou_type].append(
69
+ np.array(coco_eval._evalImgs_cpp).reshape(
70
+ len(coco_eval.params.catIds),
71
+ len(coco_eval.params.areaRng),
72
+ len(coco_eval.params.imgIds),
73
+ )
74
+ )
75
+
76
+ def synchronize_between_processes(self):
77
+ for iou_type in self.iou_types:
78
+ img_ids, eval_imgs = merge(self.img_ids, self.eval_imgs[iou_type])
79
+
80
+ coco_eval = self.coco_eval[iou_type]
81
+ coco_eval.params.imgIds = img_ids
82
+ coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
83
+ coco_eval._evalImgs_cpp = eval_imgs
84
+
85
+ def accumulate(self):
86
+ for coco_eval in self.coco_eval.values():
87
+ coco_eval.accumulate()
88
+
89
+ def summarize(self):
90
+ for iou_type, coco_eval in self.coco_eval.items():
91
+ print("IoU metric: {}".format(iou_type))
92
+ coco_eval.summarize()
93
+
94
+ def prepare(self, predictions, iou_type):
95
+ if iou_type == "bbox":
96
+ return self.prepare_for_coco_detection(predictions)
97
+ elif iou_type == "segm":
98
+ return self.prepare_for_coco_segmentation(predictions)
99
+ elif iou_type == "keypoints":
100
+ return self.prepare_for_coco_keypoint(predictions)
101
+ else:
102
+ raise ValueError("Unknown iou type {}".format(iou_type))
103
+
104
+ def prepare_for_coco_detection(self, predictions):
105
+ coco_results = []
106
+ for original_id, prediction in predictions.items():
107
+ if len(prediction) == 0:
108
+ continue
109
+
110
+ boxes = prediction["boxes"]
111
+ boxes = convert_to_xywh(boxes).tolist()
112
+ scores = prediction["scores"].tolist()
113
+ labels = prediction["labels"].tolist()
114
+
115
+ coco_results.extend(
116
+ [
117
+ {
118
+ "image_id": original_id,
119
+ "category_id": labels[k],
120
+ "bbox": box,
121
+ "score": scores[k],
122
+ }
123
+ for k, box in enumerate(boxes)
124
+ ]
125
+ )
126
+ return coco_results
127
+
128
+ def prepare_for_coco_segmentation(self, predictions):
129
+ coco_results = []
130
+ for original_id, prediction in predictions.items():
131
+ if len(prediction) == 0:
132
+ continue
133
+
134
+ scores = prediction["scores"]
135
+ labels = prediction["labels"]
136
+ masks = prediction["masks"]
137
+
138
+ masks = masks > 0.5
139
+
140
+ scores = prediction["scores"].tolist()
141
+ labels = prediction["labels"].tolist()
142
+
143
+ rles = [
144
+ mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
145
+ for mask in masks
146
+ ]
147
+ for rle in rles:
148
+ rle["counts"] = rle["counts"].decode("utf-8")
149
+
150
+ coco_results.extend(
151
+ [
152
+ {
153
+ "image_id": original_id,
154
+ "category_id": labels[k],
155
+ "segmentation": rle,
156
+ "score": scores[k],
157
+ }
158
+ for k, rle in enumerate(rles)
159
+ ]
160
+ )
161
+ return coco_results
162
+
163
+ def prepare_for_coco_keypoint(self, predictions):
164
+ coco_results = []
165
+ for original_id, prediction in predictions.items():
166
+ if len(prediction) == 0:
167
+ continue
168
+
169
+ boxes = prediction["boxes"]
170
+ boxes = convert_to_xywh(boxes).tolist()
171
+ scores = prediction["scores"].tolist()
172
+ labels = prediction["labels"].tolist()
173
+ keypoints = prediction["keypoints"]
174
+ keypoints = keypoints.flatten(start_dim=1).tolist()
175
+
176
+ coco_results.extend(
177
+ [
178
+ {
179
+ "image_id": original_id,
180
+ "category_id": labels[k],
181
+ "keypoints": keypoint,
182
+ "score": scores[k],
183
+ }
184
+ for k, keypoint in enumerate(keypoints)
185
+ ]
186
+ )
187
+ return coco_results
188
+
189
+
190
+ def convert_to_xywh(boxes):
191
+ xmin, ymin, xmax, ymax = boxes.unbind(1)
192
+ return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
193
+
194
+
195
+ def merge(img_ids, eval_imgs):
196
+ all_img_ids = dist_utils.all_gather(img_ids)
197
+ all_eval_imgs = dist_utils.all_gather(eval_imgs)
198
+
199
+ merged_img_ids = []
200
+ for p in all_img_ids:
201
+ merged_img_ids.extend(p)
202
+
203
+ merged_eval_imgs = []
204
+ for p in all_eval_imgs:
205
+ merged_eval_imgs.extend(p)
206
+
207
+ merged_img_ids = np.array(merged_img_ids)
208
+ merged_eval_imgs = np.concatenate(merged_eval_imgs, axis=2).ravel()
209
+ # merged_eval_imgs = np.array(merged_eval_imgs).T.ravel()
210
+
211
+ # keep only unique (and in sorted order) images
212
+ merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
213
+
214
+ return merged_img_ids.tolist(), merged_eval_imgs.tolist()
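A minimal single-process sketch of the evaluation loop implied by the class above; val_dataset, val_loader, and the model/postprocess pipeline are placeholders, and predictions map image ids to dicts with xyxy boxes, scores, and labels as expected by prepare_for_coco_detection:

from src.data import CocoEvaluator, get_coco_api_from_dataset

base_ds = get_coco_api_from_dataset(val_dataset)             # placeholder dataset
evaluator = CocoEvaluator(coco_gt=base_ds, iou_types=["bbox"])

for images, targets in val_loader:                           # placeholder dataloader
    results = postprocess(model(images), targets)            # placeholder model + postprocessor
    predictions = {t["image_id"].item(): r for t, r in zip(targets, results)}
    evaluator.update(predictions)

evaluator.synchronize_between_processes()
evaluator.accumulate()
evaluator.summarize()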
src/data/dataset/coco_utils.py ADDED
@@ -0,0 +1,191 @@
1
+ """
2
+ copy and modified https://github.com/pytorch/vision/blob/main/references/detection/coco_utils.py
3
+
4
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
5
+ """
6
+
7
+ import faster_coco_eval.core.mask as coco_mask
8
+ import torch
9
+ import torch.utils.data
10
+ import torchvision
11
+ import torchvision.transforms.functional as TVF
12
+ from faster_coco_eval import COCO
13
+
14
+
15
+ def convert_coco_poly_to_mask(segmentations, height, width):
16
+ masks = []
17
+ for polygons in segmentations:
18
+ rles = coco_mask.frPyObjects(polygons, height, width)
19
+ mask = coco_mask.decode(rles)
20
+ if len(mask.shape) < 3:
21
+ mask = mask[..., None]
22
+ mask = torch.as_tensor(mask, dtype=torch.uint8)
23
+ mask = mask.any(dim=2)
24
+ masks.append(mask)
25
+ if masks:
26
+ masks = torch.stack(masks, dim=0)
27
+ else:
28
+ masks = torch.zeros((0, height, width), dtype=torch.uint8)
29
+ return masks
30
+
31
+
32
+ class ConvertCocoPolysToMask:
33
+ def __call__(self, image, target):
34
+ w, h = image.size
35
+
36
+ image_id = target["image_id"]
37
+
38
+ anno = target["annotations"]
39
+
40
+ anno = [obj for obj in anno if obj["iscrowd"] == 0]
41
+
42
+ boxes = [obj["bbox"] for obj in anno]
43
+ # guard against no boxes via resizing
44
+ boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
45
+ boxes[:, 2:] += boxes[:, :2]
46
+ boxes[:, 0::2].clamp_(min=0, max=w)
47
+ boxes[:, 1::2].clamp_(min=0, max=h)
48
+
49
+ classes = [obj["category_id"] for obj in anno]
50
+ classes = torch.tensor(classes, dtype=torch.int64)
51
+
52
+ segmentations = [obj["segmentation"] for obj in anno]
53
+ masks = convert_coco_poly_to_mask(segmentations, h, w)
54
+
55
+ keypoints = None
56
+ if anno and "keypoints" in anno[0]:
57
+ keypoints = [obj["keypoints"] for obj in anno]
58
+ keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
59
+ num_keypoints = keypoints.shape[0]
60
+ if num_keypoints:
61
+ keypoints = keypoints.view(num_keypoints, -1, 3)
62
+
63
+ keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
64
+ boxes = boxes[keep]
65
+ classes = classes[keep]
66
+ masks = masks[keep]
67
+ if keypoints is not None:
68
+ keypoints = keypoints[keep]
69
+
70
+ target = {}
71
+ target["boxes"] = boxes
72
+ target["labels"] = classes
73
+ target["masks"] = masks
74
+ target["image_id"] = image_id
75
+ if keypoints is not None:
76
+ target["keypoints"] = keypoints
77
+
78
+ # for conversion to coco api
79
+ area = torch.tensor([obj["area"] for obj in anno])
80
+ iscrowd = torch.tensor([obj["iscrowd"] for obj in anno])
81
+ target["area"] = area
82
+ target["iscrowd"] = iscrowd
83
+
84
+ return image, target
85
+
86
+
87
+ def _coco_remove_images_without_annotations(dataset, cat_list=None):
88
+ def _has_only_empty_bbox(anno):
89
+ return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno)
90
+
91
+ def _count_visible_keypoints(anno):
92
+ return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno)
93
+
94
+ min_keypoints_per_image = 10
95
+
96
+ def _has_valid_annotation(anno):
97
+ # if it's empty, there is no annotation
98
+ if len(anno) == 0:
99
+ return False
100
+ # if all boxes have close to zero area, there is no annotation
101
+ if _has_only_empty_bbox(anno):
102
+ return False
103
+ # keypoint detection tasks have slightly different criteria for deciding
104
+ # whether an annotation is valid
105
+ if "keypoints" not in anno[0]:
106
+ return True
107
+ # for keypoint detection tasks, only consider valid images those
108
+ # containing at least min_keypoints_per_image
109
+ if _count_visible_keypoints(anno) >= min_keypoints_per_image:
110
+ return True
111
+ return False
112
+
113
+ ids = []
114
+ for ds_idx, img_id in enumerate(dataset.ids):
115
+ ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None)
116
+ anno = dataset.coco.loadAnns(ann_ids)
117
+ if cat_list:
118
+ anno = [obj for obj in anno if obj["category_id"] in cat_list]
119
+ if _has_valid_annotation(anno):
120
+ ids.append(ds_idx)
121
+
122
+ dataset = torch.utils.data.Subset(dataset, ids)
123
+ return dataset
124
+
125
+
126
+ def convert_to_coco_api(ds):
127
+ coco_ds = COCO()
128
+ # annotation IDs need to start at 1, not 0, see torchvision issue #1530
129
+ ann_id = 1
130
+ dataset = {"images": [], "categories": [], "annotations": []}
131
+ categories = set()
132
+ for img_idx in range(len(ds)):
133
+ # find better way to get target
134
+ # targets = ds.get_annotations(img_idx)
135
+ # img, targets = ds[img_idx]
136
+
137
+ img, targets = ds.load_item(img_idx)
138
+ width, height = img.size
139
+
140
+ image_id = targets["image_id"].item()
141
+ img_dict = {}
142
+ img_dict["id"] = image_id
143
+ img_dict["width"] = width
144
+ img_dict["height"] = height
145
+ dataset["images"].append(img_dict)
146
+ bboxes = targets["boxes"].clone()
147
+ bboxes[:, 2:] -= bboxes[:, :2] # xyxy -> xywh
148
+ bboxes = bboxes.tolist()
149
+ labels = targets["labels"].tolist()
150
+ areas = targets["area"].tolist()
151
+ iscrowd = targets["iscrowd"].tolist()
152
+ if "masks" in targets:
153
+ masks = targets["masks"]
154
+ # make masks Fortran contiguous for coco_mask
155
+ masks = masks.permute(0, 2, 1).contiguous().permute(0, 2, 1)
156
+ if "keypoints" in targets:
157
+ keypoints = targets["keypoints"]
158
+ keypoints = keypoints.reshape(keypoints.shape[0], -1).tolist()
159
+ num_objs = len(bboxes)
160
+ for i in range(num_objs):
161
+ ann = {}
162
+ ann["image_id"] = image_id
163
+ ann["bbox"] = bboxes[i]
164
+ ann["category_id"] = labels[i]
165
+ categories.add(labels[i])
166
+ ann["area"] = areas[i]
167
+ ann["iscrowd"] = iscrowd[i]
168
+ ann["id"] = ann_id
169
+ if "masks" in targets:
170
+ ann["segmentation"] = coco_mask.encode(masks[i].numpy())
171
+ if "keypoints" in targets:
172
+ ann["keypoints"] = keypoints[i]
173
+ ann["num_keypoints"] = sum(k != 0 for k in keypoints[i][2::3])
174
+ dataset["annotations"].append(ann)
175
+ ann_id += 1
176
+ dataset["categories"] = [{"id": i} for i in sorted(categories)]
177
+ coco_ds.dataset = dataset
178
+ coco_ds.createIndex()
179
+ return coco_ds
180
+
181
+
182
+ def get_coco_api_from_dataset(dataset):
183
+ # FIXME: This is... awful?
184
+ for _ in range(10):
185
+ if isinstance(dataset, torchvision.datasets.CocoDetection):
186
+ break
187
+ if isinstance(dataset, torch.utils.data.Subset):
188
+ dataset = dataset.dataset
189
+ if isinstance(dataset, torchvision.datasets.CocoDetection):
190
+ return dataset.coco
191
+ return convert_to_coco_api(dataset)
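For orientation, a minimal usage sketch of the helpers above (not part of the commit); `train_dataset` is a placeholder for a CocoDetection-style dataset built elsewhere in this repo, and only the standard COCO query API of the returned index is exercised.

    from src.data.dataset.coco_utils import get_coco_api_from_dataset

    # `train_dataset` is a placeholder for a dataset built elsewhere in the repo
    coco_gt = get_coco_api_from_dataset(train_dataset)

    # the returned object mirrors pycocotools' COCO index
    img_ids = coco_gt.getImgIds()
    ann_ids = coco_gt.getAnnIds(imgIds=img_ids[:1])
    print(len(img_ids), "images;", len(coco_gt.loadAnns(ann_ids)), "annotations in the first image")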
src/data/dataset/voc_detection.py ADDED
@@ -0,0 +1,86 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ import os
7
+ from typing import Callable, Optional
8
+
9
+ import torch
10
+ import torchvision
11
+ import torchvision.transforms.functional as TVF
12
+ from PIL import Image
14
+
15
+ try:
16
+ from defusedxml.ElementTree import parse as ET_parse
17
+ except ImportError:
18
+ from xml.etree.ElementTree import parse as ET_parse
19
+
20
+ from ...core import register
21
+ from .._misc import convert_to_tv_tensor
22
+ from ._dataset import DetDataset
23
+
24
+
25
+ @register()
26
+ class VOCDetection(torchvision.datasets.VOCDetection, DetDataset):
27
+ __inject__ = [
28
+ "transforms",
29
+ ]
30
+
31
+ def __init__(
32
+ self,
33
+ root: str,
34
+ ann_file: str = "trainval.txt",
35
+ label_file: str = "label_list.txt",
36
+ transforms: Optional[Callable] = None,
37
+ ):
38
+ with open(os.path.join(root, ann_file), "r") as f:
39
+ lines = [x.strip() for x in f.readlines()]
40
+ lines = [x.split(" ") for x in lines]
41
+
42
+ self.images = [os.path.join(root, lin[0]) for lin in lines]
43
+ self.targets = [os.path.join(root, lin[1]) for lin in lines]
44
+ assert len(self.images) == len(self.targets)
45
+
46
+ with open(os.path.join(root, label_file), "r") as f:
47
+ labels = f.readlines()
48
+ labels = [lab.strip() for lab in labels]
49
+
50
+ self.transforms = transforms
51
+ self.labels_map = {lab: i for i, lab in enumerate(labels)}
52
+
53
+ def __getitem__(self, index: int):
54
+ image, target = self.load_item(index)
55
+ if self.transforms is not None:
56
+ image, target, _ = self.transforms(image, target, self)
57
+ # target["orig_size"] = torch.tensor(TVF.get_image_size(image))
58
+ return image, target
59
+
60
+ def load_item(self, index: int):
61
+ image = Image.open(self.images[index]).convert("RGB")
62
+ target = self.parse_voc_xml(ET_parse(self.targets[index]).getroot())
63
+
64
+ output = {}
65
+ output["image_id"] = torch.tensor([index])
66
+ for k in ["area", "boxes", "labels", "iscrowd"]:
67
+ output[k] = []
68
+
69
+ for blob in target["annotation"]["object"]:
70
+ box = [float(v) for v in blob["bndbox"].values()]
71
+ output["boxes"].append(box)
72
+ output["labels"].append(blob["name"])
73
+ output["area"].append((box[2] - box[0]) * (box[3] - box[1]))
74
+ output["iscrowd"].append(0)
75
+
76
+ w, h = image.size
77
+ boxes = torch.tensor(output["boxes"]) if len(output["boxes"]) > 0 else torch.zeros(0, 4)
78
+ output["boxes"] = convert_to_tv_tensor(
79
+ boxes, "boxes", box_format="xyxy", spatial_size=[h, w]
80
+ )
81
+ output["labels"] = torch.tensor([self.labels_map[lab] for lab in output["labels"]])
82
+ output["area"] = torch.tensor(output["area"])
83
+ output["iscrowd"] = torch.tensor(output["iscrowd"])
84
+ output["orig_size"] = torch.tensor([w, h])
85
+
86
+ return image, output
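The on-disk layout this loader expects can be inferred from `__init__` above; the file contents, paths, and class names below are hypothetical examples, not part of the commit.

    # trainval.txt   -> "<image path> <annotation xml path>" per line, relative to root
    #   JPEGImages/000005.jpg Annotations/000005.xml
    # label_list.txt -> one class name per line (hypothetical labels)
    #   aeroplane
    #   bicycle

    from src.data.dataset.voc_detection import VOCDetection

    dataset = VOCDetection(root="VOCdevkit/VOC2007/",
                           ann_file="trainval.txt",
                           label_file="label_list.txt")
    image, target = dataset[0]   # target: boxes, labels, area, iscrowd, orig_size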
src/data/dataset/voc_eval.py ADDED
@@ -0,0 +1,12 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ import torch
7
+ import torchvision
8
+
9
+
10
+ class VOCEvaluator(object):
11
+ def __init__(self) -> None:
12
+ pass
src/data/transforms/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ from ._transforms import (
7
+ ConvertBoxes,
8
+ ConvertPILImage,
9
+ EmptyTransform,
10
+ Normalize,
11
+ PadToSize,
12
+ RandomCrop,
13
+ RandomHorizontalFlip,
14
+ RandomIoUCrop,
15
+ RandomPhotometricDistort,
16
+ RandomZoomOut,
17
+ Resize,
18
+ SanitizeBoundingBoxes,
19
+ )
20
+ from .container import Compose
21
+ from .mosaic import Mosaic
src/data/transforms/_transforms.py ADDED
@@ -0,0 +1,161 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ import PIL
9
+ import PIL.Image
10
+ import torch
11
+ import torch.nn as nn
12
+ import torchvision
13
+ import torchvision.transforms.v2 as T
14
+ import torchvision.transforms.v2.functional as F
15
+
16
+ from ...core import register
17
+ from .._misc import (
18
+ BoundingBoxes,
19
+ Image,
20
+ Mask,
21
+ SanitizeBoundingBoxes,
22
+ Video,
23
+ _boxes_keys,
24
+ convert_to_tv_tensor,
25
+ )
26
+
27
+ torchvision.disable_beta_transforms_warning()
28
+
29
+
30
+ RandomPhotometricDistort = register()(T.RandomPhotometricDistort)
31
+ RandomZoomOut = register()(T.RandomZoomOut)
32
+ RandomHorizontalFlip = register()(T.RandomHorizontalFlip)
33
+ Resize = register()(T.Resize)
34
+ # ToImageTensor = register()(T.ToImageTensor)
35
+ # ConvertDtype = register()(T.ConvertDtype)
36
+ # PILToTensor = register()(T.PILToTensor)
37
+ SanitizeBoundingBoxes = register(name="SanitizeBoundingBoxes")(SanitizeBoundingBoxes)
38
+ RandomCrop = register()(T.RandomCrop)
39
+ Normalize = register()(T.Normalize)
40
+
41
+
42
+ @register()
43
+ class EmptyTransform(T.Transform):
44
+ def __init__(
45
+ self,
46
+ ) -> None:
47
+ super().__init__()
48
+
49
+ def forward(self, *inputs):
50
+ inputs = inputs if len(inputs) > 1 else inputs[0]
51
+ return inputs
52
+
53
+
54
+ @register()
55
+ class PadToSize(T.Pad):
56
+ _transformed_types = (
57
+ PIL.Image.Image,
58
+ Image,
59
+ Video,
60
+ Mask,
61
+ BoundingBoxes,
62
+ )
63
+
64
+ def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
65
+ sp = F.get_spatial_size(flat_inputs[0])
66
+ h, w = self.size[1] - sp[0], self.size[0] - sp[1]
67
+ self.padding = [0, 0, w, h]
68
+ return dict(padding=self.padding)
69
+
70
+ def __init__(self, size, fill=0, padding_mode="constant") -> None:
71
+ if isinstance(size, int):
72
+ size = (size, size)
73
+ self.size = size
74
+ super().__init__(0, fill, padding_mode)
75
+
76
+ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
77
+ fill = self._fill[type(inpt)]
78
+ padding = params["padding"]
79
+ return F.pad(inpt, padding=padding, fill=fill, padding_mode=self.padding_mode) # type: ignore[arg-type]
80
+
81
+ def __call__(self, *inputs: Any) -> Any:
82
+ outputs = super().forward(*inputs)
83
+ if len(outputs) > 1 and isinstance(outputs[1], dict):
84
+ outputs[1]["padding"] = torch.tensor(self.padding)
85
+ return outputs
86
+
87
+
88
+ @register()
89
+ class RandomIoUCrop(T.RandomIoUCrop):
90
+ def __init__(
91
+ self,
92
+ min_scale: float = 0.3,
93
+ max_scale: float = 1,
94
+ min_aspect_ratio: float = 0.5,
95
+ max_aspect_ratio: float = 2,
96
+ sampler_options: Optional[List[float]] = None,
97
+ trials: int = 40,
98
+ p: float = 1.0,
99
+ ):
100
+ super().__init__(
101
+ min_scale, max_scale, min_aspect_ratio, max_aspect_ratio, sampler_options, trials
102
+ )
103
+ self.p = p
104
+
105
+ def __call__(self, *inputs: Any) -> Any:
106
+ if torch.rand(1) >= self.p:
107
+ return inputs if len(inputs) > 1 else inputs[0]
108
+
109
+ return super().forward(*inputs)
110
+
111
+
112
+ @register()
113
+ class ConvertBoxes(T.Transform):
114
+ _transformed_types = (BoundingBoxes,)
115
+
116
+ def __init__(self, fmt="", normalize=False) -> None:
117
+ super().__init__()
118
+ self.fmt = fmt
119
+ self.normalize = normalize
120
+
121
+ def transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
122
+ return self._transform(inpt, params)
123
+
124
+ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
125
+ spatial_size = getattr(inpt, _boxes_keys[1])
126
+ if self.fmt:
127
+ in_fmt = inpt.format.value.lower()
128
+ inpt = torchvision.ops.box_convert(inpt, in_fmt=in_fmt, out_fmt=self.fmt.lower())
129
+ inpt = convert_to_tv_tensor(
130
+ inpt, key="boxes", box_format=self.fmt.upper(), spatial_size=spatial_size
131
+ )
132
+
133
+ if self.normalize:
134
+ inpt = inpt / torch.tensor(spatial_size[::-1]).tile(2)[None]
135
+
136
+ return inpt
137
+
138
+
139
+ @register()
140
+ class ConvertPILImage(T.Transform):
141
+ _transformed_types = (PIL.Image.Image,)
142
+
143
+ def __init__(self, dtype="float32", scale=True) -> None:
144
+ super().__init__()
145
+ self.dtype = dtype
146
+ self.scale = scale
147
+
148
+ def transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
149
+ return self._transform(inpt, params)
150
+
151
+ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
152
+ inpt = F.pil_to_tensor(inpt)
153
+ if self.dtype == "float32":
154
+ inpt = inpt.float()
155
+
156
+ if self.scale:
157
+ inpt = inpt / 255.0
158
+
159
+ inpt = Image(inpt)
160
+
161
+ return inpt
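A hedged sketch of the two custom transforms above used standalone, assuming a recent torchvision with the tv_tensors API and the repository root on PYTHONPATH; the argument values are illustrative only.

    import torch
    from PIL import Image as PILImage
    from torchvision.tv_tensors import BoundingBoxes

    from src.data.transforms import ConvertBoxes, ConvertPILImage

    # one xyxy box on a 480x640 (h x w) canvas
    boxes = BoundingBoxes(torch.tensor([[10.0, 20.0, 110.0, 220.0]]),
                          format="XYXY", canvas_size=(480, 640))
    boxes_norm = ConvertBoxes(fmt="cxcywh", normalize=True)(boxes)  # cxcywh / (w, h, w, h)

    # PIL -> float tensor image in [0, 1]
    img = ConvertPILImage(dtype="float32", scale=True)(PILImage.new("RGB", (640, 480)))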
src/data/transforms/container.py ADDED
@@ -0,0 +1,99 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torchvision
11
+ import torchvision.transforms.v2 as T
12
+
13
+ from ...core import GLOBAL_CONFIG, register
14
+ from ._transforms import EmptyTransform
15
+
16
+ torchvision.disable_beta_transforms_warning()
17
+
18
+
19
+ @register()
20
+ class Compose(T.Compose):
21
+ def __init__(self, ops, policy=None) -> None:
22
+ transforms = []
23
+ if ops is not None:
24
+ for op in ops:
25
+ if isinstance(op, dict):
26
+ name = op.pop("type")
27
+ transform = getattr(
28
+ GLOBAL_CONFIG[name]["_pymodule"], GLOBAL_CONFIG[name]["_name"]
29
+ )(**op)
30
+ transforms.append(transform)
31
+ op["type"] = name
32
+
33
+ elif isinstance(op, nn.Module):
34
+ transforms.append(op)
35
+
36
+ else:
37
+ raise ValueError("")
38
+ else:
39
+ transforms = [
40
+ EmptyTransform(),
41
+ ]
42
+
43
+ super().__init__(transforms=transforms)
44
+
45
+ if policy is None:
46
+ policy = {"name": "default"}
47
+
48
+ self.policy = policy
49
+ self.global_samples = 0
50
+
51
+ def forward(self, *inputs: Any) -> Any:
52
+ return self.get_forward(self.policy["name"])(*inputs)
53
+
54
+ def get_forward(self, name):
55
+ forwards = {
56
+ "default": self.default_forward,
57
+ "stop_epoch": self.stop_epoch_forward,
58
+ "stop_sample": self.stop_sample_forward,
59
+ }
60
+ return forwards[name]
61
+
62
+ def default_forward(self, *inputs: Any) -> Any:
63
+ sample = inputs if len(inputs) > 1 else inputs[0]
64
+ for transform in self.transforms:
65
+ sample = transform(sample)
66
+ return sample
67
+
68
+ def stop_epoch_forward(self, *inputs: Any):
69
+ sample = inputs if len(inputs) > 1 else inputs[0]
70
+ dataset = sample[-1]
71
+ cur_epoch = dataset.epoch
72
+ policy_ops = self.policy["ops"]
73
+ policy_epoch = self.policy["epoch"]
74
+
75
+ for transform in self.transforms:
76
+ if type(transform).__name__ in policy_ops and cur_epoch >= policy_epoch:
77
+ pass
78
+ else:
79
+ sample = transform(sample)
80
+
81
+ return sample
82
+
83
+ def stop_sample_forward(self, *inputs: Any):
84
+ sample = inputs if len(inputs) > 1 else inputs[0]
85
+ dataset = sample[-1]
86
+
87
+ cur_epoch = dataset.epoch
88
+ policy_ops = self.policy["ops"]
89
+ policy_sample = self.policy["sample"]
90
+
91
+ for transform in self.transforms:
92
+ if type(transform).__name__ in policy_ops and self.global_samples >= policy_sample:
93
+ pass
94
+ else:
95
+ sample = transform(sample)
96
+
97
+ self.global_samples += 1
98
+
99
+ return sample
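An illustrative configuration for `Compose` and its stop_epoch policy, written as Python rather than YAML; the op arguments and epoch value are assumptions, and the `ops` entries are resolved by their "type" key through GLOBAL_CONFIG exactly as in `__init__` above.

    from src.data.transforms import Compose

    transforms = Compose(
        ops=[
            {"type": "RandomPhotometricDistort", "p": 0.5},
            {"type": "RandomZoomOut", "fill": 0},
            {"type": "RandomHorizontalFlip"},
            {"type": "ConvertPILImage", "dtype": "float32", "scale": True},
        ],
        # the listed ops are skipped once dataset.epoch reaches `epoch`
        policy={"name": "stop_epoch", "epoch": 71,
                "ops": ["RandomPhotometricDistort", "RandomZoomOut"]},
    )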
src/data/transforms/functional.py ADDED
@@ -0,0 +1,172 @@
1
+ from typing import List, Optional
2
+
3
+ import torch
4
+
5
+ # needed due to empty tensor bug in pytorch and torchvision 0.5
6
+ import torchvision
7
+ import torchvision.transforms.functional as F
8
+ from packaging import version
9
+ from torch import Tensor
10
+
11
+ if version.parse(torchvision.__version__) < version.parse("0.7"):
12
+ from torchvision.ops import _new_empty_tensor
13
+ from torchvision.ops.misc import _output_size
14
+
15
+
16
+ def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
17
+ # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor
18
+ """
19
+ Equivalent to nn.functional.interpolate, but with support for empty batch sizes.
20
+ This will eventually be supported natively by PyTorch, and this
21
+ class can go away.
22
+ """
23
+ if version.parse(torchvision.__version__) < version.parse("0.7"):
24
+ if input.numel() > 0:
25
+ return torch.nn.functional.interpolate(input, size, scale_factor, mode, align_corners)
26
+
27
+ output_shape = _output_size(2, input, size, scale_factor)
28
+ output_shape = list(input.shape[:-2]) + list(output_shape)
29
+ return _new_empty_tensor(input, output_shape)
30
+ else:
31
+ return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners)
32
+
33
+
34
+ def crop(image, target, region):
35
+ cropped_image = F.crop(image, *region)
36
+
37
+ target = target.copy()
38
+ i, j, h, w = region
39
+
40
+ # should we do something wrt the original size?
41
+ target["size"] = torch.tensor([h, w])
42
+
43
+ fields = ["labels", "area", "iscrowd"]
44
+
45
+ if "boxes" in target:
46
+ boxes = target["boxes"]
47
+ max_size = torch.as_tensor([w, h], dtype=torch.float32)
48
+ cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
49
+ cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
50
+ cropped_boxes = cropped_boxes.clamp(min=0)
51
+ area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
52
+ target["boxes"] = cropped_boxes.reshape(-1, 4)
53
+ target["area"] = area
54
+ fields.append("boxes")
55
+
56
+ if "masks" in target:
57
+ # FIXME should we update the area here if there are no boxes?
58
+ target["masks"] = target["masks"][:, i : i + h, j : j + w]
59
+ fields.append("masks")
60
+
61
+ # remove elements for which the boxes or masks that have zero area
62
+ if "boxes" in target or "masks" in target:
63
+ # favor boxes selection when defining which elements to keep
64
+ # this is compatible with previous implementation
65
+ if "boxes" in target:
66
+ cropped_boxes = target["boxes"].reshape(-1, 2, 2)
67
+ keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
68
+ else:
69
+ keep = target["masks"].flatten(1).any(1)
70
+
71
+ for field in fields:
72
+ target[field] = target[field][keep]
73
+
74
+ return cropped_image, target
75
+
76
+
77
+ def hflip(image, target):
78
+ flipped_image = F.hflip(image)
79
+
80
+ w, h = image.size
81
+
82
+ target = target.copy()
83
+ if "boxes" in target:
84
+ boxes = target["boxes"]
85
+ boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor(
86
+ [w, 0, w, 0]
87
+ )
88
+ target["boxes"] = boxes
89
+
90
+ if "masks" in target:
91
+ target["masks"] = target["masks"].flip(-1)
92
+
93
+ return flipped_image, target
94
+
95
+
96
+ def resize(image, target, size, max_size=None):
97
+ # size can be min_size (scalar) or (w, h) tuple
98
+
99
+ def get_size_with_aspect_ratio(image_size, size, max_size=None):
100
+ w, h = image_size
101
+ if max_size is not None:
102
+ min_original_size = float(min((w, h)))
103
+ max_original_size = float(max((w, h)))
104
+ if max_original_size / min_original_size * size > max_size:
105
+ size = int(round(max_size * min_original_size / max_original_size))
106
+
107
+ if (w <= h and w == size) or (h <= w and h == size):
108
+ return (h, w)
109
+
110
+ if w < h:
111
+ ow = size
112
+ oh = int(size * h / w)
113
+ else:
114
+ oh = size
115
+ ow = int(size * w / h)
116
+
117
+ # r = min(size / min(h, w), max_size / max(h, w))
118
+ # ow = int(w * r)
119
+ # oh = int(h * r)
120
+
121
+ return (oh, ow)
122
+
123
+ def get_size(image_size, size, max_size=None):
124
+ if isinstance(size, (list, tuple)):
125
+ return size[::-1]
126
+ else:
127
+ return get_size_with_aspect_ratio(image_size, size, max_size)
128
+
129
+ size = get_size(image.size, size, max_size)
130
+ rescaled_image = F.resize(image, size)
131
+
132
+ if target is None:
133
+ return rescaled_image, None
134
+
135
+ ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))
136
+ ratio_width, ratio_height = ratios
137
+
138
+ target = target.copy()
139
+ if "boxes" in target:
140
+ boxes = target["boxes"]
141
+ scaled_boxes = boxes * torch.as_tensor(
142
+ [ratio_width, ratio_height, ratio_width, ratio_height]
143
+ )
144
+ target["boxes"] = scaled_boxes
145
+
146
+ if "area" in target:
147
+ area = target["area"]
148
+ scaled_area = area * (ratio_width * ratio_height)
149
+ target["area"] = scaled_area
150
+
151
+ h, w = size
152
+ target["size"] = torch.tensor([h, w])
153
+
154
+ if "masks" in target:
155
+ target["masks"] = (
156
+ interpolate(target["masks"][:, None].float(), size, mode="nearest")[:, 0] > 0.5
157
+ )
158
+
159
+ return rescaled_image, target
160
+
161
+
162
+ def pad(image, target, padding):
163
+ # assumes that we only pad on the bottom right corners
164
+ padded_image = F.pad(image, (0, 0, padding[0], padding[1]))
165
+ if target is None:
166
+ return padded_image, None
167
+ target = target.copy()
168
+ # should we do something wrt the original size?
169
+ target["size"] = torch.tensor(padded_image.size[::-1])
170
+ if "masks" in target:
171
+ target["masks"] = torch.nn.functional.pad(target["masks"], (0, padding[0], 0, padding[1]))
172
+ return padded_image, target
src/data/transforms/mosaic.py ADDED
@@ -0,0 +1,83 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ import random
7
+
8
+ import torch
9
+ import torchvision
10
+ import torchvision.transforms.v2 as T
11
+ import torchvision.transforms.v2.functional as F
12
+ from PIL import Image
13
+
14
+ from ...core import register
15
+ from .._misc import convert_to_tv_tensor
16
+
17
+ torchvision.disable_beta_transforms_warning()
18
+
19
+
20
+ @register()
21
+ class Mosaic(T.Transform):
22
+ def __init__(
23
+ self,
24
+ size,
25
+ max_size=None,
26
+ ) -> None:
27
+ super().__init__()
28
+ self.resize = T.Resize(size=size, max_size=max_size)
29
+ self.crop = T.RandomCrop(size=max_size if max_size else size)
30
+
31
+ # TODO add arg `output_size` for affine`
32
+ # self.random_perspective = T.RandomPerspective(distortion_scale=0.5, p=1., )
33
+ self.random_affine = T.RandomAffine(
34
+ degrees=0, translate=(0.1, 0.1), scale=(0.5, 1.5), fill=114
35
+ )
36
+
37
+ def forward(self, *inputs):
38
+ inputs = inputs if len(inputs) > 1 else inputs[0]
39
+ image, target, dataset = inputs
40
+
41
+ images = []
42
+ targets = []
43
+ indices = random.choices(range(len(dataset)), k=3)
44
+ for i in indices:
45
+ image, target = dataset.load_item(i)
46
+ image, target = self.resize(image, target)
47
+ images.append(image)
48
+ targets.append(target)
49
+
50
+ h, w = F.get_spatial_size(images[0])
51
+ offset = [[0, 0], [w, 0], [0, h], [w, h]]
52
+ image = Image.new(mode=images[0].mode, size=(w * 2, h * 2), color=0)
53
+ for i, im in enumerate(images):
54
+ image.paste(im, offset[i])
55
+
56
+ offset = torch.tensor([[0, 0], [w, 0], [0, h], [w, h]]).repeat(1, 2)
57
+ target = {}
58
+ for k in targets[0]:
59
+ if k == "boxes":
60
+ v = [t[k] + offset[i] for i, t in enumerate(targets)]
61
+ else:
62
+ v = [t[k] for t in targets]
63
+
64
+ if isinstance(v[0], torch.Tensor):
65
+ v = torch.cat(v, dim=0)
66
+
67
+ target[k] = v
68
+
69
+ if "boxes" in target:
70
+ # target['boxes'] = target['boxes'].clamp(0, 640 * 2 - 1)
71
+ w, h = image.size
72
+ target["boxes"] = convert_to_tv_tensor(
73
+ target["boxes"], "boxes", box_format="xyxy", spatial_size=[h, w]
74
+ )
75
+
76
+ if "masks" in target:
77
+ target["masks"] = convert_to_tv_tensor(target["masks"], "masks")
78
+
79
+ image, target = self.random_affine(image, target)
80
+ # image, target = self.resize(image, target)
81
+ image, target = self.crop(image, target)
82
+
83
+ return image, target, dataset
src/data/transforms/presets.py ADDED
@@ -0,0 +1,4 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
src/misc/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ from .dist_utils import setup_print, setup_seed
7
+ from .logger import *
8
+ from .profiler_utils import stats
9
+ from .visualizer import *
src/misc/box_ops.py ADDED
@@ -0,0 +1,106 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ from typing import List, Tuple
7
+
8
+ import torch
9
+ import torchvision
10
+ from torch import Tensor
11
+
12
+
13
+ def generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
14
+ assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
15
+ assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
16
+ return torchvision.ops.generalized_box_iou(boxes1, boxes2)
17
+
18
+
19
+ # elementwise
20
+ def elementwise_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tuple[Tensor, Tensor]:
21
+ """
22
+ Args:
23
+ boxes1, [N, 4]
24
+ boxes2, [N, 4]
25
+ Returns:
26
+ iou, [N, ]
27
+ union, [N, ]
28
+ """
29
+ area1 = torchvision.ops.box_area(boxes1) # [N, ]
30
+ area2 = torchvision.ops.box_area(boxes2) # [N, ]
31
+ lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N, 2]
32
+ rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N, 2]
33
+ wh = (rb - lt).clamp(min=0) # [N, 2]
34
+ inter = wh[:, 0] * wh[:, 1] # [N, ]
35
+ union = area1 + area2 - inter
36
+ iou = inter / union
37
+ return iou, union
38
+
39
+
40
+ def elementwise_generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
41
+ """
42
+ Args:
43
+ boxes1, [N, 4] with [x1, y1, x2, y2]
44
+ boxes2, [N, 4] with [x1, y1, x2, y2]
45
+ Returns:
46
+ giou, [N, ]
47
+ """
48
+ assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
49
+ assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
50
+ iou, union = elementwise_box_iou(boxes1, boxes2)
51
+ lt = torch.min(boxes1[:, :2], boxes2[:, :2]) # [N, 2]
52
+ rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) # [N, 2]
53
+ wh = (rb - lt).clamp(min=0) # [N, 2]
54
+ area = wh[:, 0] * wh[:, 1]
55
+ return iou - (area - union) / area
56
+
57
+
58
+ def check_point_inside_box(points: Tensor, boxes: Tensor, eps=1e-9) -> Tensor:
59
+ """
60
+ Args:
61
+ points, [K, 2], (x, y)
62
+ boxes, [N, 4], (x1, y1, x2, y2)
63
+ Returns:
64
+ Tensor (bool), [K, N]
65
+ """
66
+ x, y = [p.unsqueeze(-1) for p in points.unbind(-1)]
67
+ x1, y1, x2, y2 = [x.unsqueeze(0) for x in boxes.unbind(-1)]
68
+
69
+ l = x - x1
70
+ t = y - y1
71
+ r = x2 - x
72
+ b = y2 - y
73
+
74
+ ltrb = torch.stack([l, t, r, b], dim=-1)
75
+ mask = ltrb.min(dim=-1).values > eps
76
+
77
+ return mask
78
+
79
+
80
+ def point_box_distance(points: Tensor, boxes: Tensor) -> Tensor:
81
+ """
82
+ Args:
83
+ boxes, [N, 4], (x1, y1, x2, y2)
84
+ points, [N, 2], (x, y)
85
+ Returns:
86
+ Tensor (N, 4), (l, t, r, b)
87
+ """
88
+ x1y1, x2y2 = torch.split(boxes, 2, dim=-1)
89
+ lt = points - x1y1
90
+ rb = x2y2 - points
91
+ return torch.concat([lt, rb], dim=-1)
92
+
93
+
94
+ def point_distance_box(points: Tensor, distances: Tensor) -> Tensor:
95
+ """
96
+ Args:
97
+ points (Tensor), [N, 2], (x, y)
98
+ distances (Tensor), [N, 4], (l, t, r, b)
99
+ Returns:
100
+ boxes (Tensor), (N, 4), (x1, y1, x2, y2)
101
+ """
102
+ lt, rb = torch.split(distances, 2, dim=-1)
103
+ x1y1 = -lt + points
104
+ x2y2 = rb + points
105
+ boxes = torch.concat([x1y1, x2y2], dim=-1)
106
+ return boxes
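A quick self-contained check of the two point/box helpers above (a sketch, not part of the commit): point_box_distance and point_distance_box invert each other for the same anchor points.

    import torch

    from src.misc.box_ops import point_box_distance, point_distance_box

    boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0],
                          [5.0, 5.0, 20.0, 30.0]])
    points = torch.tensor([[2.0, 3.0],
                           [10.0, 10.0]])

    ltrb = point_box_distance(points, boxes)      # (l, t, r, b) per row
    restored = point_distance_box(points, ltrb)   # back to (x1, y1, x2, y2)
    assert torch.allclose(restored, boxes)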
src/misc/dist_utils.py ADDED
@@ -0,0 +1,281 @@
1
+ """
2
+ reference
3
+ - https://github.com/pytorch/vision/blob/main/references/detection/utils.py
4
+ - https://github.com/facebookresearch/detr/blob/master/util/misc.py#L406
5
+
6
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
7
+ """
8
+
9
+ import atexit
10
+ import os
11
+ import random
12
+ import time
13
+
14
+ import numpy as np
15
+ import torch
16
+ import torch.backends.cudnn
17
+ import torch.distributed
18
+ import torch.nn as nn
19
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
20
+ from torch.nn.parallel import DataParallel as DP
21
+ from torch.nn.parallel import DistributedDataParallel as DDP
22
+ from torch.utils.data import DistributedSampler
23
+
24
+ # from torch.utils.data.dataloader import DataLoader
25
+ from ..data import DataLoader
26
+
27
+
28
+ def setup_distributed(
29
+ print_rank: int = 0,
30
+ print_method: str = "builtin",
31
+ seed: int = None,
32
+ ):
33
+ """
34
+ env setup
35
+ args:
36
+ print_rank,
37
+ print_method, (builtin, rich)
38
+ seed,
39
+ """
40
+ try:
41
+ # https://pytorch.org/docs/stable/elastic/run.html
42
+ RANK = int(os.getenv("RANK", -1))
43
+ LOCAL_RANK = int(os.getenv("LOCAL_RANK", -1))
44
+ WORLD_SIZE = int(os.getenv("WORLD_SIZE", 1))
45
+
46
+ # torch.distributed.init_process_group(backend=backend, init_method='env://')
47
+ torch.distributed.init_process_group(init_method="env://")
48
+ torch.distributed.barrier()
49
+
50
+ rank = torch.distributed.get_rank()
51
+ torch.cuda.set_device(rank)
52
+ torch.cuda.empty_cache()
53
+ enabled_dist = True
54
+ if get_rank() == print_rank:
55
+ print("Initialized distributed mode...")
56
+
57
+ except Exception:
58
+ enabled_dist = False
59
+ print("Not init distributed mode.")
60
+
61
+ setup_print(get_rank() == print_rank, method=print_method)
62
+ if seed is not None:
63
+ setup_seed(seed)
64
+
65
+ return enabled_dist
66
+
67
+
68
+ def setup_print(is_main, method="builtin"):
69
+ """This function disables printing when not in master process"""
70
+ import builtins as __builtin__
71
+
72
+ if method == "builtin":
73
+ builtin_print = __builtin__.print
74
+
75
+ elif method == "rich":
76
+ import rich
77
+
78
+ builtin_print = rich.print
79
+
80
+ else:
81
+ raise AttributeError("")
82
+
83
+ def print(*args, **kwargs):
84
+ force = kwargs.pop("force", False)
85
+ if is_main or force:
86
+ builtin_print(*args, **kwargs)
87
+
88
+ __builtin__.print = print
89
+
90
+
91
+ def is_dist_available_and_initialized():
92
+ if not torch.distributed.is_available():
93
+ return False
94
+ if not torch.distributed.is_initialized():
95
+ return False
96
+ return True
97
+
98
+
99
+ @atexit.register
100
+ def cleanup():
101
+ """cleanup distributed environment"""
102
+ if is_dist_available_and_initialized():
103
+ torch.distributed.barrier()
104
+ torch.distributed.destroy_process_group()
105
+
106
+
107
+ def get_rank():
108
+ if not is_dist_available_and_initialized():
109
+ return 0
110
+ return torch.distributed.get_rank()
111
+
112
+
113
+ def get_world_size():
114
+ if not is_dist_available_and_initialized():
115
+ return 1
116
+ return torch.distributed.get_world_size()
117
+
118
+
119
+ def is_main_process():
120
+ return get_rank() == 0
121
+
122
+
123
+ def save_on_master(*args, **kwargs):
124
+ if is_main_process():
125
+ torch.save(*args, **kwargs)
126
+
127
+
128
+ def warp_model(
129
+ model: torch.nn.Module,
130
+ sync_bn: bool = False,
131
+ dist_mode: str = "ddp",
132
+ find_unused_parameters: bool = False,
133
+ compile: bool = False,
134
+ compile_mode: str = "reduce-overhead",
135
+ **kwargs,
136
+ ):
137
+ if is_dist_available_and_initialized():
138
+ rank = get_rank()
139
+ model = nn.SyncBatchNorm.convert_sync_batchnorm(model) if sync_bn else model
140
+ if dist_mode == "dp":
141
+ model = DP(model, device_ids=[rank], output_device=rank)
142
+ elif dist_mode == "ddp":
143
+ model = DDP(
144
+ model,
145
+ device_ids=[rank],
146
+ output_device=rank,
147
+ find_unused_parameters=find_unused_parameters,
148
+ )
149
+ else:
150
+ raise AttributeError("")
151
+
152
+ if compile:
153
+ model = torch.compile(model, mode=compile_mode)
154
+
155
+ return model
156
+
157
+
158
+ def de_model(model):
159
+ return de_parallel(de_complie(model))
160
+
161
+
162
+ def warp_loader(loader, shuffle=False):
163
+ if is_dist_available_and_initialized():
164
+ sampler = DistributedSampler(loader.dataset, shuffle=shuffle)
165
+ loader = DataLoader(
166
+ loader.dataset,
167
+ loader.batch_size,
168
+ sampler=sampler,
169
+ drop_last=loader.drop_last,
170
+ collate_fn=loader.collate_fn,
171
+ pin_memory=loader.pin_memory,
172
+ num_workers=loader.num_workers,
173
+ )
174
+ return loader
175
+
176
+
177
+ def is_parallel(model) -> bool:
178
+ # Returns True if model is of type DP or DDP
179
+ return type(model) in (
180
+ torch.nn.parallel.DataParallel,
181
+ torch.nn.parallel.DistributedDataParallel,
182
+ )
183
+
184
+
185
+ def de_parallel(model) -> nn.Module:
186
+ # De-parallelize a model: returns single-GPU model if model is of type DP or DDP
187
+ return model.module if is_parallel(model) else model
188
+
189
+
190
+ def reduce_dict(data, avg=True):
191
+ """
192
+ Args
193
+ data dict: input, {k: v, ...}
194
+ avg bool: true
195
+ """
196
+ world_size = get_world_size()
197
+ if world_size < 2:
198
+ return data
199
+
200
+ with torch.no_grad():
201
+ keys, values = [], []
202
+ for k in sorted(data.keys()):
203
+ keys.append(k)
204
+ values.append(data[k])
205
+
206
+ values = torch.stack(values, dim=0)
207
+ torch.distributed.all_reduce(values)
208
+
209
+ if avg is True:
210
+ values /= world_size
211
+
212
+ return {k: v for k, v in zip(keys, values)}
213
+
214
+
215
+ def all_gather(data):
216
+ """
217
+ Run all_gather on arbitrary picklable data (not necessarily tensors)
218
+ Args:
219
+ data: any picklable object
220
+ Returns:
221
+ list[data]: list of data gathered from each rank
222
+ """
223
+ world_size = get_world_size()
224
+ if world_size == 1:
225
+ return [data]
226
+ data_list = [None] * world_size
227
+ torch.distributed.all_gather_object(data_list, data)
228
+ return data_list
229
+
230
+
231
+ def sync_time():
232
+ """sync_time"""
233
+ if torch.cuda.is_available():
234
+ torch.cuda.synchronize()
235
+
236
+ return time.time()
237
+
238
+
239
+ def setup_seed(seed: int, deterministic=False):
240
+ """setup_seed for reproducibility
241
+ torch.manual_seed(3407) is all you need. https://arxiv.org/abs/2109.08203
242
+ """
243
+ seed = seed + get_rank()
244
+ random.seed(seed)
245
+ np.random.seed(seed)
246
+ torch.manual_seed(seed)
247
+
248
+ if torch.cuda.is_available():
249
+ torch.cuda.manual_seed_all(seed)
250
+
251
+ # memory will be large when setting deterministic to True
252
+ if torch.backends.cudnn.is_available() and deterministic:
253
+ torch.backends.cudnn.deterministic = True
254
+
255
+
256
+ # for torch.compile
257
+ def check_compile():
258
+ import warnings
259
+
260
+ import torch
261
+
262
+ gpu_ok = False
263
+ if torch.cuda.is_available():
264
+ device_cap = torch.cuda.get_device_capability()
265
+ if device_cap in ((7, 0), (8, 0), (9, 0)):
266
+ gpu_ok = True
267
+ if not gpu_ok:
268
+ warnings.warn(
269
+ "GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower " "than expected."
270
+ )
271
+ return gpu_ok
272
+
273
+
274
+ def is_compile(model):
275
+ import torch._dynamo
276
+
277
+ return type(model) in (torch._dynamo.OptimizedModule,)
278
+
279
+
280
+ def de_complie(model):
281
+ return model._orig_mod if is_compile(model) else model
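A hedged end-to-end sketch of how these helpers are usually combined under torchrun; `build_model`, `build_dataloader`, and the argument values are placeholders, not part of the commit.

    # launch: torchrun --nproc_per_node=4 train.py
    import torch

    from src.misc.dist_utils import de_model, setup_distributed, warp_loader, warp_model

    setup_distributed(print_rank=0, print_method="builtin", seed=0)

    model = build_model().cuda()        # placeholder factory
    model = warp_model(model, sync_bn=True, dist_mode="ddp", find_unused_parameters=False)
    dataloader = warp_loader(build_dataloader(), shuffle=True)  # placeholder factory

    # ... training loop ...

    torch.save(de_model(model).state_dict(), "checkpoint.pth")  # unwrap DDP / torch.compile first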
src/misc/lazy_loader.py ADDED
@@ -0,0 +1,70 @@
1
+ """
2
+ https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/util/lazy_loader.py
3
+ """
4
+
5
+ import importlib
6
+ import types
7
+
8
+
9
+ class LazyLoader(types.ModuleType):
10
+ """Lazily import a module, mainly to avoid pulling in large dependencies.
11
+
12
+ `paddle`, and `ffmpeg` are examples of modules that are large and not always
13
+ needed, and this allows them to only be loaded when they are used.
14
+ """
15
+
16
+ # The lint error here is incorrect.
17
+ def __init__(self, local_name, parent_module_globals, name, warning=None):
18
+ self._local_name = local_name
19
+ self._parent_module_globals = parent_module_globals
20
+ self._warning = warning
21
+
22
+ # These members allow doctest to correctly process this module member without
23
+ # triggering self._load(). self._load() mutates parent_module_globals and
24
+ # triggers a dict mutated during iteration error from doctest.py.
25
+ # - for from_module()
26
+ self.__module__ = name.rsplit(".", 1)[0]
27
+ # - for is_routine()
28
+ self.__wrapped__ = None
29
+
30
+ super(LazyLoader, self).__init__(name)
31
+
32
+ def _load(self):
33
+ """Load the module and insert it into the parent's globals."""
34
+ # Import the target module and insert it into the parent's namespace
35
+ module = importlib.import_module(self.__name__)
36
+ self._parent_module_globals[self._local_name] = module
37
+
38
+ # Emit a warning if one was specified
39
+ if self._warning:
40
+ # logging.warning(self._warning)
41
+ # Make sure to only warn once.
42
+ self._warning = None
43
+
44
+ # Update this object's dict so that if someone keeps a reference to the
45
+ # LazyLoader, lookups are efficient (__getattr__ is only called on lookups
46
+ # that fail).
47
+ self.__dict__.update(module.__dict__)
48
+
49
+ return module
50
+
51
+ def __getattr__(self, item):
52
+ module = self._load()
53
+ return getattr(module, item)
54
+
55
+ def __repr__(self):
56
+ # Be careful not to trigger _load, since repr may be called in very
57
+ # sensitive places.
58
+ return f"<LazyLoader {self.__name__} as {self._local_name}>"
59
+
60
+ def __dir__(self):
61
+ module = self._load()
62
+ return dir(module)
63
+
64
+
65
+ # import paddle.nn as nn
66
+ # nn = LazyLoader("nn", globals(), "paddle.nn")
67
+
68
+ # class M(nn.Layer):
69
+ # def __init__(self) -> None:
70
+ # super().__init__()
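Extending the commented paddle example above, a minimal sketch with a standard-library module; the alias is arbitrary.

    from src.misc.lazy_loader import LazyLoader

    json = LazyLoader("json", globals(), "json")

    print(repr(json))             # <LazyLoader json as json>; nothing imported yet
    print(json.dumps({"a": 1}))   # first attribute access triggers the real import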
src/misc/logger.py ADDED
@@ -0,0 +1,255 @@
1
+ """
2
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
3
+ https://github.com/facebookresearch/detr/blob/main/util/misc.py
4
+ Mostly copy-paste from torchvision references.
5
+ """
6
+
7
+ import datetime
8
+ import pickle
9
+ import time
10
+ from collections import defaultdict, deque
11
+ from typing import Dict
12
+
13
+ import torch
14
+ import torch.distributed as tdist
15
+
16
+ from .dist_utils import get_world_size, is_dist_available_and_initialized
17
+
18
+
19
+ class SmoothedValue(object):
20
+ """Track a series of values and provide access to smoothed values over a
21
+ window or the global series average.
22
+ """
23
+
24
+ def __init__(self, window_size=20, fmt=None):
25
+ if fmt is None:
26
+ fmt = "{median:.4f} ({global_avg:.4f})"
27
+ self.deque = deque(maxlen=window_size)
28
+ self.total = 0.0
29
+ self.count = 0
30
+ self.fmt = fmt
31
+
32
+ def update(self, value, n=1):
33
+ self.deque.append(value)
34
+ self.count += n
35
+ self.total += value * n
36
+
37
+ def synchronize_between_processes(self):
38
+ """
39
+ Warning: does not synchronize the deque!
40
+ """
41
+ if not is_dist_available_and_initialized():
42
+ return
43
+ t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
44
+ tdist.barrier()
45
+ tdist.all_reduce(t)
46
+ t = t.tolist()
47
+ self.count = int(t[0])
48
+ self.total = t[1]
49
+
50
+ @property
51
+ def median(self):
52
+ d = torch.tensor(list(self.deque))
53
+ return d.median().item()
54
+
55
+ @property
56
+ def avg(self):
57
+ d = torch.tensor(list(self.deque), dtype=torch.float32)
58
+ return d.mean().item()
59
+
60
+ @property
61
+ def global_avg(self):
62
+ return self.total / self.count
63
+
64
+ @property
65
+ def max(self):
66
+ return max(self.deque)
67
+
68
+ @property
69
+ def value(self):
70
+ return self.deque[-1]
71
+
72
+ def __str__(self):
73
+ return self.fmt.format(
74
+ median=self.median,
75
+ avg=self.avg,
76
+ global_avg=self.global_avg,
77
+ max=self.max,
78
+ value=self.value,
79
+ )
80
+
81
+
82
+ def all_gather(data):
83
+ """
84
+ Run all_gather on arbitrary picklable data (not necessarily tensors)
85
+ Args:
86
+ data: any picklable object
87
+ Returns:
88
+ list[data]: list of data gathered from each rank
89
+ """
90
+ world_size = get_world_size()
91
+ if world_size == 1:
92
+ return [data]
93
+
94
+ # serialized to a Tensor
95
+ buffer = pickle.dumps(data)
96
+ storage = torch.ByteStorage.from_buffer(buffer)
97
+ tensor = torch.ByteTensor(storage).to("cuda")
98
+
99
+ # obtain Tensor size of each rank
100
+ local_size = torch.tensor([tensor.numel()], device="cuda")
101
+ size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
102
+ tdist.all_gather(size_list, local_size)
103
+ size_list = [int(size.item()) for size in size_list]
104
+ max_size = max(size_list)
105
+
106
+ # receiving Tensor from all ranks
107
+ # we pad the tensor because torch all_gather does not support
108
+ # gathering tensors of different shapes
109
+ tensor_list = []
110
+ for _ in size_list:
111
+ tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
112
+ if local_size != max_size:
113
+ padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
114
+ tensor = torch.cat((tensor, padding), dim=0)
115
+ tdist.all_gather(tensor_list, tensor)
116
+
117
+ data_list = []
118
+ for size, tensor in zip(size_list, tensor_list):
119
+ buffer = tensor.cpu().numpy().tobytes()[:size]
120
+ data_list.append(pickle.loads(buffer))
121
+
122
+ return data_list
123
+
124
+
125
+ def reduce_dict(input_dict, average=True) -> Dict[str, torch.Tensor]:
126
+ """
127
+ Args:
128
+ input_dict (dict): all the values will be reduced
129
+ average (bool): whether to do average or sum
130
+ Reduce the values in the dictionary from all processes so that all processes
131
+ have the averaged results. Returns a dict with the same fields as
132
+ input_dict, after reduction.
133
+ """
134
+ world_size = get_world_size()
135
+ if world_size < 2:
136
+ return input_dict
137
+ with torch.no_grad():
138
+ names = []
139
+ values = []
140
+ # sort the keys so that they are consistent across processes
141
+ for k in sorted(input_dict.keys()):
142
+ names.append(k)
143
+ values.append(input_dict[k])
144
+ values = torch.stack(values, dim=0)
145
+ tdist.all_reduce(values)
146
+ if average:
147
+ values /= world_size
148
+ reduced_dict = {k: v for k, v in zip(names, values)}
149
+ return reduced_dict
150
+
151
+
152
+ class MetricLogger(object):
153
+ def __init__(self, delimiter="\t"):
154
+ self.meters = defaultdict(SmoothedValue)
155
+ self.delimiter = delimiter
156
+
157
+ def update(self, **kwargs):
158
+ for k, v in kwargs.items():
159
+ if isinstance(v, torch.Tensor):
160
+ v = v.item()
161
+ assert isinstance(v, (float, int))
162
+ self.meters[k].update(v)
163
+
164
+ def __getattr__(self, attr):
165
+ if attr in self.meters:
166
+ return self.meters[attr]
167
+ if attr in self.__dict__:
168
+ return self.__dict__[attr]
169
+ raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr))
170
+
171
+ def __str__(self):
172
+ loss_str = []
173
+ for name, meter in self.meters.items():
174
+ loss_str.append("{}: {}".format(name, str(meter)))
175
+ return self.delimiter.join(loss_str)
176
+
177
+ def synchronize_between_processes(self):
178
+ for meter in self.meters.values():
179
+ meter.synchronize_between_processes()
180
+
181
+ def add_meter(self, name, meter):
182
+ self.meters[name] = meter
183
+
184
+ def log_every(self, iterable, print_freq, header=None):
185
+ i = 0
186
+ if not header:
187
+ header = ""
188
+ start_time = time.time()
189
+ end = time.time()
190
+ iter_time = SmoothedValue(fmt="{avg:.4f}")
191
+ data_time = SmoothedValue(fmt="{avg:.4f}")
192
+ space_fmt = ":" + str(len(str(len(iterable)))) + "d"
193
+ if torch.cuda.is_available():
194
+ log_msg = self.delimiter.join(
195
+ [
196
+ header,
197
+ "[{0" + space_fmt + "}/{1}]",
198
+ "eta: {eta}",
199
+ "{meters}",
200
+ "time: {time}",
201
+ "data: {data}",
202
+ "max mem: {memory:.0f}",
203
+ ]
204
+ )
205
+ else:
206
+ log_msg = self.delimiter.join(
207
+ [
208
+ header,
209
+ "[{0" + space_fmt + "}/{1}]",
210
+ "eta: {eta}",
211
+ "{meters}",
212
+ "time: {time}",
213
+ "data: {data}",
214
+ ]
215
+ )
216
+ MB = 1024.0 * 1024.0
217
+ for obj in iterable:
218
+ data_time.update(time.time() - end)
219
+ yield obj
220
+ iter_time.update(time.time() - end)
221
+ if i % print_freq == 0 or i == len(iterable) - 1:
222
+ eta_seconds = iter_time.global_avg * (len(iterable) - i)
223
+ eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
224
+ if torch.cuda.is_available():
225
+ print(
226
+ log_msg.format(
227
+ i,
228
+ len(iterable),
229
+ eta=eta_string,
230
+ meters=str(self),
231
+ time=str(iter_time),
232
+ data=str(data_time),
233
+ memory=torch.cuda.max_memory_allocated() / MB,
234
+ )
235
+ )
236
+ else:
237
+ print(
238
+ log_msg.format(
239
+ i,
240
+ len(iterable),
241
+ eta=eta_string,
242
+ meters=str(self),
243
+ time=str(iter_time),
244
+ data=str(data_time),
245
+ )
246
+ )
247
+ i += 1
248
+ end = time.time()
249
+ total_time = time.time() - start_time
250
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
251
+ print(
252
+ "{} Total time: {} ({:.4f} s / it)".format(
253
+ header, total_time_str, total_time / len(iterable)
254
+ )
255
+ )
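A short usage sketch for MetricLogger and SmoothedValue inside a training loop; `dataloader`, `train_step`, and `optimizer` are placeholders, not part of the commit.

    from src.misc.logger import MetricLogger, SmoothedValue

    metric_logger = MetricLogger(delimiter="  ")
    metric_logger.add_meter("lr", SmoothedValue(window_size=1, fmt="{value:.6f}"))

    header = "Epoch: [0]"
    for samples, targets in metric_logger.log_every(dataloader, print_freq=10, header=header):
        loss = train_step(samples, targets)                      # placeholder step
        metric_logger.update(loss=loss, lr=optimizer.param_groups[0]["lr"])

    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)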
src/misc/profiler_utils.py ADDED
@@ -0,0 +1,30 @@
1
+ """
2
+ Copyright (c) 2024 The D-FINE Authors. All Rights Reserved.
3
+ """
4
+
5
+ import copy
6
+ from typing import Tuple
7
+
8
+ from calflops import calculate_flops
9
+
10
+
11
+ def stats(
12
+ cfg,
13
+ input_shape: Tuple = (1, 3, 640, 640),
14
+ ) -> Tuple[int, dict]:
15
+ base_size = cfg.train_dataloader.collate_fn.base_size
16
+ input_shape = (1, 3, base_size, base_size)
17
+
18
+ model_for_info = copy.deepcopy(cfg.model).deploy()
19
+
20
+ flops, macs, _ = calculate_flops(
21
+ model=model_for_info,
22
+ input_shape=input_shape,
23
+ output_as_string=True,
24
+ output_precision=4,
25
+ print_detailed=False,
26
+ )
27
+ params = sum(p.numel() for p in model_for_info.parameters())
28
+ del model_for_info
29
+
30
+ return params, {"Model FLOPs:%s MACs:%s Params:%s" % (flops, macs, params)}
src/misc/visualizer.py ADDED
@@ -0,0 +1,121 @@
1
+ """ "
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ import PIL
7
+ import numpy as np
8
+ import torch
9
+ import torch.utils.data
10
+ import torchvision
11
+ from typing import List, Dict
12
+
13
+ torchvision.disable_beta_transforms_warning()
14
+
15
+ __all__ = ["show_sample", "save_samples"]
16
+
17
+ def save_samples(samples: torch.Tensor, targets: List[Dict], output_dir: str, split: str, normalized: bool, box_fmt: str):
18
+ '''
19
+ normalized: whether the boxes are normalized to [0, 1]
20
+ box_fmt: 'xyxy', 'xywh', 'cxcywh', D-FINE uses 'cxcywh' for training, 'xyxy' for validation
21
+ '''
22
+ from torchvision.transforms.functional import to_pil_image
23
+ from torchvision.ops import box_convert
24
+ from pathlib import Path
25
+ from PIL import ImageDraw, ImageFont
26
+ import os
27
+
28
+ os.makedirs(Path(output_dir) / Path(f"{split}_samples"), exist_ok=True)
29
+ # Predefined colors (standard color names recognized by PIL)
30
+ BOX_COLORS = [
31
+ "red", "blue", "green", "orange", "purple",
32
+ "cyan", "magenta", "yellow", "lime", "pink",
33
+ "teal", "lavender", "brown", "beige", "maroon",
34
+ "navy", "olive", "coral", "turquoise", "gold"
35
+ ]
36
+
37
+ LABEL_TEXT_COLOR = "white"
38
+
39
+ font = ImageFont.load_default()
40
+ font.size = 32
41
+
42
+ for i, (sample, target) in enumerate(zip(samples, targets)):
43
+ sample_visualization = sample.clone().cpu()
44
+ target_boxes = target["boxes"].clone().cpu()
45
+ target_labels = target["labels"].clone().cpu()
46
+ target_image_id = target["image_id"].item()
47
+ target_image_path = target["image_path"]
48
+ target_image_path_stem = Path(target_image_path).stem
49
+
50
+ sample_visualization = to_pil_image(sample_visualization)
51
+ sample_visualization_w, sample_visualization_h = sample_visualization.size
52
+
53
+ # normalized to pixel space
54
+ if normalized:
55
+ target_boxes[:, 0] = target_boxes[:, 0] * sample_visualization_w
56
+ target_boxes[:, 2] = target_boxes[:, 2] * sample_visualization_w
57
+ target_boxes[:, 1] = target_boxes[:, 1] * sample_visualization_h
58
+ target_boxes[:, 3] = target_boxes[:, 3] * sample_visualization_h
59
+
60
+ # any box format -> xyxy
61
+ target_boxes = box_convert(target_boxes, in_fmt=box_fmt, out_fmt="xyxy")
62
+
63
+ # clip to image size
64
+ target_boxes[:, 0] = torch.clamp(target_boxes[:, 0], 0, sample_visualization_w)
65
+ target_boxes[:, 1] = torch.clamp(target_boxes[:, 1], 0, sample_visualization_h)
66
+ target_boxes[:, 2] = torch.clamp(target_boxes[:, 2], 0, sample_visualization_w)
67
+ target_boxes[:, 3] = torch.clamp(target_boxes[:, 3], 0, sample_visualization_h)
68
+
69
+ target_boxes = target_boxes.numpy().astype(np.int32)
70
+ target_labels = target_labels.numpy().astype(np.int32)
71
+
72
+ draw = ImageDraw.Draw(sample_visualization)
73
+
74
+ # draw target boxes
75
+ for box, label in zip(target_boxes, target_labels):
76
+ x1, y1, x2, y2 = box
77
+
78
+ # Select color based on class ID
79
+ box_color = BOX_COLORS[int(label) % len(BOX_COLORS)]
80
+
81
+ # Draw box (thick)
82
+ draw.rectangle([x1, y1, x2, y2], outline=box_color, width=3)
83
+
84
+ label_text = f"{label}"
85
+
86
+ # Measure text size
87
+ text_width, text_height = draw.textbbox((0, 0), label_text, font=font)[2:4]
88
+
89
+ # Draw text background
90
+ padding = 2
91
+ draw.rectangle(
92
+ [x1, y1 - text_height - padding * 2, x1 + text_width + padding * 2, y1],
93
+ fill=box_color
94
+ )
95
+
96
+ # Draw text (LABEL_TEXT_COLOR)
97
+ draw.text((x1 + padding, y1 - text_height - padding), label_text,
98
+ fill=LABEL_TEXT_COLOR, font=font)
99
+
100
+ save_path = Path(output_dir) / f"{split}_samples" / f"{target_image_id}_{target_image_path_stem}.webp"
101
+ sample_visualization.save(save_path)
102
+
103
+ def show_sample(sample):
104
+ """for coco dataset/dataloader"""
105
+ import matplotlib.pyplot as plt
106
+ from torchvision.transforms.v2 import functional as F
107
+ from torchvision.utils import draw_bounding_boxes
108
+
109
+ image, target = sample
110
+ if isinstance(image, PIL.Image.Image):
111
+ image = F.to_image_tensor(image)
112
+
113
+ image = F.convert_dtype(image, torch.uint8)
114
+ annotated_image = draw_bounding_boxes(image, target["boxes"], colors="yellow", width=3)
115
+
116
+ fig, ax = plt.subplots()
117
+ ax.imshow(annotated_image.permute(1, 2, 0).numpy())
118
+ ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
119
+ fig.tight_layout()
120
+ fig.show()
121
+ plt.show()
src/nn/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ from .arch import *
7
+
8
+ #
9
+ from .backbone import *
10
+ from .backbone import (
11
+ FrozenBatchNorm2d,
12
+ freeze_batch_norm2d,
13
+ get_activation,
14
+ )
15
+ from .criterion import *
16
+ from .postprocessor import *
src/nn/arch/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ from .classification import ClassHead, Classification
7
+ from .yolo import YOLO
src/nn/arch/classification.py ADDED
@@ -0,0 +1,45 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+ from ...core import register
10
+
11
+ __all__ = ["Classification", "ClassHead"]
12
+
13
+
14
+ @register()
15
+ class Classification(torch.nn.Module):
16
+ __inject__ = ["backbone", "head"]
17
+
18
+ def __init__(self, backbone: nn.Module, head: nn.Module = None):
19
+ super().__init__()
20
+
21
+ self.backbone = backbone
22
+ self.head = head
23
+
24
+ def forward(self, x):
25
+ x = self.backbone(x)
26
+
27
+ if self.head is not None:
28
+ x = self.head(x)
29
+
30
+ return x
31
+
32
+
33
+ @register()
34
+ class ClassHead(nn.Module):
35
+ def __init__(self, hidden_dim, num_classes):
36
+ super().__init__()
37
+ self.pool = nn.AdaptiveAvgPool2d(1)
38
+ self.proj = nn.Linear(hidden_dim, num_classes)
39
+
40
+ def forward(self, x):
41
+ x = x[0] if isinstance(x, (list, tuple)) else x
42
+ x = self.pool(x)
43
+ x = x.reshape(x.shape[0], -1)
44
+ x = self.proj(x)
45
+ return x
src/nn/arch/yolo.py ADDED
@@ -0,0 +1,42 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ import torch
7
+
8
+ from ...core import register
9
+
10
+ __all__ = [
11
+ "YOLO",
12
+ ]
13
+
14
+
15
+ @register()
16
+ class YOLO(torch.nn.Module):
17
+ __inject__ = [
18
+ "backbone",
19
+ "neck",
20
+ "head",
21
+ ]
22
+
23
+ def __init__(self, backbone: torch.nn.Module, neck, head):
24
+ super().__init__()
25
+ self.backbone = backbone
26
+ self.neck = neck
27
+ self.head = head
28
+
29
+ def forward(self, x, **kwargs):
30
+ x = self.backbone(x)
31
+ x = self.neck(x)
32
+ x = self.head(x)
33
+ return x
34
+
35
+ def deploy(
36
+ self,
37
+ ):
38
+ self.eval()
39
+ for m in self.modules():
40
+ if m is not self and hasattr(m, "deploy"):
41
+ m.deploy()
42
+ return self
src/nn/backbone/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ from .common import (
7
+ FrozenBatchNorm2d,
8
+ freeze_batch_norm2d,
9
+ get_activation,
10
+ )
11
+ from .csp_darknet import CSPPAN, CSPDarkNet
12
+ from .csp_resnet import CSPResNet
13
+ from .hgnetv2 import HGNetv2
14
+ from .presnet import PResNet
15
+ from .test_resnet import MResNet
16
+ from .timm_model import TimmModel
17
+ from .torchvision_model import TorchVisionModel
src/nn/backbone/common.py ADDED
@@ -0,0 +1,117 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+
10
+ class ConvNormLayer(nn.Module):
11
+ def __init__(self, ch_in, ch_out, kernel_size, stride, padding=None, bias=False, act=None):
12
+ super().__init__()
13
+ self.conv = nn.Conv2d(
14
+ ch_in,
15
+ ch_out,
16
+ kernel_size,
17
+ stride,
18
+ padding=(kernel_size - 1) // 2 if padding is None else padding,
19
+ bias=bias,
20
+ )
21
+ self.norm = nn.BatchNorm2d(ch_out)
22
+ self.act = nn.Identity() if act is None else get_activation(act)
23
+
24
+ def forward(self, x):
25
+ return self.act(self.norm(self.conv(x)))
26
+
27
+
28
+ class FrozenBatchNorm2d(nn.Module):
29
+ """copy and modified from https://github.com/facebookresearch/detr/blob/master/models/backbone.py
30
+ BatchNorm2d where the batch statistics and the affine parameters are fixed.
31
+ Copy-paste from torchvision.misc.ops with added eps before rsqrt,
32
+ without which any models other than torchvision.models.resnet[18,34,50,101]
33
+ produce nans.
34
+ """
35
+
36
+ def __init__(self, num_features, eps=1e-5):
37
+ super(FrozenBatchNorm2d, self).__init__()
38
+ n = num_features
39
+ self.register_buffer("weight", torch.ones(n))
40
+ self.register_buffer("bias", torch.zeros(n))
41
+ self.register_buffer("running_mean", torch.zeros(n))
42
+ self.register_buffer("running_var", torch.ones(n))
43
+ self.eps = eps
44
+ self.num_features = n
45
+
46
+ def _load_from_state_dict(
47
+ self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
48
+ ):
49
+ num_batches_tracked_key = prefix + "num_batches_tracked"
50
+ if num_batches_tracked_key in state_dict:
51
+ del state_dict[num_batches_tracked_key]
52
+
53
+ super(FrozenBatchNorm2d, self)._load_from_state_dict(
54
+ state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
55
+ )
56
+
57
+ def forward(self, x):
58
+ # move reshapes to the beginning
59
+ # to make it fuser-friendly
60
+ w = self.weight.reshape(1, -1, 1, 1)
61
+ b = self.bias.reshape(1, -1, 1, 1)
62
+ rv = self.running_var.reshape(1, -1, 1, 1)
63
+ rm = self.running_mean.reshape(1, -1, 1, 1)
64
+ scale = w * (rv + self.eps).rsqrt()
65
+ bias = b - rm * scale
66
+ return x * scale + bias
67
+
68
+ def extra_repr(self):
69
+ return "{num_features}, eps={eps}".format(**self.__dict__)
70
+
71
+
72
+ def freeze_batch_norm2d(module: nn.Module) -> nn.Module:
73
+ if isinstance(module, nn.BatchNorm2d):
74
+ module = FrozenBatchNorm2d(module.num_features)
75
+ else:
76
+ for name, child in module.named_children():
77
+ _child = freeze_batch_norm2d(child)
78
+ if _child is not child:
79
+ setattr(module, name, _child)
80
+ return module
81
+
82
+
83
+ def get_activation(act: str, inplace: bool = True):
84
+ """get activation"""
85
+ if act is None:
86
+ return nn.Identity()
87
+
88
+ elif isinstance(act, nn.Module):
89
+ return act
90
+
91
+ act = act.lower()
92
+
93
+ if act == "silu" or act == "swish":
94
+ m = nn.SiLU()
95
+
96
+ elif act == "relu":
97
+ m = nn.ReLU()
98
+
99
+ elif act == "leaky_relu":
100
+ m = nn.LeakyReLU()
101
+
105
+ elif act == "gelu":
106
+ m = nn.GELU()
107
+
108
+ elif act == "hardsigmoid":
109
+ m = nn.Hardsigmoid()
110
+
111
+ else:
112
+ raise RuntimeError(f"Unsupported activation: {act}")
113
+
114
+ if hasattr(m, "inplace"):
115
+ m.inplace = inplace
116
+
117
+ return m
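A short sketch of the two helpers above (illustrative, using the names defined in this file): get_activation resolves a string or an existing module, and freeze_batch_norm2d recursively swaps nn.BatchNorm2d children for FrozenBatchNorm2d.

    import torch.nn as nn

    act = get_activation("silu")  # -> nn.SiLU with inplace=True

    m = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())
    m = freeze_batch_norm2d(m)
    assert isinstance(m[1], FrozenBatchNorm2d)  # stats and affine are now fixed buffers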
src/nn/backbone/csp_darknet.py ADDED
@@ -0,0 +1,203 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ import math
7
+ import warnings
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+ from ...core import register
14
+ from .common import get_activation
15
+
16
+
17
+ def autopad(k, p=None):
18
+ if p is None:
19
+ p = k // 2 if isinstance(k, int) else [x // 2 for x in k]
20
+ return p
21
+
22
+
23
+ def make_divisible(c, d):
24
+ return math.ceil(c / d) * d
25
+
26
+
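For orientation, this is the scaling arithmetic that CSPDarkNet below applies with these helpers, worked out for the width_multi=0.75 / depth_multi=0.33 pair used in the demo at the end of this file (illustrative sketch):

    channels = [make_divisible(c * 0.75, 8) for c in [64, 128, 256, 512, 1024]]
    depths = [max(round(d * 0.33), 1) for d in [3, 6, 9, 3]]
    print(channels, depths)  # [48, 96, 192, 384, 768] [1, 2, 3, 1]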
27
+ class Conv(nn.Module):
28
+ def __init__(self, cin, cout, k=1, s=1, p=None, g=1, act="silu") -> None:
29
+ super().__init__()
30
+ self.conv = nn.Conv2d(cin, cout, k, s, autopad(k, p), groups=g, bias=False)
31
+ self.bn = nn.BatchNorm2d(cout)
32
+ self.act = get_activation(act, inplace=True)
33
+
34
+ def forward(self, x):
35
+ return self.act(self.bn(self.conv(x)))
36
+
37
+
38
+ class Bottleneck(nn.Module):
39
+ # Standard bottleneck
40
+ def __init__(self, c1, c2, shortcut=True, g=1, e=0.5, act="silu"):
41
+ super().__init__()
42
+ c_ = int(c2 * e) # hidden channels
43
+ self.cv1 = Conv(c1, c_, 1, 1, act=act)
44
+ self.cv2 = Conv(c_, c2, 3, 1, g=g, act=act)
45
+ self.add = shortcut and c1 == c2
46
+
47
+ def forward(self, x):
48
+ return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
49
+
50
+
51
+ class C3(nn.Module):
52
+ # CSP Bottleneck with 3 convolutions
53
+ def __init__(
54
+ self, c1, c2, n=1, shortcut=True, g=1, e=0.5, act="silu"
55
+ ): # ch_in, ch_out, number, shortcut, groups, expansion
56
+ super().__init__()
57
+ c_ = int(c2 * e) # hidden channels
58
+ self.cv1 = Conv(c1, c_, 1, 1, act=act)
59
+ self.cv2 = Conv(c1, c_, 1, 1, act=act)
60
+ self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0, act=act) for _ in range(n)))
61
+ self.cv3 = Conv(2 * c_, c2, 1, act=act)
62
+
63
+ def forward(self, x):
64
+ return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))
65
+
66
+
67
+ class SPPF(nn.Module):
68
+ # Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher
69
+ def __init__(self, c1, c2, k=5, act="silu"): # equivalent to SPP(k=(5, 9, 13))
70
+ super().__init__()
71
+ c_ = c1 // 2 # hidden channels
72
+ self.cv1 = Conv(c1, c_, 1, 1, act=act)
73
+ self.cv2 = Conv(c_ * 4, c2, 1, 1, act=act)
74
+ self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
75
+
76
+ def forward(self, x):
77
+ x = self.cv1(x)
78
+ with warnings.catch_warnings():
79
+ warnings.simplefilter("ignore") # suppress torch 1.9.0 max_pool2d() warning
80
+ y1 = self.m(x)
81
+ y2 = self.m(y1)
82
+ return self.cv2(torch.cat([x, y1, y2, self.m(y2)], 1))
83
+
84
+
85
+ @register()
86
+ class CSPDarkNet(nn.Module):
87
+ __share__ = ["depth_multi", "width_multi"]
88
+
89
+ def __init__(
90
+ self,
91
+ in_channels=3,
92
+ width_multi=1.0,
93
+ depth_multi=1.0,
94
+ return_idx=[2, 3, -1],
95
+ act="silu",
96
+ ) -> None:
97
+ super().__init__()
98
+
99
+ channels = [64, 128, 256, 512, 1024]
100
+ channels = [make_divisible(c * width_multi, 8) for c in channels]
101
+
102
+ depths = [3, 6, 9, 3]
103
+ depths = [max(round(d * depth_multi), 1) for d in depths]
104
+
105
+ self.layers = nn.ModuleList([Conv(in_channels, channels[0], 6, 2, 2, act=act)])
106
+ for i, (c, d) in enumerate(zip(channels, depths), 1):
107
+ layer = nn.Sequential(
108
+ *[Conv(c, channels[i], 3, 2, act=act), C3(channels[i], channels[i], n=d, act=act)]
109
+ )
110
+ self.layers.append(layer)
111
+
112
+ self.layers.append(SPPF(channels[-1], channels[-1], k=5, act=act))
113
+
114
+ self.return_idx = return_idx
115
+ self.out_channels = [channels[i] for i in self.return_idx]
116
+ self.strides = [[2, 4, 8, 16, 32][i] for i in self.return_idx]
117
+ self.depths = depths
118
+ self.act = act
119
+
120
+ def forward(self, x):
121
+ outputs = []
122
+ for _, m in enumerate(self.layers):
123
+ x = m(x)
124
+ outputs.append(x)
125
+
126
+ return [outputs[i] for i in self.return_idx]
127
+
128
+
129
+ @register()
130
+ class CSPPAN(nn.Module):
131
+ """
132
+ P5 ---> 1x1 ---------------------------------> concat --> c3 --> det
133
+ | up | conv /2
134
+ P4 ---> concat ---> c3 ---> 1x1 --> concat ---> c3 -----------> det
135
+ | up | conv /2
136
+ P3 -----------------------> concat ---> c3 ---------------------> det
137
+ """
138
+
139
+ __share__ = [
140
+ "depth_multi",
141
+ ]
142
+
143
+ def __init__(self, in_channels=[256, 512, 1024], depth_multi=1.0, act="silu") -> None:
144
+ super().__init__()
145
+ depth = max(round(3 * depth_multi), 1)
146
+
147
+ self.out_channels = in_channels
148
+ self.fpn_stems = nn.ModuleList(
149
+ [
150
+ Conv(cin, cout, 1, 1, act=act)
151
+ for cin, cout in zip(in_channels[::-1], in_channels[::-1][1:])
152
+ ]
153
+ )
154
+ self.fpn_csps = nn.ModuleList(
155
+ [
156
+ C3(cin, cout, depth, False, act=act)
157
+ for cin, cout in zip(in_channels[::-1], in_channels[::-1][1:])
158
+ ]
159
+ )
160
+
161
+ self.pan_stems = nn.ModuleList([Conv(c, c, 3, 2, act=act) for c in in_channels[:-1]])
162
+ self.pan_csps = nn.ModuleList([C3(c, c, depth, False, act=act) for c in in_channels[1:]])
163
+
164
+ def forward(self, feats):
165
+ fpn_feats = []
166
+ for i, feat in enumerate(feats[::-1]):
167
+ if i == 0:
168
+ feat = self.fpn_stems[i](feat)
169
+ fpn_feats.append(feat)
170
+ else:
171
+ _feat = F.interpolate(fpn_feats[-1], scale_factor=2, mode="nearest")
172
+ feat = torch.concat([_feat, feat], dim=1)
173
+ feat = self.fpn_csps[i - 1](feat)
174
+ if i < len(self.fpn_stems):
175
+ feat = self.fpn_stems[i](feat)
176
+ fpn_feats.append(feat)
177
+
178
+ pan_feats = []
179
+ for i, feat in enumerate(fpn_feats[::-1]):
180
+ if i == 0:
181
+ pan_feats.append(feat)
182
+ else:
183
+ _feat = self.pan_stems[i - 1](pan_feats[-1])
184
+ feat = torch.concat([_feat, feat], dim=1)
185
+ feat = self.pan_csps[i - 1](feat)
186
+ pan_feats.append(feat)
187
+
188
+ return pan_feats
189
+
190
+
191
+ if __name__ == "__main__":
192
+ data = torch.rand(1, 3, 320, 640)
193
+
194
+ width_multi = 0.75
195
+ depth_multi = 0.33
196
+
197
+ m = CSPDarkNet(3, width_multi=width_multi, depth_multi=depth_multi, act="silu")
198
+ outputs = m(data)
199
+ print([o.shape for o in outputs])
200
+
201
+ m = CSPPAN(in_channels=m.out_channels, depth_multi=depth_multi, act="silu")
202
+ outputs = m(outputs)
203
+ print([o.shape for o in outputs])
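With those settings the first print should show the stride-8/16/32 features for the 320x640 input, roughly [1, 192, 40, 80], [1, 384, 20, 40] and [1, 768, 10, 20]; CSPPAN keeps the channel count and resolution at each level, so the second print should match.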
src/nn/backbone/csp_resnet.py ADDED
@@ -0,0 +1,302 @@
1
+ """
2
+ https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.6/ppdet/modeling/backbones/cspresnet.py
3
+
4
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
5
+ """
6
+
7
+ from collections import OrderedDict
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+ from ...core import register
14
+ from .common import get_activation
15
+
16
+ __all__ = ["CSPResNet"]
17
+
18
+
19
+ donwload_url = {
20
+ "s": "https://github.com/lyuwenyu/storage/releases/download/v0.1/CSPResNetb_s_pretrained_from_paddle.pth",
21
+ "m": "https://github.com/lyuwenyu/storage/releases/download/v0.1/CSPResNetb_m_pretrained_from_paddle.pth",
22
+ "l": "https://github.com/lyuwenyu/storage/releases/download/v0.1/CSPResNetb_l_pretrained_from_paddle.pth",
23
+ "x": "https://github.com/lyuwenyu/storage/releases/download/v0.1/CSPResNetb_x_pretrained_from_paddle.pth",
24
+ }
25
+
26
+
27
+ class ConvBNLayer(nn.Module):
28
+ def __init__(self, ch_in, ch_out, filter_size=3, stride=1, groups=1, padding=0, act=None):
29
+ super().__init__()
30
+ self.conv = nn.Conv2d(
31
+ ch_in, ch_out, filter_size, stride, padding, groups=groups, bias=False
32
+ )
33
+ self.bn = nn.BatchNorm2d(ch_out)
34
+ self.act = get_activation(act)
35
+
36
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
37
+ x = self.conv(x)
38
+ x = self.bn(x)
39
+ x = self.act(x)
40
+ return x
41
+
42
+
43
+ class RepVggBlock(nn.Module):
44
+ def __init__(self, ch_in, ch_out, act="relu", alpha: bool = False):
45
+ super().__init__()
46
+ self.ch_in = ch_in
47
+ self.ch_out = ch_out
48
+ self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=None)
49
+ self.conv2 = ConvBNLayer(ch_in, ch_out, 1, stride=1, padding=0, act=None)
50
+ self.act = get_activation(act)
51
+
52
+ if alpha:
53
+ self.alpha = nn.Parameter(
54
+ torch.ones(
55
+ 1,
56
+ )
57
+ )
58
+ else:
59
+ self.alpha = None
60
+
61
+ def forward(self, x):
62
+ if hasattr(self, "conv"):
63
+ y = self.conv(x)
64
+ else:
65
+ if self.alpha:
66
+ y = self.conv1(x) + self.alpha * self.conv2(x)
67
+ else:
68
+ y = self.conv1(x) + self.conv2(x)
69
+ y = self.act(y)
70
+ return y
71
+
72
+ def convert_to_deploy(self):
73
+ if not hasattr(self, "conv"):
74
+ self.conv = nn.Conv2d(self.ch_in, self.ch_out, 3, 1, padding=1)
75
+
76
+ kernel, bias = self.get_equivalent_kernel_bias()
77
+ self.conv.weight.data = kernel
78
+ self.conv.bias.data = bias
79
+
80
+ def get_equivalent_kernel_bias(self):
81
+ kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
82
+ kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
83
+
84
+ if self.alpha:
85
+ return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor(
86
+ kernel1x1
87
+ ), bias3x3 + self.alpha * bias1x1
88
+ else:
89
+ return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1), bias3x3 + bias1x1
90
+
91
+ def _pad_1x1_to_3x3_tensor(self, kernel1x1):
92
+ if kernel1x1 is None:
93
+ return 0
94
+ else:
95
+ return F.pad(kernel1x1, [1, 1, 1, 1])
96
+
97
+ def _fuse_bn_tensor(self, branch: ConvBNLayer):
98
+ if branch is None:
99
+ return 0, 0
100
+ kernel = branch.conv.weight
101
+ running_mean = branch.bn.running_mean
102
+ running_var = branch.bn.running_var
103
+ gamma = branch.bn.weight
104
+ beta = branch.bn.bias
105
+ eps = branch.bn.eps
106
+ std = (running_var + eps).sqrt()
107
+ t = (gamma / std).reshape(-1, 1, 1, 1)
108
+ return kernel * t, beta - running_mean * gamma / std
109
+
110
+
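A small self-check sketch (illustrative, using RepVggBlock as defined above; it relies on _fuse_bn_tensor reading the batch norm stored as `bn` on ConvBNLayer): in eval mode the fused 3x3 convolution produced by convert_to_deploy should reproduce the two-branch output up to numerical tolerance.

    import torch

    blk = RepVggBlock(32, 32, act="relu").eval()
    x = torch.rand(1, 32, 16, 16)
    y_two_branch = blk(x)      # conv1(x) + conv2(x), then activation
    blk.convert_to_deploy()    # builds self.conv from the fused kernel/bias
    y_fused = blk(x)           # single 3x3 conv path
    print(torch.allclose(y_two_branch, y_fused, atol=1e-5))  # expected: True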
111
+ class BasicBlock(nn.Module):
112
+ def __init__(self, ch_in, ch_out, act="relu", shortcut=True, use_alpha=False):
113
+ super().__init__()
114
+ assert ch_in == ch_out
115
+ self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act)
116
+ self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha)
117
+ self.shortcut = shortcut
118
+
119
+ def forward(self, x):
120
+ y = self.conv1(x)
121
+ y = self.conv2(y)
122
+ if self.shortcut:
123
+ return x + y
124
+ else:
125
+ return y
126
+
127
+
128
+ class EffectiveSELayer(nn.Module):
129
+ """Effective Squeeze-Excitation
130
+ From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667
131
+ """
132
+
133
+ def __init__(self, channels, act="hardsigmoid"):
134
+ super(EffectiveSELayer, self).__init__()
135
+ self.fc = nn.Conv2d(channels, channels, kernel_size=1, padding=0)
136
+ self.act = get_activation(act)
137
+
138
+ def forward(self, x: torch.Tensor):
139
+ x_se = x.mean((2, 3), keepdim=True)
140
+ x_se = self.fc(x_se)
141
+ x_se = self.act(x_se)
142
+ return x * x_se
143
+
144
+
145
+ class CSPResStage(nn.Module):
146
+ def __init__(self, block_fn, ch_in, ch_out, n, stride, act="relu", attn="eca", use_alpha=False):
147
+ super().__init__()
148
+ ch_mid = (ch_in + ch_out) // 2
149
+ if stride == 2:
150
+ self.conv_down = ConvBNLayer(ch_in, ch_mid, 3, stride=2, padding=1, act=act)
151
+ else:
152
+ self.conv_down = None
153
+ self.conv1 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act)
154
+ self.conv2 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act)
155
+ self.blocks = nn.Sequential(
156
+ *[
157
+ block_fn(ch_mid // 2, ch_mid // 2, act=act, shortcut=True, use_alpha=use_alpha)
158
+ for i in range(n)
159
+ ]
160
+ )
161
+ if attn:
162
+ self.attn = EffectiveSELayer(ch_mid, act="hardsigmoid")
163
+ else:
164
+ self.attn = None
165
+
166
+ self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act)
167
+
168
+ def forward(self, x):
169
+ if self.conv_down is not None:
170
+ x = self.conv_down(x)
171
+ y1 = self.conv1(x)
172
+ y2 = self.blocks(self.conv2(x))
173
+ y = torch.concat([y1, y2], dim=1)
174
+ if self.attn is not None:
175
+ y = self.attn(y)
176
+ y = self.conv3(y)
177
+ return y
178
+
179
+
180
+ @register()
181
+ class CSPResNet(nn.Module):
182
+ layers = [3, 6, 6, 3]
183
+ channels = [64, 128, 256, 512, 1024]
184
+ model_cfg = {
185
+ "s": {
186
+ "depth_mult": 0.33,
187
+ "width_mult": 0.50,
188
+ },
189
+ "m": {
190
+ "depth_mult": 0.67,
191
+ "width_mult": 0.75,
192
+ },
193
+ "l": {
194
+ "depth_mult": 1.00,
195
+ "width_mult": 1.00,
196
+ },
197
+ "x": {
198
+ "depth_mult": 1.33,
199
+ "width_mult": 1.25,
200
+ },
201
+ }
202
+
203
+ def __init__(
204
+ self,
205
+ name: str,
206
+ act="silu",
207
+ return_idx=[1, 2, 3],
208
+ use_large_stem=True,
209
+ use_alpha=False,
210
+ pretrained=False,
211
+ ):
212
+ super().__init__()
213
+ depth_mult = self.model_cfg[name]["depth_mult"]
214
+ width_mult = self.model_cfg[name]["width_mult"]
215
+
216
+ channels = [max(round(c * width_mult), 1) for c in self.channels]
217
+ layers = [max(round(l * depth_mult), 1) for l in self.layers]
218
+ act = get_activation(act)
219
+
220
+ if use_large_stem:
221
+ self.stem = nn.Sequential(
222
+ OrderedDict(
223
+ [
224
+ (
225
+ "conv1",
226
+ ConvBNLayer(3, channels[0] // 2, 3, stride=2, padding=1, act=act),
227
+ ),
228
+ (
229
+ "conv2",
230
+ ConvBNLayer(
231
+ channels[0] // 2, channels[0] // 2, 3, stride=1, padding=1, act=act
232
+ ),
233
+ ),
234
+ (
235
+ "conv3",
236
+ ConvBNLayer(
237
+ channels[0] // 2, channels[0], 3, stride=1, padding=1, act=act
238
+ ),
239
+ ),
240
+ ]
241
+ )
242
+ )
243
+ else:
244
+ self.stem = nn.Sequential(
245
+ OrderedDict(
246
+ [
247
+ (
248
+ "conv1",
249
+ ConvBNLayer(3, channels[0] // 2, 3, stride=2, padding=1, act=act),
250
+ ),
251
+ (
252
+ "conv2",
253
+ ConvBNLayer(
254
+ channels[0] // 2, channels[0], 3, stride=1, padding=1, act=act
255
+ ),
256
+ ),
257
+ ]
258
+ )
259
+ )
260
+
261
+ n = len(channels) - 1
262
+ self.stages = nn.Sequential(
263
+ OrderedDict(
264
+ [
265
+ (
266
+ str(i),
267
+ CSPResStage(
268
+ BasicBlock,
269
+ channels[i],
270
+ channels[i + 1],
271
+ layers[i],
272
+ 2,
273
+ act=act,
274
+ use_alpha=use_alpha,
275
+ ),
276
+ )
277
+ for i in range(n)
278
+ ]
279
+ )
280
+ )
281
+
282
+ self._out_channels = channels[1:]
283
+ self._out_strides = [4 * 2**i for i in range(n)]
284
+ self.return_idx = return_idx
285
+
286
+ if pretrained:
287
+ if isinstance(pretrained, bool) or "http" in pretrained:
288
+ state = torch.hub.load_state_dict_from_url(donwload_url[name], map_location="cpu")
289
+ else:
290
+ state = torch.load(pretrained, map_location="cpu")
291
+ self.load_state_dict(state)
292
+ print(f"Load CSPResNet_{name} state_dict")
293
+
294
+ def forward(self, x):
295
+ x = self.stem(x)
296
+ outs = []
297
+ for idx, stage in enumerate(self.stages):
298
+ x = stage(x)
299
+ if idx in self.return_idx:
300
+ outs.append(x)
301
+
302
+ return outs
src/nn/backbone/hgnetv2.py ADDED
@@ -0,0 +1,581 @@
1
+ """
2
+ reference
3
+ - https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
4
+
5
+ Copyright (c) 2024 The D-FINE Authors. All Rights Reserved.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+
15
+ from ...core import register
16
+ from .common import FrozenBatchNorm2d
17
+
18
+ # Constants for initialization
19
+ kaiming_normal_ = nn.init.kaiming_normal_
20
+ zeros_ = nn.init.zeros_
21
+ ones_ = nn.init.ones_
22
+
23
+ __all__ = ["HGNetv2"]
24
+
25
+ def safe_barrier():
26
+ if torch.distributed.is_available() and torch.distributed.is_initialized():
27
+ torch.distributed.barrier()
28
+ else:
29
+ pass
30
+
31
+ def safe_get_rank():
32
+ if torch.distributed.is_available() and torch.distributed.is_initialized():
33
+ return torch.distributed.get_rank()
34
+ else:
35
+ return 0
36
+
37
+ class LearnableAffineBlock(nn.Module):
38
+ def __init__(self, scale_value=1.0, bias_value=0.0):
39
+ super().__init__()
40
+ self.scale = nn.Parameter(torch.tensor([scale_value]), requires_grad=True)
41
+ self.bias = nn.Parameter(torch.tensor([bias_value]), requires_grad=True)
42
+
43
+ def forward(self, x):
44
+ return self.scale * x + self.bias
45
+
46
+
47
+ class ConvBNAct(nn.Module):
48
+ def __init__(
49
+ self,
50
+ in_chs,
51
+ out_chs,
52
+ kernel_size,
53
+ stride=1,
54
+ groups=1,
55
+ padding="",
56
+ use_act=True,
57
+ use_lab=False,
58
+ ):
59
+ super().__init__()
60
+ self.use_act = use_act
61
+ self.use_lab = use_lab
62
+ if padding == "same":
63
+ self.conv = nn.Sequential(
64
+ nn.ZeroPad2d([0, 1, 0, 1]),
65
+ nn.Conv2d(in_chs, out_chs, kernel_size, stride, groups=groups, bias=False),
66
+ )
67
+ else:
68
+ self.conv = nn.Conv2d(
69
+ in_chs,
70
+ out_chs,
71
+ kernel_size,
72
+ stride,
73
+ padding=(kernel_size - 1) // 2,
74
+ groups=groups,
75
+ bias=False,
76
+ )
77
+ self.bn = nn.BatchNorm2d(out_chs)
78
+ if self.use_act:
79
+ self.act = nn.ReLU()
80
+ else:
81
+ self.act = nn.Identity()
82
+ if self.use_act and self.use_lab:
83
+ self.lab = LearnableAffineBlock()
84
+ else:
85
+ self.lab = nn.Identity()
86
+
87
+ def forward(self, x):
88
+ x = self.conv(x)
89
+ x = self.bn(x)
90
+ x = self.act(x)
91
+ x = self.lab(x)
92
+ return x
93
+
94
+
95
+ class LightConvBNAct(nn.Module):
96
+ def __init__(
97
+ self,
98
+ in_chs,
99
+ out_chs,
100
+ kernel_size,
101
+ groups=1,
102
+ use_lab=False,
103
+ ):
104
+ super().__init__()
105
+ self.conv1 = ConvBNAct(
106
+ in_chs,
107
+ out_chs,
108
+ kernel_size=1,
109
+ use_act=False,
110
+ use_lab=use_lab,
111
+ )
112
+ self.conv2 = ConvBNAct(
113
+ out_chs,
114
+ out_chs,
115
+ kernel_size=kernel_size,
116
+ groups=out_chs,
117
+ use_act=True,
118
+ use_lab=use_lab,
119
+ )
120
+
121
+ def forward(self, x):
122
+ x = self.conv1(x)
123
+ x = self.conv2(x)
124
+ return x
125
+
126
+
127
+ class StemBlock(nn.Module):
128
+ # for HGNetv2
129
+ def __init__(self, in_chs, mid_chs, out_chs, use_lab=False):
130
+ super().__init__()
131
+ self.stem1 = ConvBNAct(
132
+ in_chs,
133
+ mid_chs,
134
+ kernel_size=3,
135
+ stride=2,
136
+ use_lab=use_lab,
137
+ )
138
+ self.stem2a = ConvBNAct(
139
+ mid_chs,
140
+ mid_chs // 2,
141
+ kernel_size=2,
142
+ stride=1,
143
+ use_lab=use_lab,
144
+ )
145
+ self.stem2b = ConvBNAct(
146
+ mid_chs // 2,
147
+ mid_chs,
148
+ kernel_size=2,
149
+ stride=1,
150
+ use_lab=use_lab,
151
+ )
152
+ self.stem3 = ConvBNAct(
153
+ mid_chs * 2,
154
+ mid_chs,
155
+ kernel_size=3,
156
+ stride=2,
157
+ use_lab=use_lab,
158
+ )
159
+ self.stem4 = ConvBNAct(
160
+ mid_chs,
161
+ out_chs,
162
+ kernel_size=1,
163
+ stride=1,
164
+ use_lab=use_lab,
165
+ )
166
+ self.pool = nn.MaxPool2d(kernel_size=2, stride=1, ceil_mode=True)
167
+
168
+ def forward(self, x):
169
+ x = self.stem1(x)
170
+ x = F.pad(x, (0, 1, 0, 1))
171
+ x2 = self.stem2a(x)
172
+ x2 = F.pad(x2, (0, 1, 0, 1))
173
+ x2 = self.stem2b(x2)
174
+ x1 = self.pool(x)
175
+ x = torch.cat([x1, x2], dim=1)
176
+ x = self.stem3(x)
177
+ x = self.stem4(x)
178
+ return x
179
+
180
+
181
+ class EseModule(nn.Module):
182
+ def __init__(self, chs):
183
+ super().__init__()
184
+ self.conv = nn.Conv2d(
185
+ chs,
186
+ chs,
187
+ kernel_size=1,
188
+ stride=1,
189
+ padding=0,
190
+ )
191
+ self.sigmoid = nn.Sigmoid()
192
+
193
+ def forward(self, x):
194
+ identity = x
195
+ x = x.mean((2, 3), keepdim=True)
196
+ x = self.conv(x)
197
+ x = self.sigmoid(x)
198
+ return torch.mul(identity, x)
199
+
200
+
201
+ class HG_Block(nn.Module):
202
+ def __init__(
203
+ self,
204
+ in_chs,
205
+ mid_chs,
206
+ out_chs,
207
+ layer_num,
208
+ kernel_size=3,
209
+ residual=False,
210
+ light_block=False,
211
+ use_lab=False,
212
+ agg="ese",
213
+ drop_path=0.0,
214
+ ):
215
+ super().__init__()
216
+ self.residual = residual
217
+
218
+ self.layers = nn.ModuleList()
219
+ for i in range(layer_num):
220
+ if light_block:
221
+ self.layers.append(
222
+ LightConvBNAct(
223
+ in_chs if i == 0 else mid_chs,
224
+ mid_chs,
225
+ kernel_size=kernel_size,
226
+ use_lab=use_lab,
227
+ )
228
+ )
229
+ else:
230
+ self.layers.append(
231
+ ConvBNAct(
232
+ in_chs if i == 0 else mid_chs,
233
+ mid_chs,
234
+ kernel_size=kernel_size,
235
+ stride=1,
236
+ use_lab=use_lab,
237
+ )
238
+ )
239
+
240
+ # feature aggregation
241
+ total_chs = in_chs + layer_num * mid_chs
242
+ if agg == "se":
243
+ aggregation_squeeze_conv = ConvBNAct(
244
+ total_chs,
245
+ out_chs // 2,
246
+ kernel_size=1,
247
+ stride=1,
248
+ use_lab=use_lab,
249
+ )
250
+ aggregation_excitation_conv = ConvBNAct(
251
+ out_chs // 2,
252
+ out_chs,
253
+ kernel_size=1,
254
+ stride=1,
255
+ use_lab=use_lab,
256
+ )
257
+ self.aggregation = nn.Sequential(
258
+ aggregation_squeeze_conv,
259
+ aggregation_excitation_conv,
260
+ )
261
+ else:
262
+ aggregation_conv = ConvBNAct(
263
+ total_chs,
264
+ out_chs,
265
+ kernel_size=1,
266
+ stride=1,
267
+ use_lab=use_lab,
268
+ )
269
+ att = EseModule(out_chs)
270
+ self.aggregation = nn.Sequential(
271
+ aggregation_conv,
272
+ att,
273
+ )
274
+
275
+ self.drop_path = nn.Dropout(drop_path) if drop_path else nn.Identity()
276
+
277
+ def forward(self, x):
278
+ identity = x
279
+ output = [x]
280
+ for layer in self.layers:
281
+ x = layer(x)
282
+ output.append(x)
283
+ x = torch.cat(output, dim=1)
284
+ x = self.aggregation(x)
285
+ if self.residual:
286
+ x = self.drop_path(x) + identity
287
+ return x
288
+
289
+
290
+ class HG_Stage(nn.Module):
291
+ def __init__(
292
+ self,
293
+ in_chs,
294
+ mid_chs,
295
+ out_chs,
296
+ block_num,
297
+ layer_num,
298
+ downsample=True,
299
+ light_block=False,
300
+ kernel_size=3,
301
+ use_lab=False,
302
+ agg="se",
303
+ drop_path=0.0,
304
+ ):
305
+ super().__init__()
306
+ self.downsample = downsample
307
+ if downsample:
308
+ self.downsample = ConvBNAct(
309
+ in_chs,
310
+ in_chs,
311
+ kernel_size=3,
312
+ stride=2,
313
+ groups=in_chs,
314
+ use_act=False,
315
+ use_lab=use_lab,
316
+ )
317
+ else:
318
+ self.downsample = nn.Identity()
319
+
320
+ blocks_list = []
321
+ for i in range(block_num):
322
+ blocks_list.append(
323
+ HG_Block(
324
+ in_chs if i == 0 else out_chs,
325
+ mid_chs,
326
+ out_chs,
327
+ layer_num,
328
+ residual=False if i == 0 else True,
329
+ kernel_size=kernel_size,
330
+ light_block=light_block,
331
+ use_lab=use_lab,
332
+ agg=agg,
333
+ drop_path=drop_path[i] if isinstance(drop_path, (list, tuple)) else drop_path,
334
+ )
335
+ )
336
+ self.blocks = nn.Sequential(*blocks_list)
337
+
338
+ def forward(self, x):
339
+ x = self.downsample(x)
340
+ x = self.blocks(x)
341
+ return x
342
+
343
+
344
+ @register()
345
+ class HGNetv2(nn.Module):
346
+ """
347
+ HGNetV2
348
+ Args:
349
+ stem_channels: list. Number of channels for the stem block.
350
+ stage_type: str. The stage configuration of HGNet. such as the number of channels, stride, etc.
351
+ use_lab: boolean. Whether to use LearnableAffineBlock in network.
352
+ lr_mult_list: list. Control the learning rate of different stages.
353
+ Returns:
354
+ model: nn.Layer. Specific HGNetV2 model depends on args.
355
+ """
356
+
357
+ arch_configs = {
358
+ "B0": {
359
+ "stem_channels": [3, 16, 16],
360
+ "stage_config": {
361
+ # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
362
+ "stage1": [16, 16, 64, 1, False, False, 3, 3],
363
+ "stage2": [64, 32, 256, 1, True, False, 3, 3],
364
+ "stage3": [256, 64, 512, 2, True, True, 5, 3],
365
+ "stage4": [512, 128, 1024, 1, True, True, 5, 3],
366
+ },
367
+ "url": "https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B0_stage1.pth",
368
+ },
369
+ "B1": {
370
+ "stem_channels": [3, 24, 32],
371
+ "stage_config": {
372
+ # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
373
+ "stage1": [32, 32, 64, 1, False, False, 3, 3],
374
+ "stage2": [64, 48, 256, 1, True, False, 3, 3],
375
+ "stage3": [256, 96, 512, 2, True, True, 5, 3],
376
+ "stage4": [512, 192, 1024, 1, True, True, 5, 3],
377
+ },
378
+ "url": "https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B1_stage1.pth",
379
+ },
380
+ "B2": {
381
+ "stem_channels": [3, 24, 32],
382
+ "stage_config": {
383
+ # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
384
+ "stage1": [32, 32, 96, 1, False, False, 3, 4],
385
+ "stage2": [96, 64, 384, 1, True, False, 3, 4],
386
+ "stage3": [384, 128, 768, 3, True, True, 5, 4],
387
+ "stage4": [768, 256, 1536, 1, True, True, 5, 4],
388
+ },
389
+ "url": "https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B2_stage1.pth",
390
+ },
391
+ "B3": {
392
+ "stem_channels": [3, 24, 32],
393
+ "stage_config": {
394
+ # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
395
+ "stage1": [32, 32, 128, 1, False, False, 3, 5],
396
+ "stage2": [128, 64, 512, 1, True, False, 3, 5],
397
+ "stage3": [512, 128, 1024, 3, True, True, 5, 5],
398
+ "stage4": [1024, 256, 2048, 1, True, True, 5, 5],
399
+ },
400
+ "url": "https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B3_stage1.pth",
401
+ },
402
+ "B4": {
403
+ "stem_channels": [3, 32, 48],
404
+ "stage_config": {
405
+ # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
406
+ "stage1": [48, 48, 128, 1, False, False, 3, 6],
407
+ "stage2": [128, 96, 512, 1, True, False, 3, 6],
408
+ "stage3": [512, 192, 1024, 3, True, True, 5, 6],
409
+ "stage4": [1024, 384, 2048, 1, True, True, 5, 6],
410
+ },
411
+ "url": "https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B4_stage1.pth",
412
+ },
413
+ "B5": {
414
+ "stem_channels": [3, 32, 64],
415
+ "stage_config": {
416
+ # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
417
+ "stage1": [64, 64, 128, 1, False, False, 3, 6],
418
+ "stage2": [128, 128, 512, 2, True, False, 3, 6],
419
+ "stage3": [512, 256, 1024, 5, True, True, 5, 6],
420
+ "stage4": [1024, 512, 2048, 2, True, True, 5, 6],
421
+ },
422
+ "url": "https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B5_stage1.pth",
423
+ },
424
+ "B6": {
425
+ "stem_channels": [3, 48, 96],
426
+ "stage_config": {
427
+ # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
428
+ "stage1": [96, 96, 192, 2, False, False, 3, 6],
429
+ "stage2": [192, 192, 512, 3, True, False, 3, 6],
430
+ "stage3": [512, 384, 1024, 6, True, True, 5, 6],
431
+ "stage4": [1024, 768, 2048, 3, True, True, 5, 6],
432
+ },
433
+ "url": "https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B6_stage1.pth",
434
+ },
435
+ }
436
+
437
+ def __init__(
438
+ self,
439
+ name,
440
+ use_lab=False,
441
+ return_idx=[1, 2, 3],
442
+ freeze_stem_only=True,
443
+ freeze_at=0,
444
+ freeze_norm=True,
445
+ pretrained=True,
446
+ local_model_dir="weight/hgnetv2/",
447
+ ):
448
+ super().__init__()
449
+ self.use_lab = use_lab
450
+ self.return_idx = return_idx
451
+
452
+ stem_channels = self.arch_configs[name]["stem_channels"]
453
+ stage_config = self.arch_configs[name]["stage_config"]
454
+ download_url = self.arch_configs[name]["url"]
455
+
456
+ self._out_strides = [4, 8, 16, 32]
457
+ self._out_channels = [stage_config[k][2] for k in stage_config]
458
+
459
+ # stem
460
+ self.stem = StemBlock(
461
+ in_chs=stem_channels[0],
462
+ mid_chs=stem_channels[1],
463
+ out_chs=stem_channels[2],
464
+ use_lab=use_lab,
465
+ )
466
+
467
+ # stages
468
+ self.stages = nn.ModuleList()
469
+ for i, k in enumerate(stage_config):
470
+ (
471
+ in_channels,
472
+ mid_channels,
473
+ out_channels,
474
+ block_num,
475
+ downsample,
476
+ light_block,
477
+ kernel_size,
478
+ layer_num,
479
+ ) = stage_config[k]
480
+ self.stages.append(
481
+ HG_Stage(
482
+ in_channels,
483
+ mid_channels,
484
+ out_channels,
485
+ block_num,
486
+ layer_num,
487
+ downsample,
488
+ light_block,
489
+ kernel_size,
490
+ use_lab,
491
+ )
492
+ )
493
+
494
+ if freeze_at >= 0:
495
+ self._freeze_parameters(self.stem)
496
+ if not freeze_stem_only:
497
+ for i in range(min(freeze_at + 1, len(self.stages))):
498
+ self._freeze_parameters(self.stages[i])
499
+
500
+ if freeze_norm:
501
+ self._freeze_norm(self)
502
+
503
+ if pretrained:
504
+ RED, GREEN, RESET = "\033[91m", "\033[92m", "\033[0m"
505
+ try:
506
+ model_path = local_model_dir + "PPHGNetV2_" + name + "_stage1.pth"
507
+ if os.path.exists(model_path):
508
+ state = torch.load(model_path, map_location="cpu")
509
+ print(f"Loaded stage1 {name} HGNetV2 from local file.")
510
+ else:
511
+ # If the file doesn't exist locally, download from the URL
512
+ if safe_get_rank() == 0:
513
+ print(
514
+ GREEN
515
+ + "If the pretrained HGNetV2 can't be downloaded automatically. Please check your network connection."
516
+ + RESET
517
+ )
518
+ print(
519
+ GREEN
520
+ + "Please check your network connection. Or download the model manually from "
521
+ + RESET
522
+ + f"{download_url}"
523
+ + GREEN
524
+ + " to "
525
+ + RESET
526
+ + f"{local_model_dir}."
527
+ + RESET
528
+ )
529
+ state = torch.hub.load_state_dict_from_url(
530
+ download_url, map_location="cpu", model_dir=local_model_dir
531
+ )
532
+ safe_barrier()
533
+ else:
534
+ safe_barrier()
535
+ state = torch.load(model_path, map_location="cpu")
536
+
537
+ print(f"Loaded stage1 {name} HGNetV2 from URL.")
538
+
539
+ self.load_state_dict(state)
540
+
541
+ except (Exception, KeyboardInterrupt) as e:
542
+ if safe_get_rank() == 0:
543
+ print(f"{str(e)}")
544
+ logging.error(
545
+ RED + "CRITICAL WARNING: Failed to load pretrained HGNetV2 model" + RESET
546
+ )
547
+ logging.error(
548
+ GREEN
549
+ + "Please check your network connection. Or download the model manually from "
550
+ + RESET
551
+ + f"{download_url}"
552
+ + GREEN
553
+ + " to "
554
+ + RESET
555
+ + f"{local_model_dir}."
556
+ + RESET
557
+ )
558
+ exit()
559
+
560
+ def _freeze_norm(self, m: nn.Module):
561
+ if isinstance(m, nn.BatchNorm2d):
562
+ m = FrozenBatchNorm2d(m.num_features)
563
+ else:
564
+ for name, child in m.named_children():
565
+ _child = self._freeze_norm(child)
566
+ if _child is not child:
567
+ setattr(m, name, _child)
568
+ return m
569
+
570
+ def _freeze_parameters(self, m: nn.Module):
571
+ for p in m.parameters():
572
+ p.requires_grad = False
573
+
574
+ def forward(self, x):
575
+ x = self.stem(x)
576
+ outs = []
577
+ for idx, stage in enumerate(self.stages):
578
+ x = stage(x)
579
+ if idx in self.return_idx:
580
+ outs.append(x)
581
+ return outs
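A minimal instantiation sketch (illustrative): the smallest variant without pretrained weights, returning the stride-8/16/32 stages selected by the default return_idx=[1, 2, 3].

    import torch

    m = HGNetv2("B0", pretrained=False)
    feats = m(torch.rand(1, 3, 640, 640))
    print([f.shape for f in feats])
    # expected: [1, 256, 80, 80], [1, 512, 40, 40], [1, 1024, 20, 20]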
src/nn/backbone/presnet.py ADDED
@@ -0,0 +1,263 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ from collections import OrderedDict
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+
12
+ from ...core import register
13
+ from .common import FrozenBatchNorm2d, get_activation
14
+
15
+ __all__ = ["PResNet"]
16
+
17
+
18
+ ResNet_cfg = {
19
+ 18: [2, 2, 2, 2],
20
+ 34: [3, 4, 6, 3],
21
+ 50: [3, 4, 6, 3],
22
+ 101: [3, 4, 23, 3],
23
+ # 152: [3, 8, 36, 3],
24
+ }
25
+
26
+
27
+ donwload_url = {
28
+ 18: "https://github.com/lyuwenyu/storage/releases/download/v0.1/ResNet18_vd_pretrained_from_paddle.pth",
29
+ 34: "https://github.com/lyuwenyu/storage/releases/download/v0.1/ResNet34_vd_pretrained_from_paddle.pth",
30
+ 50: "https://github.com/lyuwenyu/storage/releases/download/v0.1/ResNet50_vd_ssld_v2_pretrained_from_paddle.pth",
31
+ 101: "https://github.com/lyuwenyu/storage/releases/download/v0.1/ResNet101_vd_ssld_pretrained_from_paddle.pth",
32
+ }
33
+
34
+
35
+ class ConvNormLayer(nn.Module):
36
+ def __init__(self, ch_in, ch_out, kernel_size, stride, padding=None, bias=False, act=None):
37
+ super().__init__()
38
+ self.conv = nn.Conv2d(
39
+ ch_in,
40
+ ch_out,
41
+ kernel_size,
42
+ stride,
43
+ padding=(kernel_size - 1) // 2 if padding is None else padding,
44
+ bias=bias,
45
+ )
46
+ self.norm = nn.BatchNorm2d(ch_out)
47
+ self.act = get_activation(act)
48
+
49
+ def forward(self, x):
50
+ return self.act(self.norm(self.conv(x)))
51
+
52
+
53
+ class BasicBlock(nn.Module):
54
+ expansion = 1
55
+
56
+ def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
57
+ super().__init__()
58
+
59
+ self.shortcut = shortcut
60
+
61
+ if not shortcut:
62
+ if variant == "d" and stride == 2:
63
+ self.short = nn.Sequential(
64
+ OrderedDict(
65
+ [
66
+ ("pool", nn.AvgPool2d(2, 2, 0, ceil_mode=True)),
67
+ ("conv", ConvNormLayer(ch_in, ch_out, 1, 1)),
68
+ ]
69
+ )
70
+ )
71
+ else:
72
+ self.short = ConvNormLayer(ch_in, ch_out, 1, stride)
73
+
74
+ self.branch2a = ConvNormLayer(ch_in, ch_out, 3, stride, act=act)
75
+ self.branch2b = ConvNormLayer(ch_out, ch_out, 3, 1, act=None)
76
+ self.act = nn.Identity() if act is None else get_activation(act)
77
+
78
+ def forward(self, x):
79
+ out = self.branch2a(x)
80
+ out = self.branch2b(out)
81
+ if self.shortcut:
82
+ short = x
83
+ else:
84
+ short = self.short(x)
85
+
86
+ out = out + short
87
+ out = self.act(out)
88
+
89
+ return out
90
+
91
+
92
+ class BottleNeck(nn.Module):
93
+ expansion = 4
94
+
95
+ def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
96
+ super().__init__()
97
+
98
+ if variant == "a":
99
+ stride1, stride2 = stride, 1
100
+ else:
101
+ stride1, stride2 = 1, stride
102
+
103
+ width = ch_out
104
+
105
+ self.branch2a = ConvNormLayer(ch_in, width, 1, stride1, act=act)
106
+ self.branch2b = ConvNormLayer(width, width, 3, stride2, act=act)
107
+ self.branch2c = ConvNormLayer(width, ch_out * self.expansion, 1, 1)
108
+
109
+ self.shortcut = shortcut
110
+ if not shortcut:
111
+ if variant == "d" and stride == 2:
112
+ self.short = nn.Sequential(
113
+ OrderedDict(
114
+ [
115
+ ("pool", nn.AvgPool2d(2, 2, 0, ceil_mode=True)),
116
+ ("conv", ConvNormLayer(ch_in, ch_out * self.expansion, 1, 1)),
117
+ ]
118
+ )
119
+ )
120
+ else:
121
+ self.short = ConvNormLayer(ch_in, ch_out * self.expansion, 1, stride)
122
+
123
+ self.act = nn.Identity() if act is None else get_activation(act)
124
+
125
+ def forward(self, x):
126
+ out = self.branch2a(x)
127
+ out = self.branch2b(out)
128
+ out = self.branch2c(out)
129
+
130
+ if self.shortcut:
131
+ short = x
132
+ else:
133
+ short = self.short(x)
134
+
135
+ out = out + short
136
+ out = self.act(out)
137
+
138
+ return out
139
+
140
+
141
+ class Blocks(nn.Module):
142
+ def __init__(self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"):
143
+ super().__init__()
144
+
145
+ self.blocks = nn.ModuleList()
146
+ for i in range(count):
147
+ self.blocks.append(
148
+ block(
149
+ ch_in,
150
+ ch_out,
151
+ stride=2 if i == 0 and stage_num != 2 else 1,
152
+ shortcut=False if i == 0 else True,
153
+ variant=variant,
154
+ act=act,
155
+ )
156
+ )
157
+
158
+ if i == 0:
159
+ ch_in = ch_out * block.expansion
160
+
161
+ def forward(self, x):
162
+ out = x
163
+ for block in self.blocks:
164
+ out = block(out)
165
+ return out
166
+
167
+
168
+ @register()
169
+ class PResNet(nn.Module):
170
+ def __init__(
171
+ self,
172
+ depth,
173
+ variant="d",
174
+ num_stages=4,
175
+ return_idx=[0, 1, 2, 3],
176
+ act="relu",
177
+ freeze_at=-1,
178
+ freeze_norm=True,
179
+ pretrained=False,
180
+ ):
181
+ super().__init__()
182
+
183
+ block_nums = ResNet_cfg[depth]
184
+ ch_in = 64
185
+ if variant in ["c", "d"]:
186
+ conv_def = [
187
+ [3, ch_in // 2, 3, 2, "conv1_1"],
188
+ [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"],
189
+ [ch_in // 2, ch_in, 3, 1, "conv1_3"],
190
+ ]
191
+ else:
192
+ conv_def = [[3, ch_in, 7, 2, "conv1_1"]]
193
+
194
+ self.conv1 = nn.Sequential(
195
+ OrderedDict(
196
+ [
197
+ (name, ConvNormLayer(cin, cout, k, s, act=act))
198
+ for cin, cout, k, s, name in conv_def
199
+ ]
200
+ )
201
+ )
202
+
203
+ ch_out_list = [64, 128, 256, 512]
204
+ block = BottleNeck if depth >= 50 else BasicBlock
205
+
206
+ _out_channels = [block.expansion * v for v in ch_out_list]
207
+ _out_strides = [4, 8, 16, 32]
208
+
209
+ self.res_layers = nn.ModuleList()
210
+ for i in range(num_stages):
211
+ stage_num = i + 2
212
+ self.res_layers.append(
213
+ Blocks(
214
+ block, ch_in, ch_out_list[i], block_nums[i], stage_num, act=act, variant=variant
215
+ )
216
+ )
217
+ ch_in = _out_channels[i]
218
+
219
+ self.return_idx = return_idx
220
+ self.out_channels = [_out_channels[_i] for _i in return_idx]
221
+ self.out_strides = [_out_strides[_i] for _i in return_idx]
222
+
223
+ if freeze_at >= 0:
224
+ self._freeze_parameters(self.conv1)
225
+ for i in range(min(freeze_at, num_stages)):
226
+ self._freeze_parameters(self.res_layers[i])
227
+
228
+ if freeze_norm:
229
+ self._freeze_norm(self)
230
+
231
+ if pretrained:
232
+ if isinstance(pretrained, bool) or "http" in pretrained:
233
+ state = torch.hub.load_state_dict_from_url(
234
+ donwload_url[depth], map_location="cpu", model_dir="weight"
235
+ )
236
+ else:
237
+ state = torch.load(pretrained, map_location="cpu")
238
+ self.load_state_dict(state)
239
+ print(f"Load PResNet{depth} state_dict")
240
+
241
+ def _freeze_parameters(self, m: nn.Module):
242
+ for p in m.parameters():
243
+ p.requires_grad = False
244
+
245
+ def _freeze_norm(self, m: nn.Module):
246
+ if isinstance(m, nn.BatchNorm2d):
247
+ m = FrozenBatchNorm2d(m.num_features)
248
+ else:
249
+ for name, child in m.named_children():
250
+ _child = self._freeze_norm(child)
251
+ if _child is not child:
252
+ setattr(m, name, _child)
253
+ return m
254
+
255
+ def forward(self, x):
256
+ conv1 = self.conv1(x)
257
+ x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1)
258
+ outs = []
259
+ for idx, stage in enumerate(self.res_layers):
260
+ x = stage(x)
261
+ if idx in self.return_idx:
262
+ outs.append(x)
263
+ return outs
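A quick sketch (illustrative): a ResNet-50-style backbone returning the last three stages, the form typically consumed by a detection neck.

    import torch

    m = PResNet(depth=50, return_idx=[1, 2, 3], pretrained=False)
    print(m.out_channels, m.out_strides)  # [512, 1024, 2048] [8, 16, 32]
    feats = m(torch.rand(1, 3, 640, 640))  # [1, 512, 80, 80], [1, 1024, 40, 40], [1, 2048, 20, 20]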
src/nn/backbone/test_resnet.py ADDED
@@ -0,0 +1,83 @@
1
+ from collections import OrderedDict
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ from ...core import register
8
+
9
+
10
+ class BasicBlock(nn.Module):
11
+ expansion = 1
12
+
13
+ def __init__(self, in_planes, planes, stride=1):
14
+ super(BasicBlock, self).__init__()
15
+
16
+ self.conv1 = nn.Conv2d(
17
+ in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
18
+ )
19
+ self.bn1 = nn.BatchNorm2d(planes)
20
+
21
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
22
+ self.bn2 = nn.BatchNorm2d(planes)
23
+
24
+ self.shortcut = nn.Sequential()
25
+ if stride != 1 or in_planes != self.expansion * planes:
26
+ self.shortcut = nn.Sequential(
27
+ nn.Conv2d(
28
+ in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False
29
+ ),
30
+ nn.BatchNorm2d(self.expansion * planes),
31
+ )
32
+
33
+ def forward(self, x):
34
+ out = F.relu(self.bn1(self.conv1(x)))
35
+ out = self.bn2(self.conv2(out))
36
+ out += self.shortcut(x)
37
+ out = F.relu(out)
38
+ return out
39
+
40
+
41
+ class _ResNet(nn.Module):
42
+ def __init__(self, block, num_blocks, num_classes=10):
43
+ super().__init__()
44
+ self.in_planes = 64
45
+
46
+ self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
47
+ self.bn1 = nn.BatchNorm2d(64)
48
+
49
+ self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
50
+ self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
51
+ self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
52
+ self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
53
+
54
+ self.linear = nn.Linear(512 * block.expansion, num_classes)
55
+
56
+ def _make_layer(self, block, planes, num_blocks, stride):
57
+ strides = [stride] + [1] * (num_blocks - 1)
58
+ layers = []
59
+ for stride in strides:
60
+ layers.append(block(self.in_planes, planes, stride))
61
+ self.in_planes = planes * block.expansion
62
+ return nn.Sequential(*layers)
63
+
64
+ def forward(self, x):
65
+ out = F.relu(self.bn1(self.conv1(x)))
66
+ out = self.layer1(out)
67
+ out = self.layer2(out)
68
+ out = self.layer3(out)
69
+ out = self.layer4(out)
70
+ out = F.avg_pool2d(out, 4)
71
+ out = out.view(out.size(0), -1)
72
+ out = self.linear(out)
73
+ return out
74
+
75
+
76
+ @register()
77
+ class MResNet(nn.Module):
78
+ def __init__(self, num_classes=10, num_blocks=[2, 2, 2, 2]) -> None:
79
+ super().__init__()
80
+ self.model = _ResNet(BasicBlock, num_blocks, num_classes)
81
+
82
+ def forward(self, x):
83
+ return self.model(x)
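A sketch (illustrative): this is a CIFAR-style ResNet, so it expects small inputs; the 3x3 stem and the final 4x4 average pool assume 32x32 images.

    import torch

    m = MResNet(num_classes=10)
    logits = m(torch.rand(2, 3, 32, 32))  # -> [2, 10]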
src/nn/backbone/timm_model.py ADDED
@@ -0,0 +1,66 @@
1
+ """Copyright(c) 2023 lyuwenyu. All Rights Reserved.
2
+
3
+ https://towardsdatascience.com/getting-started-with-pytorch-image-models-timm-a-practitioners-guide-4e77b4bf9055#0583
4
+ """
5
+
6
+ import torch
7
+ from torchvision.models.feature_extraction import create_feature_extractor, get_graph_node_names
8
+
9
+ from ...core import register
10
+ from .utils import IntermediateLayerGetter
11
+
12
+
13
+ @register()
14
+ class TimmModel(torch.nn.Module):
15
+ def __init__(
16
+ self, name, return_layers, pretrained=False, exportable=True, features_only=True, **kwargs
17
+ ) -> None:
18
+ super().__init__()
19
+
20
+ import timm
21
+
22
+ model = timm.create_model(
23
+ name,
24
+ pretrained=pretrained,
25
+ exportable=exportable,
26
+ features_only=features_only,
27
+ **kwargs,
28
+ )
29
+ # nodes, _ = get_graph_node_names(model)
30
+ # print(nodes)
31
+ # features = {'': ''}
32
+ # model = create_feature_extractor(model, return_nodes=features)
33
+
34
+ assert set(return_layers).issubset(
35
+ model.feature_info.module_name()
36
+ ), f"return_layers should be a subset of {model.feature_info.module_name()}"
37
+
38
+ # self.model = model
39
+ self.model = IntermediateLayerGetter(model, return_layers)
40
+
41
+ return_idx = [model.feature_info.module_name().index(name) for name in return_layers]
42
+ self.strides = [model.feature_info.reduction()[i] for i in return_idx]
43
+ self.channels = [model.feature_info.channels()[i] for i in return_idx]
44
+ self.return_idx = return_idx
45
+ self.return_layers = return_layers
46
+
47
+ def forward(self, x: torch.Tensor):
48
+ outputs = self.model(x)
49
+ # outputs = [outputs[i] for i in self.return_idx]
50
+ return outputs
51
+
52
+
53
+ if __name__ == "__main__":
54
+ model = TimmModel(name="resnet34", return_layers=["layer2", "layer3"])
55
+ data = torch.rand(1, 3, 640, 640)
56
+ outputs = model(data)
57
+
58
+ for output in outputs:
59
+ print(output.shape)
60
+
61
+ """
62
+ model:
63
+ type: TimmModel
64
+ name: resnet34
65
+ return_layers: ['layer2', 'layer4']
66
+ """
src/nn/backbone/torchvision_model.py ADDED
@@ -0,0 +1,50 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ import torch
7
+ import torchvision
8
+
9
+ from ...core import register
10
+ from .utils import IntermediateLayerGetter
11
+
12
+ __all__ = ["TorchVisionModel"]
13
+
14
+
15
+ @register()
16
+ class TorchVisionModel(torch.nn.Module):
17
+ def __init__(self, name, return_layers, weights=None, **kwargs) -> None:
18
+ super().__init__()
19
+
20
+ if weights is not None:
21
+ weights = getattr(torchvision.models.get_model_weights(name), weights)
22
+
23
+ model = torchvision.models.get_model(name, weights=weights, **kwargs)
24
+
25
+ # TODO hard code.
26
+ if hasattr(model, "features"):
27
+ model = IntermediateLayerGetter(model.features, return_layers)
28
+ else:
29
+ model = IntermediateLayerGetter(model, return_layers)
30
+
31
+ self.model = model
32
+
33
+ def forward(self, x):
34
+ return self.model(x)
35
+
36
+
37
+ # TorchVisionModel('swin_t', return_layers=['5', '7'])
38
+ # TorchVisionModel('resnet34', return_layers=['layer2','layer3', 'layer4'])
39
+
40
+ # TorchVisionModel:
41
+ # name: swin_t
42
+ # return_layers: ['5', '7']
43
+ # weights: DEFAULT
44
+
45
+
46
+ # model:
47
+ # type: TorchVisionModel
48
+ # name: resnet34
49
+ # return_layers: ['layer2','layer3', 'layer4']
50
+ # weights: DEFAULT
src/nn/backbone/utils.py ADDED
@@ -0,0 +1,56 @@
1
+ """
2
+ https://github.com/pytorch/vision/blob/main/torchvision/models/_utils.py
3
+
4
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
5
+ """
6
+
7
+ from collections import OrderedDict
8
+ from typing import Dict, List
9
+
10
+ import torch.nn as nn
11
+
12
+
13
+ class IntermediateLayerGetter(nn.ModuleDict):
14
+ """
15
+ Module wrapper that returns intermediate layers from a model
16
+
17
+ It has a strong assumption that the modules have been registered
18
+ into the model in the same order as they are used.
19
+ This means that one should **not** reuse the same nn.Module
20
+ twice in the forward if you want this to work.
21
+
22
+ Additionally, it is only able to query submodules that are directly
23
+ assigned to the model. So if `model` is passed, `model.feature1` can
24
+ be returned, but not `model.feature1.layer2`.
25
+ """
26
+
27
+ _version = 3
28
+
29
+ def __init__(self, model: nn.Module, return_layers: List[str]) -> None:
30
+ if not set(return_layers).issubset([name for name, _ in model.named_children()]):
31
+ raise ValueError(
32
+ "return_layers are not present in model. {}".format(
33
+ [name for name, _ in model.named_children()]
34
+ )
35
+ )
36
+ orig_return_layers = return_layers
37
+ return_layers = {str(k): str(k) for k in return_layers}
38
+ layers = OrderedDict()
39
+ for name, module in model.named_children():
40
+ layers[name] = module
41
+ if name in return_layers:
42
+ del return_layers[name]
43
+ if not return_layers:
44
+ break
45
+
46
+ super().__init__(layers)
47
+ self.return_layers = orig_return_layers
48
+
49
+ def forward(self, x):
50
+ outputs = []
51
+ for name, module in self.items():
52
+ x = module(x)
53
+ if name in self.return_layers:
54
+ outputs.append(x)
55
+
56
+ return outputs
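A sketch (illustrative): pulling two intermediate stages out of a torchvision ResNet-18 with the getter above. Only direct children can be requested, and they are executed in registration order.

    import torch
    import torchvision

    net = torchvision.models.resnet18(weights=None)
    getter = IntermediateLayerGetter(net, return_layers=["layer2", "layer3"])
    feats = getter(torch.rand(1, 3, 224, 224))
    print([f.shape for f in feats])  # [1, 128, 28, 28], [1, 256, 14, 14]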
src/nn/criterion/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ """
2
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
3
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
4
+ """
5
+
6
+ import torch.nn as nn
7
+
8
+ from ...core import register
9
+ from .det_criterion import DetCriterion
10
+
11
+ CrossEntropyLoss = register()(nn.CrossEntropyLoss)
src/nn/criterion/det_criterion.py ADDED
@@ -0,0 +1,188 @@
+ """
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+ """
+
+ import torch
+ import torch.distributed
+ import torch.nn.functional as F
+ import torchvision
+
+ from ...core import register
+ from ...misc import box_ops, dist_utils
+
+
+ @register()
+ class DetCriterion(torch.nn.Module):
+     """Default Detection Criterion"""
+
+     __share__ = ["num_classes"]
+     __inject__ = ["matcher"]
+
+     def __init__(
+         self,
+         losses,
+         weight_dict,
+         num_classes=80,
+         alpha=0.75,
+         gamma=2.0,
+         box_fmt="cxcywh",
+         matcher=None,
+     ):
+         """
+         Args:
+             losses (list[str]): requested losses, supports ['boxes', 'giou', 'vfl', 'focal']
+             weight_dict (dict[str, float]): weights for the corresponding losses, including
+                 ['loss_bbox', 'loss_giou', 'loss_vfl', 'loss_focal']
+             box_fmt (str): input box format, 'cxcywh' or 'xyxy'
+             matcher (Matcher): matcher used to match predictions to targets
+         """
+         super().__init__()
+         self.losses = losses
+         self.weight_dict = weight_dict
+         self.alpha = alpha
+         self.gamma = gamma
+         self.num_classes = num_classes
+         self.box_fmt = box_fmt
+         assert matcher is not None, "matcher is required"
+         self.matcher = matcher
+
+     def forward(self, outputs, targets, **kwargs):
+         """
+         Args:
+             outputs (Dict[str, Tensor]): 'pred_boxes', 'pred_logits', 'meta'.
+             targets (List[Dict[str, Tensor]]): len(targets) == batch_size.
+             kwargs: other information such as the current epoch id.
+         Returns:
+             losses (Dict[str, Tensor])
+         """
+         matched = self.matcher(outputs, targets)
+         values = matched["values"]
+         indices = matched["indices"]
+         num_boxes = self._get_positive_nums(indices)
+
+         # Compute all the requested losses
+         losses = {}
+         for loss in self.losses:
+             l_dict = self.get_loss(loss, outputs, targets, indices, num_boxes)
+             l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
+             losses.update(l_dict)
+         return losses
+
+     def _get_src_permutation_idx(self, indices):
+         # permute predictions following indices
+         batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
+         src_idx = torch.cat([src for (src, _) in indices])
+         return batch_idx, src_idx
+
+     def _get_tgt_permutation_idx(self, indices):
+         # permute targets following indices
+         batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
+         tgt_idx = torch.cat([tgt for (_, tgt) in indices])
+         return batch_idx, tgt_idx
+
+     def _get_positive_nums(self, indices):
+         # number of positive samples
+         num_pos = sum(len(i) for (i, _) in indices)
+         num_pos = torch.as_tensor([num_pos], dtype=torch.float32, device=indices[0][0].device)
+         if dist_utils.is_dist_available_and_initialized():
+             torch.distributed.all_reduce(num_pos)
+         num_pos = torch.clamp(num_pos / dist_utils.get_world_size(), min=1).item()
+         return num_pos
+
+     def loss_labels_focal(self, outputs, targets, indices, num_boxes):
+         assert "pred_logits" in outputs
+         src_logits = outputs["pred_logits"]
+
+         idx = self._get_src_permutation_idx(indices)
+         target_classes_o = torch.cat([t["labels"][j] for t, (_, j) in zip(targets, indices)])
+         target_classes = torch.full(
+             src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device
+         )
+         target_classes[idx] = target_classes_o
+
+         target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1].to(
+             src_logits.dtype
+         )
+         loss = torchvision.ops.sigmoid_focal_loss(
+             src_logits, target, self.alpha, self.gamma, reduction="none"
+         )
+         loss = loss.sum() / num_boxes
+         return {"loss_focal": loss}
+
+     def loss_labels_vfl(self, outputs, targets, indices, num_boxes):
+         assert "pred_boxes" in outputs
+         idx = self._get_src_permutation_idx(indices)
+
+         src_boxes = outputs["pred_boxes"][idx]
+         target_boxes = torch.cat([t["boxes"][j] for t, (_, j) in zip(targets, indices)], dim=0)
+
+         src_boxes = torchvision.ops.box_convert(src_boxes, in_fmt=self.box_fmt, out_fmt="xyxy")
+         target_boxes = torchvision.ops.box_convert(
+             target_boxes, in_fmt=self.box_fmt, out_fmt="xyxy"
+         )
+         iou, _ = box_ops.elementwise_box_iou(src_boxes.detach(), target_boxes)
+
+         src_logits: torch.Tensor = outputs["pred_logits"]
+         target_classes_o = torch.cat([t["labels"][j] for t, (_, j) in zip(targets, indices)])
+         target_classes = torch.full(
+             src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device
+         )
+         target_classes[idx] = target_classes_o
+         target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1]
+
+         target_score_o = torch.zeros_like(target_classes, dtype=src_logits.dtype)
+         target_score_o[idx] = iou.to(src_logits.dtype)
+         target_score = target_score_o.unsqueeze(-1) * target
+
+         src_score = F.sigmoid(src_logits.detach())
+         weight = self.alpha * src_score.pow(self.gamma) * (1 - target) + target_score
+
+         loss = F.binary_cross_entropy_with_logits(
+             src_logits, target_score, weight=weight, reduction="none"
+         )
+         loss = loss.sum() / num_boxes
+         return {"loss_vfl": loss}
+
+     def loss_boxes(self, outputs, targets, indices, num_boxes):
+         assert "pred_boxes" in outputs
+         idx = self._get_src_permutation_idx(indices)
+         src_boxes = outputs["pred_boxes"][idx]
+         target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
+
+         losses = {}
+         loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none")
+         losses["loss_bbox"] = loss_bbox.sum() / num_boxes
+
+         src_boxes = torchvision.ops.box_convert(src_boxes, in_fmt=self.box_fmt, out_fmt="xyxy")
+         target_boxes = torchvision.ops.box_convert(
+             target_boxes, in_fmt=self.box_fmt, out_fmt="xyxy"
+         )
+         loss_giou = 1 - box_ops.elementwise_generalized_box_iou(src_boxes, target_boxes)
+         losses["loss_giou"] = loss_giou.sum() / num_boxes
+         return losses
+
+     def loss_boxes_giou(self, outputs, targets, indices, num_boxes):
+         assert "pred_boxes" in outputs
+         idx = self._get_src_permutation_idx(indices)
+         src_boxes = outputs["pred_boxes"][idx]
+         target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
+
+         losses = {}
+         src_boxes = torchvision.ops.box_convert(src_boxes, in_fmt=self.box_fmt, out_fmt="xyxy")
+         target_boxes = torchvision.ops.box_convert(
+             target_boxes, in_fmt=self.box_fmt, out_fmt="xyxy"
+         )
+         loss_giou = 1 - box_ops.elementwise_generalized_box_iou(src_boxes, target_boxes)
+         losses["loss_giou"] = loss_giou.sum() / num_boxes
+         return losses
+
+     def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
+         loss_map = {
+             "boxes": self.loss_boxes,
+             "giou": self.loss_boxes_giou,
+             "vfl": self.loss_labels_vfl,
+             "focal": self.loss_labels_focal,
+         }
+         assert loss in loss_map, f"do you really want to compute {loss} loss?"
+         return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)
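In the repo, DetCriterion and its matcher are normally built from the YAML config through the register()/create() machinery; the snippet below is only a minimal, hypothetical sketch of calling the criterion directly. The DummyMatcher stand-in, the import path, and all tensor shapes are illustrative assumptions, not part of this commit.

import torch
from src.nn.criterion.det_criterion import DetCriterion  # import path assumed

class DummyMatcher(torch.nn.Module):
    """Toy stand-in for the real matcher: pairs query i with target i."""
    def forward(self, outputs, targets):
        indices = [
            (torch.arange(len(t["labels"])), torch.arange(len(t["labels"])))
            for t in targets
        ]
        return {"indices": indices, "values": None}

criterion = DetCriterion(
    losses=["vfl", "boxes"],
    weight_dict={"loss_vfl": 1.0, "loss_bbox": 5.0, "loss_giou": 2.0},
    num_classes=80,
    matcher=DummyMatcher(),
)
outputs = {
    "pred_logits": torch.randn(2, 300, 80),  # [batch, queries, classes]
    "pred_boxes": torch.rand(2, 300, 4),     # normalized cxcywh
}
targets = [
    {"labels": torch.tensor([3]), "boxes": torch.rand(1, 4)},
    {"labels": torch.tensor([1, 7]), "boxes": torch.rand(2, 4)},
]
loss_dict = criterion(outputs, targets)  # {'loss_vfl': ..., 'loss_bbox': ..., 'loss_giou': ...}
total_loss = sum(loss_dict.values())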
src/nn/postprocessor/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+ """
+
+ from .nms_postprocessor import DetNMSPostProcessor
src/nn/postprocessor/box_revert.py ADDED
@@ -0,0 +1,66 @@
+ """
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+ """
+
+ from enum import Enum
+
+ import torch
+ import torchvision
+ from torch import Tensor
+
+
+ class BoxProcessFormat(Enum):
+     """Box process format
+
+     Available formats are
+     * ``RESIZE``
+     * ``RESIZE_KEEP_RATIO``
+     * ``RESIZE_KEEP_RATIO_PADDING``
+     """
+
+     RESIZE = 1
+     RESIZE_KEEP_RATIO = 2
+     RESIZE_KEEP_RATIO_PADDING = 3
+
+
+ def box_revert(
+     boxes: Tensor,
+     orig_sizes: Tensor = None,
+     eval_sizes: Tensor = None,
+     inpt_sizes: Tensor = None,
+     inpt_padding: Tensor = None,
+     normalized: bool = True,
+     in_fmt: str = "cxcywh",
+     out_fmt: str = "xyxy",
+     process_fmt=BoxProcessFormat.RESIZE,
+ ) -> Tensor:
+     """
+     Args:
+         boxes (Tensor): [N, :, 4], predicted boxes in ``in_fmt``.
+         orig_sizes (Tensor): [N, 2], (w, h), original image sizes.
+         inpt_sizes (Tensor): [N, 2], (w, h), resized input sizes before padding.
+         inpt_padding (Tensor): [N, 2], (w_pad, h_pad).
+             (inpt_sizes + inpt_padding) == eval_sizes
+     """
+     assert in_fmt in ("cxcywh", "xyxy"), "unsupported in_fmt"
+
+     if normalized and eval_sizes is not None:
+         boxes = boxes * eval_sizes.repeat(1, 2).unsqueeze(1)
+
+     if inpt_padding is not None:
+         if in_fmt == "xyxy":
+             boxes -= inpt_padding[:, :2].repeat(1, 2).unsqueeze(1)
+         elif in_fmt == "cxcywh":
+             boxes[..., :2] -= inpt_padding[:, :2].unsqueeze(1)
+
+     if orig_sizes is not None:
+         orig_sizes = orig_sizes.repeat(1, 2).unsqueeze(1)
+         if inpt_sizes is not None:
+             inpt_sizes = inpt_sizes.repeat(1, 2).unsqueeze(1)
+             boxes = boxes * (orig_sizes / inpt_sizes)
+         else:
+             boxes = boxes * orig_sizes
+
+     boxes = torchvision.ops.box_convert(boxes, in_fmt=in_fmt, out_fmt=out_fmt)
+     return boxes
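A small, hypothetical example of the simplest path through box_revert (plain ``RESIZE``: normalized cxcywh predictions mapped back to the original image size); the import path and concrete sizes are invented for illustration:

import torch
from src.nn.postprocessor.box_revert import BoxProcessFormat, box_revert  # path assumed

pred = torch.rand(1, 300, 4)  # normalized cxcywh predictions for one image
boxes_xyxy = box_revert(
    pred,
    orig_sizes=torch.tensor([[1280, 720]]),  # (w, h) of the original image
    normalized=True,
    in_fmt="cxcywh",
    out_fmt="xyxy",
    process_fmt=BoxProcessFormat.RESIZE,
)
print(boxes_xyxy.shape)  # torch.Size([1, 300, 4]), absolute pixel coordinates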
src/nn/postprocessor/detr_postprocessor.py ADDED
@@ -0,0 +1,86 @@
+ """
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+ """
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torchvision
+
+ __all__ = ["DetDETRPostProcessor"]
+
+ from .box_revert import BoxProcessFormat, box_revert
+
+
+ def mod(a, b):
+     out = a - a // b * b
+     return out
+
+
+ class DetDETRPostProcessor(nn.Module):
+     def __init__(
+         self,
+         num_classes=80,
+         use_focal_loss=True,
+         num_top_queries=300,
+         box_process_format=BoxProcessFormat.RESIZE,
+     ) -> None:
+         super().__init__()
+         self.use_focal_loss = use_focal_loss
+         self.num_top_queries = num_top_queries
+         self.num_classes = int(num_classes)
+         self.box_process_format = box_process_format
+         self.deploy_mode = False
+
+     def extra_repr(self) -> str:
+         return f"use_focal_loss={self.use_focal_loss}, num_classes={self.num_classes}, num_top_queries={self.num_top_queries}"
+
+     def forward(self, outputs, **kwargs):
+         logits, boxes = outputs["pred_logits"], outputs["pred_boxes"]
+
+         if self.use_focal_loss:
+             scores = F.sigmoid(logits)
+             scores, index = torch.topk(scores.flatten(1), self.num_top_queries, dim=-1)
+             labels = index % self.num_classes
+             # labels = mod(index, self.num_classes)  # for tensorrt
+             index = index // self.num_classes
+             boxes = boxes.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, boxes.shape[-1]))
+
+         else:
+             scores = F.softmax(logits, dim=-1)[:, :, :-1]
+             scores, labels = scores.max(dim=-1)
+             if scores.shape[1] > self.num_top_queries:
+                 scores, index = torch.topk(scores, self.num_top_queries, dim=-1)
+                 labels = torch.gather(labels, dim=1, index=index)
+                 boxes = torch.gather(
+                     boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1])
+                 )
+
+         if kwargs is not None:
+             boxes = box_revert(
+                 boxes,
+                 in_fmt="cxcywh",
+                 out_fmt="xyxy",
+                 process_fmt=self.box_process_format,
+                 normalized=True,
+                 **kwargs,
+             )
+
+         # TODO for onnx export
+         if self.deploy_mode:
+             return labels, boxes, scores
+
+         results = []
+         for lab, box, sco in zip(labels, boxes, scores):
+             result = dict(labels=lab, boxes=box, scores=sco)
+             results.append(result)
+
+         return results
+
+     def deploy(
+         self,
+     ):
+         self.eval()
+         self.deploy_mode = True
+         return self
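As a rough, hypothetical illustration (import path, shapes, and sizes invented), the post-processor takes raw model outputs plus keyword arguments that are forwarded to box_revert:

import torch
from src.nn.postprocessor.detr_postprocessor import DetDETRPostProcessor  # path assumed

postprocessor = DetDETRPostProcessor(num_classes=80, use_focal_loss=True, num_top_queries=300)
outputs = {
    "pred_logits": torch.randn(2, 300, 80),
    "pred_boxes": torch.rand(2, 300, 4),  # normalized cxcywh
}
# extra kwargs go to box_revert; here only the original (w, h) per image
results = postprocessor(outputs, orig_sizes=torch.tensor([[1920, 1080], [1280, 720]]))
print(results[0]["labels"].shape, results[0]["boxes"].shape, results[0]["scores"].shape)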
src/nn/postprocessor/nms_postprocessor.py ADDED
@@ -0,0 +1,86 @@
+ """
+ Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
+ Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+ """
+
+ from typing import Dict
+
+ import torch
+ import torch.distributed
+ import torch.nn.functional as F
+ import torchvision
+ from torch import Tensor
+
+ from ...core import register
+
+ __all__ = [
+     "DetNMSPostProcessor",
+ ]
+
+
+ @register()
+ class DetNMSPostProcessor(torch.nn.Module):
+     def __init__(
+         self,
+         iou_threshold=0.7,
+         score_threshold=0.01,
+         keep_topk=300,
+         box_fmt="cxcywh",
+         logit_fmt="sigmoid",
+     ) -> None:
+         super().__init__()
+         self.iou_threshold = iou_threshold
+         self.score_threshold = score_threshold
+         self.keep_topk = keep_topk
+         self.box_fmt = box_fmt.lower()
+         self.logit_fmt = logit_fmt.lower()
+         self.logit_func = getattr(F, self.logit_fmt, None)
+         self.deploy_mode = False
+
+     def forward(self, outputs: Dict[str, Tensor], orig_target_sizes: Tensor):
+         logits, boxes = outputs["pred_logits"], outputs["pred_boxes"]
+         pred_boxes = torchvision.ops.box_convert(boxes, in_fmt=self.box_fmt, out_fmt="xyxy")
+         pred_boxes *= orig_target_sizes.repeat(1, 2).unsqueeze(1)
+
+         values, pred_labels = torch.max(logits, dim=-1)
+
+         if self.logit_func:
+             pred_scores = self.logit_func(values)
+         else:
+             pred_scores = values
+
+         # TODO for onnx export
+         if self.deploy_mode:
+             blobs = {
+                 "pred_labels": pred_labels,
+                 "pred_boxes": pred_boxes,
+                 "pred_scores": pred_scores,
+             }
+             return blobs
+
+         results = []
+         for i in range(logits.shape[0]):
+             score_keep = pred_scores[i] > self.score_threshold
+             pred_box = pred_boxes[i][score_keep]
+             pred_label = pred_labels[i][score_keep]
+             pred_score = pred_scores[i][score_keep]
+
+             keep = torchvision.ops.batched_nms(pred_box, pred_score, pred_label, self.iou_threshold)
+             keep = keep[: self.keep_topk]
+
+             blob = {
+                 "labels": pred_label[keep],
+                 "boxes": pred_box[keep],
+                 "scores": pred_score[keep],
+             }
+
+             results.append(blob)
+
+         return results
+
+     def deploy(
+         self,
+     ):
+         self.eval()
+         self.deploy_mode = True
+         return self
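A minimal, hypothetical call sketch (import path, shapes, and sizes invented); in practice this module is instantiated from the YAML config through @register():

import torch
from src.nn.postprocessor.nms_postprocessor import DetNMSPostProcessor  # path assumed

postprocessor = DetNMSPostProcessor(iou_threshold=0.7, score_threshold=0.01, keep_topk=300)
outputs = {
    "pred_logits": torch.randn(2, 300, 80),
    "pred_boxes": torch.rand(2, 300, 4),  # normalized cxcywh
}
orig_target_sizes = torch.tensor([[1920, 1080], [1280, 720]])  # (w, h) per image
results = postprocessor(outputs, orig_target_sizes)
for r in results:
    print(r["labels"].shape, r["boxes"].shape, r["scores"].shape)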