Spaces:

scfive
/

samewind

Build error

File size: 79,341 Bytes

e8f2571

# Copyright (c) OpenMMLab. All rights reserved.
import copy
import os.path
from typing import Dict, List, Tuple, Optional
from torch import Tensor
from mmcv.cnn import Linear
from mmengine.model import bias_init_with_prob, constant_init
from mmengine.structures import InstanceData
from mmengine.model import BaseModule
from mmdet.registry import MODELS
from mmdet.structures import SampleList
from mmdet.structures.bbox import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh, bbox_overlaps
from mmdet.utils import InstanceList, OptInstanceList, reduce_mean
from ..utils import multi_apply
from ..layers import inverse_sigmoid
from .detr_head import DETRHead
from mmdet.registry import MODELS, TASK_UTILS
from mmdet.utils import (ConfigType, InstanceList, OptInstanceList,OptConfigType,
                         OptMultiConfig, reduce_mean)
from mmcv.ops import nms, batched_nms
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Transformer
import scipy.io as sio
import os
from ..losses import QualityFocalLoss
# def adjust_bbox_to_pixel(bboxes: Tensor):
#     # Round down to get the top-left corner of the target
#     adjusted_bboxes = torch.floor(bboxes)
#     # Round up to get the bottom-right corner of the target
#     adjusted_bboxes[:, 2:] = torch.ceil(bboxes[:, 2:])
#     return adjusted_bboxes


def adjust_bbox_to_pixel(bboxes: Tensor):
    """Snap box coordinates to the nearest integer pixel position.

    Args:
        bboxes (Tensor): Box coordinates of any shape.

    Returns:
        Tensor: A new tensor with every element rounded to the nearest
        integer; the input tensor is left untouched.
    """
    return bboxes.round()

@MODELS.register_module()
class EvloveDetHead(BaseModule):
    r"""Head of the DINO: DETR with Improved DeNoising Anchor Boxes
    for End-to-End Object Detection

    Code is modified from the `official github repo
    <https://github.com/IDEA-Research/DINO>`_.

    More details can be found in the `paper
    <https://arxiv.org/abs/2203.03605>`_ .
    """

    def __init__(self,
                 num_classes: int,
                 embed_dims: int = 256,
                 decoder_embed_dims: int = 256,
                 num_reg_fcs: int = 2,
                 center_feat_indice: int=1,
                 sync_cls_avg_factor: bool = False,
                 use_nms: bool = False,
                 score_threshold: float = 0.0,
                 class_wise_nms: bool = True,
                 test_nms: OptConfigType = dict(type='nms', iou_threshold=0.01, ),
                 loss_cls: ConfigType = dict(
                                    type='FocalLoss',
                                    use_sigmoid=True,
                                    gamma=2.0,
                                    alpha=0.25,
                                    loss_weight=1.0),
                 loss_center_cls: ConfigType = dict(
                                    type='FocalLoss',
                                    use_sigmoid=True,
                                    gamma=2.0,
                                    alpha=0.25,
                                    loss_weight=1.0),
                 loss_bbox: ConfigType = dict(type='L1Loss', loss_weight=5.0),
                 loss_iou: OptConfigType = None,
                 loss_seg: ConfigType = dict(
                                    type='FocalLoss',
                                    use_sigmoid=True,
                                    gamma=2.0,
                                    alpha=0.25,
                                    loss_weight=1.0),
                 # loss_seg: ConfigType = dict(type='L1Loss', loss_weight=1.0),
                 loss_abu: ConfigType = dict(type='L1Loss', loss_weight=1.0),
                 train_cfg: ConfigType = dict(
                     assigner=dict(
                         type='HungarianAssigner',
                         match_costs=[
                             dict(type='ClassificationCost', weight=1.),
                             dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
                             dict(type='IoUCost', iou_mode='giou', weight=2.0)
                         ])),
                 test_cfg: ConfigType = dict(max_per_img=100),
                 init_cfg: OptMultiConfig = None,
                 share_pred_layer: bool = False,
                 num_pred_layer: int = 6,
                 as_two_stage: bool = False,
                 pre_bboxes_round: bool = True,
                 neg_hard_num: int = 0,
                 seg_neg_hard_num: int = 0,
                 loss_center_th: float = 0.2,
                 loss_iou_th: float = 0.3,
                 center_ds_ratio: int = 1,
                 use_center: bool = True,
                 predict_segmentation: bool = False,
                 predict_abundance: bool = False,
                 save_path: Optional[str]= None,
                 mask_threshold:float = 0.5,
                 mask_extend_pixel: int = 2,
                 ) -> None:
        """Store the configuration, build losses/assigner and create layers.

        Arguments not listed below are stored unchanged as attributes of the
        same name; their consumers are outside this method.

        Args:
            num_classes (int): Number of object classes. The index
                ``num_classes`` itself is used as the background label.
            embed_dims (int): Width of the classification, center,
                segmentation and abundance branches.
            decoder_embed_dims (int): Input width of the box regression
                branches.
            num_reg_fcs (int): Number of Linear+ReLU stages placed before the
                final 4-channel projection of each regression branch.
            center_feat_indice (int): Index into ``spatial_shapes`` selecting
                the feature level used for center targets.
            loss_cls, loss_center_cls, loss_bbox, loss_seg, loss_abu
                (ConfigType): Loss configs built through ``MODELS``.
            loss_iou (OptConfigType): Optional IoU loss config; when ``None``
                ``self.loss_iou`` is ``None``.
            train_cfg (ConfigType): When truthy it must contain an
                ``assigner`` config (built through ``TASK_UTILS``) and must
                not contain a ``sampler``.
            share_pred_layer (bool): Share one regression branch between all
                prediction layers instead of using independent copies.
            num_pred_layer (int): Number of regression branches.
            neg_hard_num (int): Number of hard negatives kept for the center
                targets; ``0`` disables hard-negative selection.
            seg_neg_hard_num (int): Number of hard negatives kept for the
                segmentation targets; ``0`` disables hard-negative selection.
            center_ds_ratio (int): Extra down-sampling ratio applied to the
                selected feature level when building center targets.
            use_center (bool): Create the center-ness branch.
            predict_segmentation (bool): Create the segmentation branch.
            predict_abundance (bool): Create the abundance branch.
            save_path (str, optional): Directory created on construction
                when given.

        Raises:
            RuntimeError: If ``train_cfg`` specifies a sampler.
        """
        # These attributes are assigned before ``super().__init__`` exactly as
        # in the original implementation; they are plain Python values.
        self.share_pred_layer = share_pred_layer
        self.num_pred_layer = num_pred_layer
        self.as_two_stage = as_two_stage
        self.pre_bboxes_round = pre_bboxes_round
        self.score_threshold = score_threshold
        self.loss_center_th = loss_center_th
        self.loss_iou_th = loss_iou_th
        self.center_feat_indice = center_feat_indice
        self.center_ds_ratio = center_ds_ratio
        super().__init__(init_cfg=init_cfg)
        self.bg_cls_weight = 0
        self.sync_cls_avg_factor = sync_cls_avg_factor
        class_weight = loss_cls.get('class_weight', None)
        # NOTE(review): this head derives from BaseModule, so the identity
        # check against DETRHead looks like it can never succeed; the branch
        # is kept for parity with the upstream DETR head.
        if class_weight is not None and (self.__class__ is DETRHead):
            assert isinstance(class_weight, float), 'Expected ' \
                'class_weight to have type float. Found ' \
                f'{type(class_weight)}.'
            # NOTE following the official DETR repo, bg_cls_weight means
            # relative classification weight of the no-object class.
            bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight)
            assert isinstance(bg_cls_weight, float), 'Expected ' \
                'bg_cls_weight to have type float. Found ' \
                f'{type(bg_cls_weight)}.'
            class_weight = torch.ones(num_classes + 1) * class_weight
            # set background class as the last indice
            class_weight[num_classes] = bg_cls_weight
            loss_cls.update({'class_weight': class_weight})
            if 'bg_cls_weight' in loss_cls:
                loss_cls.pop('bg_cls_weight')
            self.bg_cls_weight = bg_cls_weight
        if train_cfg:
            assert 'assigner' in train_cfg, 'assigner should be provided ' \
                                            'when train_cfg is set.'
            assigner = train_cfg['assigner']
            self.assigner = TASK_UTILS.build(assigner)
            if train_cfg.get('sampler', None) is not None:
                raise RuntimeError('DETR do not build sampler.')
        self.num_classes = num_classes
        self.embed_dims = embed_dims
        self.num_reg_fcs = num_reg_fcs
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.loss_cls = MODELS.build(loss_cls)
        self.loss_center_cls = MODELS.build(loss_center_cls)
        self.loss_bbox = MODELS.build(loss_bbox)
        if loss_iou is not None:
            self.loss_iou = MODELS.build(loss_iou)
        else:
            self.loss_iou = None
        self.loss_seg = MODELS.build(loss_seg)
        self.loss_abu = MODELS.build(loss_abu)
        self.use_nms = use_nms
        self.class_wise_nms = class_wise_nms
        self.test_nms = test_nms
        # Sigmoid-based losses predict one logit per class; otherwise an
        # extra background channel is added.
        if self.loss_cls.use_sigmoid:
            self.cls_out_channels = num_classes
        else:
            self.cls_out_channels = num_classes + 1
        self.neg_hard_num = neg_hard_num
        # Previously this stored ``neg_hard_num``, which silently ignored the
        # ``seg_neg_hard_num`` argument.
        self.seg_neg_hard_num = seg_neg_hard_num
        self.use_center = use_center
        self.decoder_embed_dims = decoder_embed_dims
        self.predict_segmentation = predict_segmentation
        self.predict_abundance = predict_abundance
        self.save_path = save_path
        if self.save_path is not None:
            os.makedirs(self.save_path,exist_ok=True)
        self.mask_threshold = mask_threshold
        self.mask_extend_pixel = mask_extend_pixel
        self._init_layers()

    def _init_layers(self) -> None:
        """Create the prediction branches of the head.

        The classification branch is always built. The segmentation,
        abundance and center branches are built only when the matching flag
        (``predict_segmentation``, ``predict_abundance``, ``use_center``) is
        set. ``num_pred_layer`` regression branches are stored in
        ``self.reg_branches``; they are one shared module when
        ``share_pred_layer`` is true and independent deep copies otherwise.
        """

        def linear_stack(out_channels: int) -> nn.Sequential:
            # Two embed_dims -> embed_dims layers followed by the projection
            # to ``out_channels``; no activation in between.
            layers = [
                Linear(self.embed_dims, self.embed_dims) for _ in range(2)
            ]
            layers.append(Linear(self.embed_dims, out_channels))
            return nn.Sequential(*layers)

        self.cls_branch = linear_stack(self.cls_out_channels)
        if self.predict_segmentation:
            self.seg_branch = linear_stack(1)
        if self.predict_abundance:
            self.abu_branch = linear_stack(1)

        # Regression MLP: hidden width is twice the decoder width.
        hidden_dims = self.decoder_embed_dims * 2
        reg_layers = [Linear(self.decoder_embed_dims, hidden_dims), nn.ReLU()]
        for _ in range(self.num_reg_fcs - 1):
            reg_layers.append(Linear(hidden_dims, hidden_dims))
            reg_layers.append(nn.ReLU())
        reg_layers.append(Linear(hidden_dims, 4))
        reg_template = nn.Sequential(*reg_layers)

        if self.share_pred_layer:
            per_layer = [reg_template] * self.num_pred_layer
        else:
            per_layer = [
                copy.deepcopy(reg_template)
                for _ in range(self.num_pred_layer)
            ]
        self.reg_branches = nn.ModuleList(per_layer)

        if self.use_center:
            self.center_branch = linear_stack(1)


    def init_weights(self) -> None:
        """Initialize the output layers of the prediction branches.

        When the classification loss uses sigmoid, the bias of the final
        Linear layer of the classification branch (and of the center,
        segmentation and abundance branches when they exist) is set to the
        value returned by ``bias_init_with_prob(0.01)``. The final layer of
        every regression branch is zeroed, and the last two bias entries of
        the first regression branch are then set to ``-2.0``.
        """
        if self.loss_cls.use_sigmoid:
            bias_init = bias_init_with_prob(0.01)
            # Every branch is an ``nn.Sequential``, which has no ``bias``
            # attribute of its own, so the bias of its last Linear layer is
            # the one that gets initialized.
            nn.init.constant_(self.cls_branch[-1].bias, bias_init)
            if self.use_center:
                nn.init.constant_(self.center_branch[-1].bias, bias_init)
            if self.predict_segmentation:
                nn.init.constant_(self.seg_branch[-1].bias, bias_init)
            if self.predict_abundance:
                nn.init.constant_(self.abu_branch[-1].bias, bias_init)
        for m in self.reg_branches:
            # Zeroes both weight and bias of the final projection, so no
            # separate reset of ``bias[2:]`` is needed.
            constant_init(m[-1], 0, bias=0)
        # With ``share_pred_layer`` the branches are one module, so this
        # affects all of them.
        nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], -2.0)



    # def _get_targets_single(self, cls_score: Tensor, bbox_pred: Tensor,
    #                         gt_instances: InstanceData,
    #                         img_meta: dict,
    #                         with_neg_cls:bool=True,
    #                         assigner_type:str = None) -> tuple:
    #     """Compute regression and classification targets for one image.
    #
    #     Outputs from a single decoder layer of a single feature level are used.
    #
    #     Args:
    #         cls_score (Tensor): Box score logits from a single decoder layer
    #             for one image. Shape [num_queries, cls_out_channels].
    #         bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
    #             for one image, with normalized coordinate (cx, cy, w, h) and
    #             shape [num_queries, 4].
    #         gt_instances (:obj:`InstanceData`): Ground truth of instance
    #             annotations. It should includes ``bboxes`` and ``labels``
    #             attributes.
    #         img_meta (dict): Meta information for one image.
    #
    #     Returns:
    #         tuple[Tensor]: a tuple containing the following for one image.
    #
    #         - labels (Tensor): Labels of each image.
    #         - label_weights (Tensor]): Label weights of each image.
    #         - bbox_targets (Tensor): BBox targets of each image.
    #         - bbox_weights (Tensor): BBox weights of each image.
    #         - pos_inds (Tensor): Sampled positive indices for each image.
    #         - neg_inds (Tensor): Sampled negative indices for each image.
    #     """
    #     img_h, img_w = img_meta['img_shape']
    #     factor = bbox_pred.new_tensor([img_w, img_h, img_w,
    #                                    img_h]).unsqueeze(0)
    #     num_bboxes = bbox_pred.size(0)
    #     # convert bbox_pred from xywh, normalized to xyxy, unnormalized
    #     bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred)
    #     bbox_pred = bbox_pred * factor
    #
    #     pred_instances = InstanceData(scores=cls_score, bboxes=bbox_pred,priors=bbox_pred)
    #     # assigner and sampler
    #     if assigner_type == 'dn':
    #         assign_result = self.dn_assigner.assign(
    #             pred_instances=pred_instances,
    #             gt_instances=gt_instances,
    #             img_meta=img_meta)
    #     else:
    #         assign_result = self.assigner.assign(
    #             pred_instances=pred_instances,
    #             gt_instances=gt_instances,
    #             img_meta=img_meta)
    #
    #     gt_bboxes = gt_instances.bboxes
    #     gt_labels = gt_instances.labels
    #     pos_inds = torch.nonzero(
    #         assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique()
    #     neg_inds = torch.nonzero(
    #         assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique()
    #     pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1
    #     pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds.long(), :]
    #     # label targets
    #     labels = gt_bboxes.new_full((num_bboxes,),
    #                                 self.num_classes,
    #                                 dtype=torch.long)
    #     labels[pos_inds] = gt_labels[pos_assigned_gt_inds]
    #     label_weights = gt_bboxes.new_zeros(num_bboxes)
    #     label_weights[pos_inds] = 1
    #     label_weights[neg_inds] = 1
    #     if not with_neg_cls:
    #         label_weights[neg_inds] = 0
    #     # bbox targets
    #     bbox_targets = torch.zeros_like(bbox_pred)
    #     bbox_weights = torch.zeros_like(bbox_pred)
    #     bbox_weights[pos_inds] = 1.0
    #     # DETR regress the relative position of boxes (cxcywh) in the image.
    #     # Thus the learning target should be normalized by the image size, also
    #     # the box format should be converted from defaultly x1y1x2y2 to cxcywh.
    #     pos_gt_bboxes_normalized = pos_gt_bboxes / factor
    #     pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized)
    #     bbox_targets[pos_inds] = pos_gt_bboxes_targets
    #     return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
    #             neg_inds)

    def _get_targets_single_center(self,
                                   center: Tensor,
                                   center_scores: Tensor,
                                   cls_scores: Tensor,
                                   spatial_shapes: Tensor,
                                   gt_instances: InstanceData,
                                   img_meta: dict) -> tuple:
        """Build center and classification targets for one image.

        A ``feat_h x feat_w`` grid is derived from
        ``spatial_shapes[self.center_feat_indice]`` divided by
        ``self.center_ds_ratio``. The cell holding the center of each
        ground-truth box is a positive, the remaining cells covered by a box
        are ignored, and all other cells are negatives.

        Args:
            center (Tensor): Unused by this method.
            center_scores (Tensor): Per-cell center scores for one image,
                presumably of shape (feat_h * feat_w, 1) - TODO confirm.
            cls_scores (Tensor): Per-cell classification scores for one
                image, presumably (feat_h * feat_w, cls_out_channels) -
                TODO confirm.
            spatial_shapes (Tensor): (height, width) of each feature level.
            gt_instances (:obj:`InstanceData`): Ground truth with ``bboxes``
                (x1, y1, x2, y2 in image pixels) and ``labels``. It is not
                modified.
            img_meta (dict): Meta information; ``img_shape`` is read as
                (height, width).

        Returns:
            tuple: ``(center_labels, cls_labels, center_scores, cls_scores,
            label_weights, pos_inds, neg_inds)``.

            - center_labels (Tensor): 0 at ground-truth center cells and 1
              elsewhere.
            - cls_labels (Tensor): Ground-truth label at center cells and
              ``num_classes`` elsewhere.
            - center_scores, cls_scores (Tensor): The inputs, unchanged
              unless hard-negative selection is active.
            - label_weights (Tensor): 0 on ignored cells and 1 elsewhere.
            - pos_inds, neg_inds (Tensor): Flattened cell indices of the
              positives and of the negatives.

            When ``self.neg_hard_num > 0`` the first five outputs are
            restricted to the positives followed by the ``neg_hard_num``
            highest-scoring non-positive, non-ignored cells, the weights are
            all ones, and ``neg_inds`` holds those hard negatives.
        """
        img_h, img_w = img_meta['img_shape']
        feat_h = int(spatial_shapes[self.center_feat_indice][0]/self.center_ds_ratio)
        feat_w = int(spatial_shapes[self.center_feat_indice][1]/self.center_ds_ratio)
        # Work on a copy: the boxes are shrunk in place below and the
        # ground truth must not drift every time this method is called.
        gt_bboxes = gt_instances.bboxes.clone()
        gt_labels = gt_instances.labels

        # Center of every box, expressed as an integer cell on the grid.
        gt_cxcy = bbox_xyxy_to_cxcywh(gt_bboxes)[:, :2]
        gt_cxcy[:, 0] = gt_cxcy[:, 0] * feat_w / img_w
        gt_cxcy[:, 1] = gt_cxcy[:, 1] * feat_h / img_h
        gt_cxcy = gt_cxcy.long()

        # Pull the far corner in slightly so that a box ending exactly on a
        # cell boundary does not claim the next cell.
        gt_bboxes[:, 2:] -= 0.1
        gt_bboxes_x = torch.floor(gt_bboxes[:, 0::2] * feat_w / img_w).long()
        gt_bboxes_y = torch.floor(gt_bboxes[:, 1::2] * feat_h / img_h).long()

        # heat_map values: 0 = negative, -1 = ignored, 1 = positive.
        heat_map = gt_bboxes.new_full((feat_h, feat_w), 0, dtype=torch.long)
        # Mark every cell covered by a box as ignored. Bounds are clamped at
        # zero so that a box reaching outside the grid cannot wrap around.
        for (x_lo, x_hi), (y_lo, y_hi) in zip(gt_bboxes_x.tolist(),
                                              gt_bboxes_y.tolist()):
            heat_map[max(y_lo, 0):max(y_hi + 1, 0),
                     max(x_lo, 0):max(x_hi + 1, 0)] = -1
        # Center cells override the ignore marker.
        value_input = gt_bboxes.new_full((gt_cxcy.size(0),), 1, dtype=torch.long)
        heat_map.index_put_((gt_cxcy[:, 1], gt_cxcy[:, 0]), value_input)
        heat_map = heat_map.view(-1)

        pos_inds = torch.where(heat_map == 1)[0]
        ignore_inds = torch.where(heat_map == -1)[0]
        neg_inds = torch.where(heat_map == 0)[0]

        cls_labels = gt_bboxes.new_full((feat_h, feat_w), self.num_classes, dtype=torch.long)
        cls_labels = cls_labels.index_put_((gt_cxcy[:, 1], gt_cxcy[:, 0]), gt_labels)
        cls_labels = cls_labels.view(-1)

        center_labels = gt_bboxes.new_full((heat_map.size(0),), 1, dtype=torch.long)
        center_labels[pos_inds] = 0
        label_weights = gt_bboxes.new_ones(heat_map.size(0))
        if ignore_inds.numel() > 0:
            label_weights[ignore_inds] = 0

        if self.neg_hard_num > 0:
            # Rank all cells by score and keep the best-scoring ones that are
            # neither positive nor ignored as hard negatives.
            if self.use_center:
                _, indices = torch.sort(center_scores, dim=0, descending=True)
            else:
                cls_scores_max = torch.max(cls_scores, dim=-1, keepdim=True)[0]
                _, indices = torch.sort(cls_scores_max, dim=0, descending=True)
            sorted_inds = indices.squeeze()
            non_neg_inds = torch.cat([pos_inds, ignore_inds], dim=0)
            excluded = torch.isin(sorted_inds, non_neg_inds)
            remaining_inds = sorted_inds[~excluded]
            neg_hard_inds = remaining_inds[:self.neg_hard_num]
            new_inds = torch.cat([pos_inds, neg_hard_inds], dim=0)
            neg_inds = neg_hard_inds
            center_labels = center_labels[new_inds]
            cls_labels = cls_labels[new_inds]
            center_scores = center_scores[new_inds]
            cls_scores = cls_scores[new_inds]
            label_weights = gt_bboxes.new_ones(new_inds.size(0))
        return (center_labels, cls_labels, center_scores, cls_scores, label_weights, pos_inds, neg_inds)

    def _get_targets_single_pixel(self,
                                   seg_scores: Tensor,
                                   abu_scores: Optional[Tensor],
                                   spatial_shapes: Tensor,
                                   gt_seg: Tensor,
                                   gt_abu: Optional[Tensor],
                                   img_meta: dict) -> tuple:
        """Build per-pixel segmentation (and abundance) targets for one image.

        Pixels with ``gt_seg > 0`` are positives and receive label 0; pixels
        with ``gt_seg == 0`` receive label 1. When ``self.seg_neg_hard_num``
        is non-zero, only the positives plus that many top-scoring
        non-positive pixels are kept.

        Args:
            seg_scores (Tensor): Per-pixel segmentation scores; the first
                dimension must equal ``gt_seg.numel()``.
            abu_scores (Tensor, optional): Per-pixel abundance scores, read
                only when ``self.predict_abundance`` is set.
            spatial_shapes (Tensor): Feature level shapes; level 0 must
                contain as many positions as ``gt_seg`` has elements.
            gt_seg (Tensor): Ground-truth segmentation map.
            gt_abu (Tensor, optional): Ground-truth abundance map, read only
                when ``self.predict_abundance`` is set.
            img_meta (dict): Unused by this method.

        Returns:
            tuple: ``(seg_scores, seg_labels, seg_label_weights, abu_scores,
            abu_labels, abu_label_weights, pos_inds, neg_inds)``. The three
            abundance entries are ``None`` when abundance prediction is off;
            otherwise they cover the positive pixels only, with targets
            ``gt_abu * 0.5 + 0.25``.
        """
        assert seg_scores.shape[0] == gt_seg.numel()
        assert spatial_shapes[0][0]*spatial_shapes[0][1] == gt_seg.numel()

        gt_seg = gt_seg.view(-1, 1)
        is_fg = gt_seg > 0
        is_bg = gt_seg == 0
        pos_inds = torch.where(is_fg)[0]

        # Foreground -> 0, background -> 1; other values are left as they are.
        seg_labels = gt_seg.detach().clone()
        seg_labels[is_fg] = 0
        seg_labels[is_bg] = 1
        seg_labels = seg_labels.long()

        if self.seg_neg_hard_num == 0:
            neg_inds = torch.where(is_bg)[0]
        else:
            # Rank pixels by their best score and keep the top non-positives.
            peak_scores = seg_scores.max(dim=-1, keepdim=True)[0]
            ranked = peak_scores.sort(dim=0, descending=True)[1].squeeze()
            candidates = ranked[~torch.isin(ranked, pos_inds)]
            neg_inds = candidates[:self.seg_neg_hard_num]
            kept = torch.cat([pos_inds, neg_inds], dim=0)
            seg_scores = seg_scores[kept]
            seg_labels = seg_labels[kept]
        seg_label_weights = seg_scores.new_ones(seg_labels.size(0))

        if not self.predict_abundance:
            return seg_scores, seg_labels, seg_label_weights, None, None, None, pos_inds, neg_inds

        flat_abu = gt_abu.view(-1, 1)
        abu_scores = abu_scores[pos_inds]
        abu_labels = flat_abu[pos_inds]*0.5+0.25
        abu_label_weights = seg_scores.new_ones(abu_labels.size(0))
        return seg_scores, seg_labels, seg_label_weights, abu_scores, abu_labels, abu_label_weights, pos_inds, neg_inds


    # def _get_targets_single_bbox(self,
    #                         cls_score: Tensor,
    #                         bbox_pred: Tensor,
    #                         gt_instances: InstanceData,
    #                         img_meta: dict) -> tuple:
    #     """Compute regression and classification targets for one image.
    #
    #     Outputs from a single decoder layer of a single feature level are used.
    #
    #     Args:
    #         cls_score (Tensor): Box score logits from a single decoder layer
    #             for one image. Shape [num_queries, cls_out_channels].
    #         bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
    #             for one image, with normalized coordinate (cx, cy, w, h) and
    #             shape [num_queries, 4].
    #         gt_instances (:obj:`InstanceData`): Ground truth of instance
    #             annotations. It should includes ``bboxes`` and ``labels``
    #             attributes.
    #         img_meta (dict): Meta information for one image.
    #
    #     Returns:
    #         tuple[Tensor]: a tuple containing the following for one image.
    #
    #         - labels (Tensor): Labels of each image.
    #         - label_weights (Tensor]): Label weights of each image.
    #         - bbox_targets (Tensor): BBox targets of each image.
    #         - bbox_weights (Tensor): BBox weights of each image.
    #         - pos_inds (Tensor): Sampled positive indices for each image.
    #         - neg_inds (Tensor): Sampled negative indices for each image.
    #     """
    #     img_h, img_w = img_meta['img_shape']
    #     factor = bbox_pred.new_tensor([img_w, img_h, img_w,
    #                                    img_h]).unsqueeze(0)
    #     num_bboxes = bbox_pred.size(0)
    #     # convert bbox_pred from xywh, normalized to xyxy, unnormalized
    #     bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred)
    #     bbox_pred = bbox_pred * factor
    #
    #     pred_instances = InstanceData(scores=cls_score, bboxes=bbox_pred,priors=bbox_pred)
    #     # assigner and sampler
    #     assign_result = self.bbox_assigner.assign(
    #         pred_instances=pred_instances,
    #         gt_instances=gt_instances,
    #         img_meta=img_meta)
    #     gt_bboxes = gt_instances.bboxes
    #     gt_labels = gt_instances.labels
    #     pos_inds = torch.nonzero(
    #         assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique()
    #     neg_inds = torch.nonzero(
    #         assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique()
    #     pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1
    #     pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds.long(), :]
    #
    #     # bbox targets
    #     bbox_targets = torch.zeros_like(bbox_pred)
    #     bbox_weights = torch.zeros_like(bbox_pred)
    #     bbox_weights[pos_inds] = 1.0
    #
    #     # DETR regress the relative position of boxes (cxcywh) in the image.
    #     # Thus the learning target should be normalized by the image size, also
    #     # the box format should be converted from defaultly x1y1x2y2 to cxcywh.
    #     pos_gt_bboxes_normalized = pos_gt_bboxes / factor
    #     pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized)
    #     bbox_targets[pos_inds] = pos_gt_bboxes_targets
    #     return (bbox_targets, bbox_weights, pos_inds, neg_inds)

    def loss_and_predict(
            self, hidden_states: Tuple[Tensor],
            batch_data_samples: SampleList) -> Tuple[dict, InstanceList]:
        """Run the head once and return both the losses and the predictions.

        NOTE(review): this calls ``self(hidden_states)`` with a single
        argument, while ``forward`` in this class also requires
        ``references`` and ``topk_cls_scores`` - confirm whether this method
        is still in use.

        Args:
            hidden_states (tuple[Tensor]): Features from the transformer
                decoder.
            batch_data_samples (list[:obj:`DetDataSample`]): One sample per
                image; ``metainfo`` and ``gt_instances`` are read from each.

        Returns:
            tuple: ``(losses, predictions)`` where ``losses`` is the result
            of ``self.loss_by_feat`` and ``predictions`` the result of
            ``self.predict_by_feat``.
        """
        per_sample = [(sample.metainfo, sample.gt_instances)
                      for sample in batch_data_samples]
        batch_img_metas = [meta for meta, _ in per_sample]
        batch_gt_instances = [gt for _, gt in per_sample]

        outs = self(hidden_states)
        losses = self.loss_by_feat(*outs, batch_gt_instances, batch_img_metas)
        predictions = self.predict_by_feat(
            *outs, batch_img_metas=batch_img_metas)
        return losses, predictions

    def forward(self, hidden_states: Tensor,
                references: List[Tensor],
                topk_cls_scores: Tensor,) -> Tuple[Tensor]:
        """Predict boxes for every decoder layer.

        Args:
            hidden_states (Tensor): Hidden states of each decoder layer,
                indexed by layer along the first dimension.
            references (list[Tensor]): Reference points or boxes per layer,
                in normalized coordinates. The last dimension is either 4
                (cx, cy, w, h) or 2 (cx, cy). Entries beyond the number of
                decoder layers are not used.
            topk_cls_scores (Tensor): Classification scores that are copied
                unchanged to every decoder layer; presumably of shape
                (bs, num_queries, cls_out_channels) - TODO confirm.

        Returns:
            tuple[Tensor]: ``(all_layers_outputs_classes,
            all_layers_outputs_coords)``.

            - all_layers_outputs_classes (Tensor): ``topk_cls_scores``
              repeated once per decoder layer along a new leading dimension.
            - all_layers_outputs_coords (Tensor): Sigmoid-activated box
              predictions of every layer, stacked along a new leading
              dimension, with the last dimension of size 4.
        """
        num_layers = hidden_states.shape[0]
        coords_per_layer = []
        for layer_id, layer_state in enumerate(hidden_states):
            box_logits = self.reg_branches[layer_id](layer_state)
            ref_logits = inverse_sigmoid(references[layer_id])
            if ref_logits.shape[-1] != 4:
                # Point references only shift the predicted center.
                assert ref_logits.shape[-1] == 2
                box_logits[..., :2] += ref_logits
            else:
                # Box references refine all four coordinates.
                box_logits += ref_logits
            coords_per_layer.append(box_logits.sigmoid())

        all_layers_outputs_coords = torch.stack(coords_per_layer)
        all_layers_outputs_classes = topk_cls_scores.unsqueeze(0).repeat(
            num_layers, 1, 1, 1)
        return all_layers_outputs_classes, all_layers_outputs_coords

    def loss(self, hidden_states: Tensor,
             references: List[Tensor],
             centers: Tensor,
             center_scores: Tensor,
             topk_centers_scores: Tensor,
             cls_scores: Tensor,
             topk_cls_scores: Tensor,
             seg_scores: Optional[Tensor],
             abu_scores: Optional[Tensor],
             batch_data_samples: SampleList,
             dn_meta: Dict[str, int],
             spatial_shapes: Tensor) -> dict:
        """Run the head's forward pass and compute the training losses.

        Ground truth is unpacked from ``batch_data_samples``, the head is
        called on ``(hidden_states, references, topk_cls_scores)`` and the
        resulting outputs, together with every other argument, are handed
        to :meth:`loss_by_feat`.

        Args:
            hidden_states (Tensor): Decoder hidden states. The forward pass
                iterates over dim 0 and applies one regression branch per
                entry.
            references (list[Tensor]): Reference points/boxes indexed per
                decoder layer by the forward pass (last dim 2 or 4).
            centers (Tensor): Forwarded unchanged to :meth:`loss_by_feat`.
            center_scores (Tensor): Forwarded unchanged to
                :meth:`loss_by_feat`.
            topk_centers_scores (Tensor): Forwarded unchanged to
                :meth:`loss_by_feat`.
            cls_scores (Tensor): Forwarded unchanged to
                :meth:`loss_by_feat`.
            topk_cls_scores (Tensor): Used by the forward pass to build the
                per-layer classification output and also forwarded to
                :meth:`loss_by_feat`.
            seg_scores (Tensor, optional): Forwarded unchanged to
                :meth:`loss_by_feat`.
            abu_scores (Tensor, optional): Forwarded unchanged to
                :meth:`loss_by_feat`.
            batch_data_samples (list): Data samples. ``metainfo`` and
                ``gt_instances`` are read from each one; ``gt_pixel.seg`` /
                ``gt_pixel.abu`` are read only when
                ``self.predict_segmentation`` / ``self.predict_abundance``
                are set.
            dn_meta (Dict[str, int]): Denoising meta information, forwarded
                to :meth:`loss_by_feat`.
            spatial_shapes (Tensor): Forwarded unchanged to
                :meth:`loss_by_feat`.

        Returns:
            dict: The loss dictionary produced by :meth:`loss_by_feat`.
        """
        img_metas = []
        gt_instances = []
        gt_seg = []
        gt_abu = []
        for sample in batch_data_samples:
            img_metas.append(sample.metainfo)
            gt_instances.append(sample.gt_instances)
            # Keep one entry per image even when a pixel-level task is
            # disabled, so the lists stay aligned with the batch.
            gt_seg.append(
                sample.gt_pixel.seg if self.predict_segmentation else None)
            gt_abu.append(
                sample.gt_pixel.abu if self.predict_abundance else None)

        head_outputs = self(hidden_states, references, topk_cls_scores)
        return self.loss_by_feat(
            *head_outputs,
            centers, center_scores, topk_centers_scores,
            cls_scores, topk_cls_scores,
            seg_scores, abu_scores,
            gt_instances, img_metas, dn_meta, spatial_shapes,
            gt_seg, gt_abu)

    def loss_by_feat(
            self,
            all_layers_cls_scores: Tensor,
            all_layers_bbox_preds: Tensor,
            centers: Tensor,
            center_scores: Tensor,
            topk_centers_scores: Tensor,
            cls_scores: Tensor,
            topk_cls_scores: Tensor,
            seg_scores: Optional[Tensor],
            abu_scores: Optional[Tensor],
            batch_gt_instances: InstanceList,
            batch_img_metas: List[dict],
            dn_meta: Dict[str, int],
            spatial_shapes: Tensor,
            batch_gt_seg: List,
            batch_gt_abu: List,
            batch_gt_instances_ignore: OptInstanceList = None,
    ) -> Dict[str, Tensor]:
        """Assemble the dictionary of training losses.

        The center/classification losses are always computed. Every other
        loss (segmentation, abundance and the per-layer denoising box
        losses) is multiplied by a 0/1 gate that is 1 only when
        ``loss_cls <= self.loss_center_th``.

        Args:
            all_layers_cls_scores (Tensor): Per-layer classification output
                of the forward pass. Not used in this method.
            all_layers_bbox_preds (Tensor): Per-layer box predictions; each
                entry along dim 0 is passed to :meth:`_loss_dn_single`.
            centers (Tensor): Passed to :meth:`loss_center`.
            center_scores (Tensor): Passed to :meth:`loss_center`.
            topk_centers_scores (Tensor): Not used in this method.
            cls_scores (Tensor): Passed to :meth:`loss_center`.
            topk_cls_scores (Tensor): Not used in this method.
            seg_scores (Tensor, optional): Passed to :meth:`loss_pixel`
                when ``self.predict_segmentation`` is set.
            abu_scores (Tensor, optional): Passed through ``sigmoid`` and
                then to :meth:`loss_pixel` when both
                ``self.predict_segmentation`` and
                ``self.predict_abundance`` are set.
            batch_gt_instances (list[:obj:`InstanceData`]): Per-image
                ground-truth instances.
            batch_img_metas (list[dict]): Per-image meta information.
            dn_meta (Dict[str, int]): Denoising meta information used to
                build the denoising targets.
            spatial_shapes (Tensor): Passed to :meth:`loss_center` and
                :meth:`loss_pixel`.
            batch_gt_seg (list): Per-image segmentation ground truth
                (entries may be ``None``).
            batch_gt_abu (list): Per-image abundance ground truth
                (entries may be ``None``).
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Not used in this method. Defaults to None.

        Returns:
            dict[str, Tensor]: Loss components keyed by ``loss_center``
            (only if ``self.use_center``), ``loss_cls``, ``loss_seg`` /
            ``loss_abu`` (only if enabled) and ``d{k}.dn_loss_bbox`` /
            ``d{k}.dn_loss_iou`` for each decoder layer ``k`` (1-based).
        """
        losses = dict()

        loss_center, loss_cls = self.loss_center(
            centers,
            center_scores,
            cls_scores,
            spatial_shapes,
            batch_gt_instances=batch_gt_instances,
            batch_img_metas=batch_img_metas)
        if self.use_center:
            losses['loss_center'] = loss_center
        losses['loss_cls'] = loss_cls

        # The remaining losses only contribute once the classification loss
        # has dropped to the configured threshold; until then they are
        # zeroed out by this multiplier.
        gate = 1 if loss_cls <= self.loss_center_th else 0

        if self.predict_segmentation:
            if self.predict_abundance:
                abu_scores = abu_scores.sigmoid()
            loss_seg, loss_abu = self.loss_pixel(
                seg_scores, abu_scores, spatial_shapes,
                batch_gt_seg, batch_gt_abu,
                batch_img_metas=batch_img_metas)
            losses['loss_seg'] = loss_seg * gate
            if self.predict_abundance:
                losses['loss_abu'] = loss_abu * gate

        # The denoising targets are identical for every decoder layer, so
        # they are built once and shared across the per-layer loss calls.
        dn_targets = self.get_dn_targets(
            batch_gt_instances, batch_img_metas, dn_meta)
        per_layer_bbox, per_layer_iou = multi_apply(
            self._loss_dn_single,
            all_layers_bbox_preds,
            reg_targets=dn_targets,
            batch_gt_instances=batch_gt_instances,
            batch_img_metas=batch_img_metas,
            dn_meta=dn_meta)

        layer_losses = zip(per_layer_bbox, per_layer_iou)
        for layer_idx, (bbox_loss, iou_loss) in enumerate(layer_losses):
            losses[f'd{layer_idx+1}.dn_loss_bbox'] = bbox_loss * gate
            if self.loss_iou is not None:
                losses[f'd{layer_idx+1}.dn_loss_iou'] = iou_loss * gate
        return losses


    # def loss_by_feat_single(self, cls_scores: Tensor, bbox_preds: Tensor,
    #                         batch_gt_instances: InstanceList,
    #                         batch_img_metas: List[dict]) -> Tuple[Tensor]:
    #     """Loss function for outputs from a single decoder layer of a single
    #     feature level.
    #
    #     Args:
    #         cls_scores (Tensor): Box score logits from a single decoder layer
    #             for all images, has shape (bs, num_queries, cls_out_channels).
    #         bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
    #             for all images, with normalized coordinate (cx, cy, w, h) and
    #             shape (bs, num_queries, 4).
    #         batch_gt_instances (list[:obj:`InstanceData`]): Batch of
    #             gt_instance. It usually includes ``bboxes`` and ``labels``
    #             attributes.
    #         batch_img_metas (list[dict]): Meta information of each image, e.g.,
    #             image size, scaling factor, etc.
    #
    #     Returns:
    #         Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and
    #         `loss_iou`.
    #     """
    #     num_imgs = cls_scores.size(0)
    #     cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
    #     bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
    #     cls_reg_targets = self.get_targets_bbox(cls_scores_list, bbox_preds_list,
    #                                        batch_gt_instances, batch_img_metas)
    #     (bbox_targets_list, bbox_weights_list,
    #      num_total_pos, num_total_neg) = cls_reg_targets
    #     bbox_targets = torch.cat(bbox_targets_list, 0)
    #     bbox_weights = torch.cat(bbox_weights_list, 0)
    #
    #
    #     # Compute the average number of gt boxes across all gpus, for
    #     # normalization purposes
    #     num_total_pos = bbox_targets.new_tensor([num_total_pos])
    #     num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
    #
    #     # construct factors used for rescale bboxes
    #     factors = []
    #     for img_meta, bbox_pred in zip(batch_img_metas, bbox_preds):
    #         img_h, img_w, = img_meta['img_shape']
    #         factor = bbox_pred.new_tensor([img_w, img_h, img_w,
    #                                        img_h]).unsqueeze(0).repeat(
    #                                            bbox_pred.size(0), 1)
    #         factors.append(factor)
    #     factors = torch.cat(factors, 0)
    #
    #     # DETR regress the relative position of boxes (cxcywh) in the image,
    #     # thus the learning target is normalized by the image size. So here
    #     # we need to re-scale them for calculating IoU loss
    #     bbox_preds = bbox_preds.reshape(-1, 4)
    #     bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
    #     bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors
    #
    #     # regression IoU loss, defaultly GIoU loss
    #     loss_iou = self.loss_iou(
    #         bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)
    #
    #     # regression L1 loss
    #     loss_bbox = self.loss_bbox(
    #         bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
    #     return loss_bbox, loss_iou



    def loss_center(self,
                    centers: Tensor,
                    center_scores: Tensor,
                    cls_scores: Tensor,
                    spatial_shapes: Tensor,
                    batch_gt_instances: InstanceList,
                    batch_img_metas: List[dict]) -> Tuple[Tensor]:
        """Compute the center loss and the classification loss.

        The batch is split per image, targets are produced by
        ``self._get_targets_single_center`` for each image, and the
        concatenated scores/labels are fed to ``self.loss_center_cls``
        (only when ``self.use_center``) and ``self.loss_cls``.

        Args:
            centers (Tensor): Batched centers; dim 0 is the image index.
            center_scores (Tensor or None): Batched center scores. When
                ``None``, ``cls_scores`` is used in its place.
            cls_scores (Tensor): Batched classification scores; dim 0 is
                the image index.
            spatial_shapes (Tensor): Shared by all images and handed
                unchanged to the per-image target function.
            batch_gt_instances (list[:obj:`InstanceData`]): Per-image
                ground-truth instances.
            batch_img_metas (list[dict]): Per-image meta information.

        Returns:
            Tuple[Tensor]: ``(loss_center, loss_cls)``; ``loss_center`` is
            ``None`` when ``self.use_center`` is false.
        """
        num_imgs = centers.size(0)

        def _per_image(batch):
            # Index explicitly so every list has exactly ``num_imgs`` items.
            return [batch[img_id] for img_id in range(num_imgs)]

        score_source = cls_scores if center_scores is None else center_scores
        (center_labels_list, cls_labels_list, center_scores_list,
         cls_scores_list, label_weights_list, pos_inds_list,
         neg_inds_list) = multi_apply(
             self._get_targets_single_center,
             _per_image(centers),
             _per_image(score_source),
             _per_image(cls_scores),
             [spatial_shapes] * num_imgs,
             batch_gt_instances,
             batch_img_metas)

        num_total_pos = sum(inds.numel() for inds in pos_inds_list)
        num_total_neg = sum(inds.numel() for inds in neg_inds_list)

        flat_center_labels = torch.cat(center_labels_list, 0)
        flat_cls_labels = torch.cat(cls_labels_list, 0)
        flat_center_scores = torch.cat(center_scores_list, 0)
        flat_cls_scores = torch.cat(cls_scores_list, 0)
        flat_label_weights = torch.cat(label_weights_list, 0)

        # Negatives carry zero weight, so the normaliser is effectively the
        # positive count (optionally averaged over ranks), floored at 1.
        avg_factor = num_total_pos * 1.0 + num_total_neg * 0
        if self.sync_cls_avg_factor:
            avg_factor = reduce_mean(centers.new_tensor([avg_factor]))
        avg_factor = max(avg_factor, 1)

        loss_center = None
        if self.use_center:
            loss_center = self.loss_center_cls(
                flat_center_scores,
                flat_center_labels,
                flat_label_weights,
                avg_factor=avg_factor)
        loss_cls = self.loss_cls(
            flat_cls_scores,
            flat_cls_labels,
            flat_label_weights,
            avg_factor=avg_factor)
        return loss_center, loss_cls

    def loss_pixel(self,
                    seg_scores: Tensor,
                    abu_scores: Tensor,
                    spatial_shapes: Tensor,
                    batch_gt_seg: List,
                    batch_gt_abu: List,
                    batch_img_metas: List) -> Tuple[Tensor]:
        """Compute the pixel-level segmentation and abundance losses.

        The batch is split per image, targets are produced by
        ``self._get_targets_single_pixel`` for each image, and the
        concatenated results are fed to ``self.loss_seg`` and, when
        ``self.predict_abundance`` is set, ``self.loss_abu``.

        Args:
            seg_scores (Tensor): Batched segmentation scores; dim 0 is the
                image index.
            abu_scores (Tensor or None): Batched abundance scores. When
                ``None``, each image receives ``None`` instead.
            spatial_shapes (Tensor): Shared by all images and handed
                unchanged to the per-image target function.
            batch_gt_seg (list): Per-image segmentation ground truth.
            batch_gt_abu (list): Per-image abundance ground truth.
            batch_img_metas (list): Per-image meta information.

        Returns:
            Tuple[Tensor]: ``(loss_seg, loss_abu)``; ``loss_abu`` is
            ``None`` when ``self.predict_abundance`` is false.
        """
        num_imgs = seg_scores.size(0)
        seg_per_image = [seg_scores[img_id] for img_id in range(num_imgs)]
        if abu_scores is None:
            abu_per_image = [None] * num_imgs
        else:
            abu_per_image = [abu_scores[img_id] for img_id in range(num_imgs)]

        (seg_scores_list, seg_labels_list, seg_label_weights_list,
         abu_scores_list, abu_labels_list, abu_label_weights_list,
         pos_inds_list, neg_inds_list) = multi_apply(
             self._get_targets_single_pixel,
             seg_per_image,
             abu_per_image,
             [spatial_shapes] * num_imgs,
             batch_gt_seg,
             batch_gt_abu,
             batch_img_metas)

        num_total_pos = sum(inds.numel() for inds in pos_inds_list)
        num_total_neg = sum(inds.numel() for inds in neg_inds_list)

        flat_seg_scores = torch.cat(seg_scores_list, 0)
        flat_seg_labels = torch.cat(seg_labels_list, 0)
        flat_seg_weights = torch.cat(seg_label_weights_list, 0)

        # Negatives carry zero weight, so the normaliser is effectively the
        # positive count (optionally averaged over ranks), floored at 1.
        avg_factor = num_total_pos * 1.0 + num_total_neg * 0
        if self.sync_cls_avg_factor:
            avg_factor = reduce_mean(flat_seg_scores.new_tensor([avg_factor]))
        avg_factor = max(avg_factor, 1)

        loss_seg = self.loss_seg(
            flat_seg_scores,
            flat_seg_labels,
            flat_seg_weights,
            avg_factor=avg_factor)

        loss_abu = None
        if self.predict_abundance:
            # The abundance loss is called without an explicit avg_factor.
            loss_abu = self.loss_abu(
                torch.cat(abu_scores_list, 0),
                torch.cat(abu_labels_list, 0),
                torch.cat(abu_label_weights_list, 0))
        return loss_seg, loss_abu

    # def get_targets(self, cls_scores_list: List[Tensor],
    #                 bbox_preds_list: List[Tensor],
    #                 batch_gt_instances: InstanceList,
    #                 batch_img_metas: List[dict],
    #                 with_neg_cls:bool=True,
    #                 assigner_type:str = None) -> tuple:
    #     """Compute regression and classification targets for a batch image.
    #
    #     Outputs from a single decoder layer of a single feature level are used.
    #
    #     Args:
    #         cls_scores_list (list[Tensor]): Box score logits from a single
    #             decoder layer for each image, has shape [num_queries,
    #             cls_out_channels].
    #         bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
    #             decoder layer for each image, with normalized coordinate
    #             (cx, cy, w, h) and shape [num_queries, 4].
    #         batch_gt_instances (list[:obj:`InstanceData`]): Batch of
    #             gt_instance. It usually includes ``bboxes`` and ``labels``
    #             attributes.
    #         batch_img_metas (list[dict]): Meta information of each image, e.g.,
    #             image size, scaling factor, etc.
    #
    #     Returns:
    #         tuple: a tuple containing the following targets.
    #
    #         - labels_list (list[Tensor]): Labels for all images.
    #         - label_weights_list (list[Tensor]): Label weights for all images.
    #         - bbox_targets_list (list[Tensor]): BBox targets for all images.
    #         - bbox_weights_list (list[Tensor]): BBox weights for all images.
    #         - num_total_pos (int): Number of positive samples in all images.
    #         - num_total_neg (int): Number of negative samples in all images.
    #     """
    #     (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
    #      pos_inds_list,
    #      neg_inds_list) = multi_apply(self._get_targets_single,
    #                                   cls_scores_list, bbox_preds_list,
    #                                   batch_gt_instances, batch_img_metas,
    #                                   with_neg_cls=with_neg_cls,
    #                                   assigner_type= assigner_type)
    #     num_total_pos = sum((inds.numel() for inds in pos_inds_list))
    #     num_total_neg = sum((inds.numel() for inds in neg_inds_list))
    #     if not with_neg_cls:
    #         num_total_neg = 0
    #     return (labels_list, label_weights_list, bbox_targets_list,
    #             bbox_weights_list, num_total_pos, num_total_neg)

    # def get_targets_bbox(self,
    #                 cls_scores_list: List[Tensor],
    #                 bbox_preds_list: List[Tensor],
    #                 batch_gt_instances: InstanceList,
    #                 batch_img_metas: List[dict]) -> tuple:
    #     """Compute regression and classification targets for a batch image.
    #
    #     Outputs from a single decoder layer of a single feature level are used.
    #
    #     Args:
    #         cls_scores_list (list[Tensor]): Box score logits from a single
    #             decoder layer for each image, has shape [num_queries,
    #             cls_out_channels].
    #         bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
    #             decoder layer for each image, with normalized coordinate
    #             (cx, cy, w, h) and shape [num_queries, 4].
    #         batch_gt_instances (list[:obj:`InstanceData`]): Batch of
    #             gt_instance. It usually includes ``bboxes`` and ``labels``
    #             attributes.
    #         batch_img_metas (list[dict]): Meta information of each image, e.g.,
    #             image size, scaling factor, etc.
    #
    #     Returns:
    #         tuple: a tuple containing the following targets.
    #
    #         - labels_list (list[Tensor]): Labels for all images.
    #         - label_weights_list (list[Tensor]): Label weights for all images.
    #         - bbox_targets_list (list[Tensor]): BBox targets for all images.
    #         - bbox_weights_list (list[Tensor]): BBox weights for all images.
    #         - num_total_pos (int): Number of positive samples in all images.
    #         - num_total_neg (int): Number of negative samples in all images.
    #     """
    #     (bbox_targets_list, bbox_weights_list, pos_inds_list,
    #      neg_inds_list) = multi_apply(self._get_targets_single_bbox, cls_scores_list,
    #                                   bbox_preds_list,
    #                                   batch_gt_instances, batch_img_metas)
    #     num_total_pos = sum((inds.numel() for inds in pos_inds_list))
    #     num_total_neg = sum((inds.numel() for inds in neg_inds_list))
    #     return (bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg)


    def _loss_dn_single(self, dn_bbox_preds: Tensor,
                        reg_targets: Tuple[list, int],
                        batch_gt_instances: InstanceList,
                        batch_img_metas: List[dict],
                        dn_meta: Dict[str, int]) -> Tuple[Tensor]:
        """Box regression losses for the outputs of one decoder layer.

        Args:
            dn_bbox_preds (Tensor): Box predictions of a single decoder
                layer, iterated per image along dim 0 and flattened to
                ``(-1, 4)``. Coordinates are treated as normalized
                (cx, cy, w, h).
            reg_targets (tuple): The 5-tuple produced by
                :meth:`get_dn_targets`. Only the box targets, the box
                weights and the positive count are used here.
            batch_gt_instances (list[:obj:`InstanceData`]): Not used in
                this method.
            batch_img_metas (list[dict]): Per-image meta information;
                ``img_shape`` is read as ``(img_h, img_w)``.
            dn_meta (Dict[str, int]): Not used in this method.

        Returns:
            Tuple[Tensor]: ``(loss_bbox, loss_iou)``; ``loss_iou`` is
            ``None`` when ``self.loss_iou`` is ``None``.
        """
        # The label entries of ``reg_targets`` are irrelevant for box losses.
        _, _, bbox_targets_list, bbox_weights_list, num_total_pos = reg_targets
        target_boxes = torch.cat(bbox_targets_list, 0)
        box_weights = torch.cat(bbox_weights_list, 0)

        # Average the number of positives across all GPUs (at least 1) and
        # use it to normalise both losses.
        avg_factor = torch.clamp(
            reduce_mean(dn_bbox_preds.new_tensor([num_total_pos])),
            min=1).item()

        def _pixel_scale(img_meta, img_preds):
            # One (w, h, w, h) row per prediction of this image.
            img_h, img_w = img_meta['img_shape']
            whwh = img_preds.new_tensor([img_w, img_h, img_w, img_h])
            return whwh.unsqueeze(0).repeat(img_preds.size(0), 1)

        scale = torch.cat([
            _pixel_scale(img_meta, img_preds)
            for img_meta, img_preds in zip(batch_img_metas, dn_bbox_preds)
        ])

        # The IoU loss works on absolute xyxy boxes, so both predictions
        # and targets are converted and scaled back to pixel units.
        flat_preds = dn_bbox_preds.reshape(-1, 4)
        pred_xyxy = bbox_cxcywh_to_xyxy(flat_preds) * scale
        target_xyxy = bbox_cxcywh_to_xyxy(target_boxes) * scale

        loss_iou = None
        if self.loss_iou is not None:
            loss_iou = self.loss_iou(
                pred_xyxy, target_xyxy, box_weights, avg_factor=avg_factor)

        # The regression loss stays in the normalized cxcywh space.
        loss_bbox = self.loss_bbox(
            flat_preds, target_boxes, box_weights, avg_factor=avg_factor)
        return loss_bbox, loss_iou

    def get_dn_targets(self, batch_gt_instances: InstanceList,
                       batch_img_metas: dict, dn_meta: Dict[str,
                                                            int]) -> tuple:
        """Build the denoising targets for every image of the batch.

        :meth:`_get_dn_targets_single` is applied to each
        ``(gt_instances, img_meta)`` pair and the per-image results are
        collected into lists.

        Args:
            batch_gt_instances (list[:obj:`InstanceData`]): Per-image
                ground-truth instances.
            batch_img_metas (list[dict]): Per-image meta information.
            dn_meta (Dict[str, int]): Denoising meta information, passed
                unchanged to the per-image function.

        Returns:
            tuple: a tuple containing the following targets.

            - labels_list (list[Tensor]): Labels for all images.
            - label_weights_list (list[Tensor]): Label weights for all images.
            - bbox_targets_list (list[Tensor]): BBox targets for all images.
            - bbox_weights_list (list[Tensor]): BBox weights for all images.
            - num_total_pos (int): Number of positive samples in all images.
        """
        per_image_targets = multi_apply(
            self._get_dn_targets_single,
            batch_gt_instances,
            batch_img_metas,
            dn_meta=dn_meta)
        (labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, pos_inds_list) = per_image_targets

        # The positive indices are only needed for their total count.
        num_total_pos = sum(inds.numel() for inds in pos_inds_list)
        return (labels_list, label_weights_list, bbox_targets_list,
                bbox_weights_list, num_total_pos)

    def _get_dn_targets_single(self, gt_instances: InstanceData,
                               img_meta: dict, dn_meta: Dict[str,
                                                             int]) -> tuple:
        """Build the denoising targets for a single image.

        The denoising queries are divided into
        ``dn_meta['num_denoising_groups']`` equally sized groups. Within
        each group the first ``len(gt_labels)`` slots are positives, with
        slot ``i`` assigned to ground truth ``i``; all other slots keep the
        background label and zero weights.

        Args:
            gt_instances (:obj:`InstanceData`): Ground truth of one image;
                ``bboxes`` and ``labels`` are read.
            img_meta (dict): Meta information of the image; ``img_shape``
                is read as ``(img_h, img_w)``.
            dn_meta (Dict[str, int]): Must provide
                ``'num_denoising_groups'`` and ``'num_denoising_queries'``.

        Returns:
            tuple[Tensor]: a tuple containing the following for one image.

            - labels (Tensor): Long tensor of length
              ``num_denoising_queries`` filled with ``self.num_classes``
              except at the positive slots.
            - label_weights (Tensor): 1 at positive slots, 0 elsewhere.
            - bbox_targets (Tensor): ``(num_denoising_queries, 4)`` boxes,
              normalized by the image size and converted to cxcywh at the
              positive slots, 0 elsewhere.
            - bbox_weights (Tensor): 1 at positive slots, 0 elsewhere.
            - pos_inds (Tensor): Indices of the positive slots.
        """
        gt_bboxes, gt_labels = gt_instances.bboxes, gt_instances.labels
        num_groups = dn_meta['num_denoising_groups']
        num_dn_queries = dn_meta['num_denoising_queries']
        group_stride = int(num_dn_queries / num_groups)
        device = gt_bboxes.device
        num_gts = len(gt_labels)

        if num_gts > 0:
            gt_index = torch.arange(num_gts, dtype=torch.long, device=device)
            # Every group is matched against the same ordered list of GTs.
            pos_assigned_gt_inds = gt_index.repeat(num_groups)
            # Start offset of each group inside the denoising queries.
            group_start = torch.arange(
                num_groups, dtype=torch.long, device=device) * group_stride
            pos_inds = (group_start.unsqueeze(1) +
                        gt_index.unsqueeze(0)).flatten()
        else:
            pos_inds = pos_assigned_gt_inds = \
                gt_bboxes.new_tensor([], dtype=torch.long)

        # Classification targets: background everywhere except positives.
        labels = gt_bboxes.new_full((num_dn_queries, ),
                                    self.num_classes,
                                    dtype=torch.long)
        labels[pos_inds] = gt_labels[pos_assigned_gt_inds]
        label_weights = gt_bboxes.new_zeros(num_dn_queries)
        label_weights[pos_inds] = 1.0

        # Regression weights: only positive slots contribute.
        bbox_weights = torch.zeros(num_dn_queries, 4, device=device)
        bbox_weights[pos_inds] = 1.0

        # Regression targets live in normalized cxcywh space, so the xyxy
        # ground truth is divided by (w, h, w, h) before being converted.
        img_h, img_w = img_meta['img_shape']
        whwh = gt_bboxes.new_tensor([img_w, img_h, img_w,
                                     img_h]).unsqueeze(0)
        normalized_cxcywh = bbox_xyxy_to_cxcywh(gt_bboxes / whwh)
        bbox_targets = torch.zeros(num_dn_queries, 4, device=device)
        bbox_targets[pos_inds] = normalized_cxcywh.repeat([num_groups, 1])

        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds)


    @staticmethod
    def split_outputs(all_layers_cls_scores: Tensor,
                      all_layers_bbox_preds: Tensor,
                      dn_meta: Dict[str, int]) -> Tuple[Tensor]:
        """Separate denoising-query outputs from matching-query outputs.

        Both inputs are sliced along axis 2 (the query axis): the leading
        ``dn_meta['num_denoising_queries']`` entries form the denoising part
        and every remaining entry forms the matching part. The slices are
        taken with basic indexing, so no data is copied.

        Args:
            all_layers_cls_scores (Tensor): Classification scores of all
                decoder layers, indexed here as (num_decoder_layers, bs,
                num_queries_total, cls_out_channels).
            all_layers_bbox_preds (Tensor): Regression outputs of all decoder
                layers, indexed here as (num_decoder_layers, bs,
                num_queries_total, 4).
            dn_meta (Dict[str, int]): Denoising bookkeeping; only the key
                'num_denoising_queries' is read. May be `None`, in which
                case nothing is split.

        Returns:
            Tuple[Tensor]: a tuple containing the following outputs.

            - all_layers_matching_cls_scores (Tensor): Entries of
              `all_layers_cls_scores` after the denoising queries on axis 2
              (the untouched input when `dn_meta` is `None`).
            - all_layers_matching_bbox_preds (Tensor): Entries of
              `all_layers_bbox_preds` after the denoising queries on axis 2
              (the untouched input when `dn_meta` is `None`).
            - all_layers_denoising_cls_scores (Tensor): The leading
              `num_denoising_queries` entries of `all_layers_cls_scores` on
              axis 2, or `None` when `dn_meta` is `None`.
            - all_layers_denoising_bbox_preds (Tensor): The leading
              `num_denoising_queries` entries of `all_layers_bbox_preds` on
              axis 2, or `None` when `dn_meta` is `None`.
        """
        # Without denoising metadata every query is a matching query.
        if dn_meta is None:
            return (all_layers_cls_scores, all_layers_bbox_preds, None, None)

        dn_count = dn_meta['num_denoising_queries']
        dn_part = slice(None, dn_count)
        matching_part = slice(dn_count, None)

        matching_cls = all_layers_cls_scores[:, :, matching_part, :]
        matching_bbox = all_layers_bbox_preds[:, :, matching_part, :]
        denoising_cls = all_layers_cls_scores[:, :, dn_part, :]
        denoising_bbox = all_layers_bbox_preds[:, :, dn_part, :]
        return (matching_cls, matching_bbox, denoising_cls, denoising_bbox)


    @staticmethod
    def split_outputsv1(all_layers_cls_scores: Tensor,
                      all_layers_bbox_preds: Tensor,
                      dn_meta: Dict[str, int]) -> Tuple[Tensor]:
        """Split denoising and matching outputs; cls scores use axis 1.

        Variant of the denoising/matching split in which the two inputs are
        sliced on different axes: the classification scores are indexed with
        three subscripts and cut along axis 1, while the box predictions are
        indexed with four subscripts and cut along axis 2. In both cases the
        leading ``dn_meta['num_denoising_queries']`` entries are the
        denoising part and the remaining entries are the matching part.

        NOTE(review): the slicing implies the classification scores carry no
        leading decoder-layer axis here, i.e. presumably
        (bs, num_queries_total, cls_out_channels) -- confirm against the
        caller.

        Args:
            all_layers_cls_scores (Tensor): Classification scores, sliced
                along axis 1.
            all_layers_bbox_preds (Tensor): Regression outputs, indexed here
                as (num_decoder_layers, bs, num_queries_total, 4) and sliced
                along axis 2.
            dn_meta (Dict[str, int]): Denoising bookkeeping; only the key
                'num_denoising_queries' is read. May be `None`, in which
                case nothing is split.

        Returns:
            Tuple[Tensor]: a tuple containing the following outputs.

            - all_layers_matching_cls_scores (Tensor): Entries of
              `all_layers_cls_scores` after the denoising queries on axis 1
              (the untouched input when `dn_meta` is `None`).
            - all_layers_matching_bbox_preds (Tensor): Entries of
              `all_layers_bbox_preds` after the denoising queries on axis 2
              (the untouched input when `dn_meta` is `None`).
            - all_layers_denoising_cls_scores (Tensor): The leading
              `num_denoising_queries` entries of `all_layers_cls_scores` on
              axis 1, or `None` when `dn_meta` is `None`.
            - all_layers_denoising_bbox_preds (Tensor): The leading
              `num_denoising_queries` entries of `all_layers_bbox_preds` on
              axis 2, or `None` when `dn_meta` is `None`.
        """
        # Without denoising metadata every query is a matching query.
        if dn_meta is None:
            return (all_layers_cls_scores, all_layers_bbox_preds, None, None)

        dn_count = dn_meta['num_denoising_queries']
        dn_part = slice(None, dn_count)
        matching_part = slice(dn_count, None)

        # Classification scores: query axis is axis 1 (three subscripts).
        matching_cls = all_layers_cls_scores[:, matching_part, :]
        denoising_cls = all_layers_cls_scores[:, dn_part, :]
        # Box predictions: query axis is axis 2 (four subscripts).
        matching_bbox = all_layers_bbox_preds[:, :, matching_part, :]
        denoising_bbox = all_layers_bbox_preds[:, :, dn_part, :]
        return (matching_cls, matching_bbox, denoising_cls, denoising_bbox)

    def predict(self,
                hidden_states: Tensor,
                references: List[Tensor],
                centers: Tensor,
                center_scores: Tensor,
                topk_centers_scores: Tensor,
                cls_scores: Tensor,
                topk_cls_scores: Tensor,
                seg_scores: Optional[Tensor],
                abu_scores: Optional[Tensor],
                batch_data_samples: SampleList,
                rescale: bool = True) -> InstanceList:
        """Perform forward propagation and loss calculation of the detection
        head on the queries of the upstream network.

        Args:
            hidden_states (Tensor): Hidden states output from each decoder
                layer, has shape (num_decoder_layers, num_queries, bs, dim).
            references (list[Tensor]): List of the reference from the decoder.
                The first reference is the `init_reference` (initial) and the
                other num_decoder_layers(6) references are `inter_references`
                (intermediate). The `init_reference` has shape (bs,
                num_queries, 4) when `as_two_stage` of the detector is `True`,
                otherwise (bs, num_queries, 2). Each `inter_reference` has
                shape (bs, num_queries, 4) when `with_box_refine` of the
                detector is `True`, otherwise (bs, num_queries, 2). The
                coordinates are arranged as (cx, cy) when the last dimension is
                2, and (cx, cy, w, h) when it is 4.
            batch_data_samples (list[:obj:`DetDataSample`]): The Data
                Samples. It usually includes information such as
                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
            rescale (bool, optional): If `True`, return boxes in original
                image space. Defaults to `True`.

        Returns:
            list[obj:`InstanceData`]: Detection results of each image
            after the post process.
        """
        batch_img_metas = [
            data_samples.metainfo for data_samples in batch_data_samples
        ]
        outs = self(hidden_states, references, topk_cls_scores)
        if self.predict_segmentation:
            seg_scores = seg_scores.sigmoid()
            if self.predict_abundance:
                abu_scores = torch.clamp((abu_scores.sigmoid()-0.25)*2,0,1)
        predictions = self.predict_by_feat(
            *outs,seg_scores,abu_scores, batch_img_metas=batch_img_metas, rescale=rescale)
        return predictions

    def predict_by_feat(self,
                        all_layers_cls_scores: Tensor,
                        all_layers_bbox_preds: Tensor,
                        seg_scores: Optional[Tensor],
                        abu_scores: Optional[Tensor],
                        batch_img_metas: List[Dict],
                        rescale: bool = False) -> InstanceList:
        """Transform a batch of output features extracted from the head into
        bbox results.

        Args:
            all_layers_cls_scores (Tensor): Classification scores of all
                decoder layers, has shape (num_decoder_layers, bs, num_queries,
                cls_out_channels).
            all_layers_bbox_preds (Tensor): Regression outputs of all decoder
                layers. Each is a 4D-tensor with normalized coordinate format
                (cx, cy, w, h) and shape (num_decoder_layers, bs, num_queries,
                4) with the last dimension arranged as (cx, cy, w, h).
            batch_img_metas (list[dict]): Meta information of each image.
            rescale (bool, optional): If `True`, return boxes in original
                image space. Default `False`.

        Returns:
            list[obj:`InstanceData`]: Detection results of each image
            after the post process.
        """
        cls_scores = all_layers_cls_scores[-1]
        bbox_preds = all_layers_bbox_preds[-1]

        result_list = []
        for img_id in range(len(batch_img_metas)):
            cls_score = cls_scores[img_id]
            bbox_pred = bbox_preds[img_id]
            img_meta = batch_img_metas[img_id]
            if self.predict_segmentation:
                seg_score = seg_scores[img_id]
            else:
                seg_score = None
            if self.predict_abundance:
                abu_score = abu_scores[img_id]
            else:
                abu_score = None
            results = self._predict_by_feat_single(cls_score, bbox_pred,
                                                   seg_score,abu_score,
                                                   img_meta, rescale)
            result_list.append(results)
        return result_list

    def _predict_by_feat_single(self,
                                cls_score: Tensor,
                                bbox_pred: Tensor,
                                seg_score: Optional[Tensor],
                                abu_score: Optional[Tensor],
                                img_meta: dict,
                                rescale: bool = True) -> InstanceData:
        """Transform outputs from the last decoder layer into bbox (and
        optionally mask) predictions for one image.

        Steps: sigmoid the class logits, rank every (query, class) pair by
        score, convert the boxes to absolute (x1, y1, x2, y2) coordinates,
        optionally run NMS, optionally round and rescale the boxes, and --
        when `self.predict_segmentation` is set -- build one boolean mask per
        box from the thresholded segmentation map.

        Args:
            cls_score (Tensor): Box score logits from the last decoder layer
                for each image. Shape [num_queries, cls_out_channels].
            bbox_pred (Tensor): Outputs from the last decoder layer for each
                image, treated as (cx, cy, w, h) and multiplied by the image
                size, with shape [num_queries, 4].
            seg_score (Tensor, optional): Segmentation scores of the image;
                reshaped to `img_meta['ori_shape'][:2]`. Only read when
                `self.predict_segmentation` is set.
            abu_score (Tensor, optional): Abundance scores of the image;
                reshaped like `seg_score`. Only read when segmentation and
                abundance prediction are enabled and `self.save_path` is set.
            img_meta (dict): Image meta info. Reads 'img_shape', and, where
                needed, 'scale_factor', 'ori_shape' and 'img_path'.
            rescale (bool): If True, divide boxes by
                `img_meta['scale_factor']` when that key is present.
                Default True.

        Returns:
            :obj:`InstanceData`: Detection results of each image
            after the post process.
            Each item usually contains following keys.

                - scores (Tensor): Classification scores, has a shape
                  (num_instance, )
                - labels (Tensor): Labels of bboxes, has a shape
                  (num_instances, ).
                - bboxes (Tensor): Has a shape (num_instances, 4),
                  the last dimension 4 arrange as (x1, y1, x2, y2).
                - masks (Tensor): Only when `self.predict_segmentation` is
                  set. Boolean, shape (num_instances, img_shape[0],
                  img_shape[1]).
        """
        assert len(cls_score) == len(bbox_pred)  # num_queries
        max_per_img = self.test_cfg.get('max_per_img', len(cls_score))

        img_shape = img_meta['img_shape']
        assert self.loss_cls.use_sigmoid
        cls_score = cls_score.sigmoid()
        # Every (query, class) pair is ranked by a full descending sort
        # rather than `topk(max_per_img)`.
        # NOTE(review): `max_per_img` is therefore only enforced in the NMS
        # branch below; without NMS all pairs are returned -- confirm this
        # is intended.
        scores, indexes = torch.sort(cls_score.view(-1), descending=True)
        # A flat index encodes (query, class) as query * num_classes + class.
        det_labels = indexes % self.num_classes
        bbox_index = torch.div(indexes, self.num_classes, rounding_mode='trunc')
        bbox_pred = bbox_pred[bbox_index]
        det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred)
        # Scale x by the image width and y by the image height, then clip
        # to the image (the strided slices are views, so `clamp_` writes
        # through to `det_bboxes`).
        det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
        det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0]
        det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1])
        det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0])
        if self.use_nms:
            if det_labels.numel() > 0:
                # `bboxes_scores` holds the kept boxes with the score
                # appended as the last column; `keep` indexes the inputs.
                bboxes_scores, keep = batched_nms(det_bboxes, scores.contiguous(), det_labels, self.test_nms, class_agnostic=(not self.class_wise_nms))
                if keep.numel() > max_per_img:
                    bboxes_scores = bboxes_scores[:max_per_img]
                    det_labels = det_labels[keep][:max_per_img]
                else:
                    det_labels = det_labels[keep]
                det_bboxes = bboxes_scores[:, :-1]
                scores = bboxes_scores[:, -1]
        if self.pre_bboxes_round:
            det_bboxes = adjust_bbox_to_pixel(det_bboxes)
        if rescale:
            # A missing 'scale_factor' is tolerated: boxes are then left in
            # the network-input coordinate frame.
            if img_meta.get('scale_factor') is not None:
                det_bboxes /= det_bboxes.new_tensor(
                    img_meta['scale_factor']).repeat((1, 2))

        if self.predict_segmentation:
            img_h, img_w = img_meta['ori_shape'][:2]
            seg_score = seg_score.view(img_h, img_w)
            seg_mask=seg_score>self.mask_threshold
            N = det_bboxes.size(0)
            # NOTE(review): masks are allocated with `img_shape` while the
            # segmentation map uses `ori_shape`; this presumably assumes
            # both are equal -- confirm.
            im_mask = torch.zeros(
                N,
                img_shape[0],
                img_shape[1],
                device=det_bboxes.device,
                dtype=torch.bool)
            for i in range(N):
                # Copy the thresholded segmentation inside the box, grown by
                # `mask_extend_pixel` on every side and clipped to the image.
                x0, y0, x1, y1 = det_bboxes[i,:]
                x0 = max(int(x0)-self.mask_extend_pixel,0)
                x1 = min(int(x1)+self.mask_extend_pixel,img_w)
                y0 = max(int(y0)-self.mask_extend_pixel,0)
                y1 = min(int(y1)+self.mask_extend_pixel,img_h)
                im_mask[i, y0:y1,x0:x1] =seg_mask[y0:y1,x0:x1 ]
            # Per-pixel count of instance masks; 0 means no box claims it.
            new_mask = torch.sum(im_mask,dim=0)
            seg_score_new = seg_score.clone().detach()
            seg_score_new[new_mask==0] = 0
            if self.save_path is not None:
                img_name=img_meta['img_path'].split('/')[-1].replace('.npy','').replace('.png','').replace('.jpg','').replace('.mat','')
                sio.savemat(os.path.join(self.save_path,img_name+'_seg_scoremap.mat'),
                            {'data':seg_score.detach().cpu().numpy()})
                sio.savemat(os.path.join(self.save_path,img_name+'_seg_predictionmap.mat'),
                            {'data':seg_score_new.detach().cpu().numpy()})
                if self.predict_abundance:
                    abu_score = abu_score.view(img_h, img_w)
                    abu_score_new = abu_score.clone().detach()
                    abu_score_new[new_mask == 0] = 0
                    sio.savemat(os.path.join(self.save_path, img_name + '_abu_scoremap.mat'),
                                {'data': abu_score.detach().cpu().numpy()})
                    # Save the masked abundance map (previously the masked
                    # segmentation map was written here by mistake).
                    sio.savemat(os.path.join(self.save_path, img_name + '_abu_predictionmap.mat'),
                                {'data': abu_score_new.detach().cpu().numpy()})
        results = InstanceData()
        results.bboxes = det_bboxes
        results.scores = scores
        results.labels = det_labels
        if self.predict_segmentation:
            results.masks = im_mask
        return results