# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Tuple

import torch
import torch.nn.functional as F
from mmengine.structures import InstanceData
from torch import Tensor

from mmdet.registry import MODELS
from mmdet.structures.bbox import bbox_cxcywh_to_xyxy
from mmdet.utils import InstanceList, reduce_mean
from .dino_head import DINOHead


def adjust_bbox_to_pixel(bboxes: Tensor) -> Tensor:
    """Snap (x1, y1, x2, y2) boxes outward to integer pixel borders.

    Note that ``floor``/``ceil`` have zero gradient almost everywhere, so no
    gradient flows through the rounding itself.
    """
    # Floor the top-left corner (x1, y1) ...
    adjusted_bboxes = torch.floor(bboxes)
    # ... and ceil the bottom-right corner (x2, y2).
    adjusted_bboxes[:, 2:] = torch.ceil(bboxes[:, 2:])
    return adjusted_bboxes
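
# A quick sanity check of the snapping behaviour (illustrative only):
#
# >>> import torch
# >>> adjust_bbox_to_pixel(torch.tensor([[10.3, 20.7, 30.2, 40.1]]))
# tensor([[10., 20., 31., 41.]])


def adjust_bbox_to_pixel_ste(bboxes: Tensor) -> Tensor:
    """Straight-through variant of :func:`adjust_bbox_to_pixel`.

    A minimal sketch, not wired into ``DINOSTHead``: the forward pass snaps
    boxes to pixel borders exactly like ``adjust_bbox_to_pixel``, while the
    backward pass treats the rounding as identity so gradients still reach
    ``bboxes``.
    """
    rounded = torch.floor(bboxes)
    rounded[:, 2:] = torch.ceil(bboxes[:, 2:])
    # Straight-through estimator: forward uses ``rounded``, backward sees an
    # identity mapping of ``bboxes``.
    return bboxes + (rounded - bboxes).detach()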


@MODELS.register_module()
class DINOSTHead(DINOHead):
    """DINO head with optional snapping of predicted boxes to pixel borders.

    Args:
        round_coord (bool): Whether to snap the rescaled box predictions to
            integer pixel borders before computing the IoU loss.
            Defaults to False.
    """

    def __init__(self, *args, round_coord: bool = False, **kwargs) -> None:
        self.round_coord = round_coord
        super().__init__(*args, **kwargs)
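
    # Example (illustrative; the field values below are assumptions, not
    # taken from this repo's configs) of enabling the rounding from a model
    # config:
    #
    # bbox_head = dict(
    #     type='DINOSTHead',
    #     num_classes=80,
    #     round_coord=True,
    #     ...)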

    def loss_by_feat_single(self, cls_scores: Tensor, bbox_preds: Tensor,
                            batch_gt_instances: InstanceList,
                            batch_img_metas: List[dict]) -> Tuple[Tensor]:
        """Loss function for outputs from a single decoder layer of a single
        feature level.

        Args:
            cls_scores (Tensor): Box score logits from a single decoder layer
                for all images, has shape (bs, num_queries, cls_out_channels).
            bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
                for all images, with normalized coordinate (cx, cy, w, h) and
                shape (bs, num_queries, 4).
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image,
                e.g., image size, scaling factor, etc.

        Returns:
            Tuple[Tensor]: A tuple including ``loss_cls``, ``loss_bbox`` and
            ``loss_iou``.
        """
num_imgs = cls_scores.size(0)
cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
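        # ``get_targets`` performs per-image label assignment (Hungarian
        # matching in the default DINO configuration) and returns per-query
        # classification/regression targets together with their weights.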
cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
batch_gt_instances, batch_img_metas)
(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
num_total_pos, num_total_neg) = cls_reg_targets
labels = torch.cat(labels_list, 0)
label_weights = torch.cat(label_weights_list, 0)
bbox_targets = torch.cat(bbox_targets_list, 0)
bbox_weights = torch.cat(bbox_weights_list, 0)
# classification loss
cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
# construct weighted avg_factor to match with the official DETR repo
cls_avg_factor = num_total_pos * 1.0 + \
num_total_neg * self.bg_cls_weight
if self.sync_cls_avg_factor:
cls_avg_factor = reduce_mean(
cls_scores.new_tensor([cls_avg_factor]))
cls_avg_factor = max(cls_avg_factor, 1)
loss_cls = self.loss_cls(
cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
# Compute the average number of gt boxes across all gpus, for
# normalization purposes
num_total_pos = loss_cls.new_tensor([num_total_pos])
num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
# construct factors used for rescale bboxes
factors = []
        for img_meta, bbox_pred in zip(batch_img_metas, bbox_preds):
            img_h, img_w = img_meta['img_shape']
            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                           img_h]).unsqueeze(0).repeat(
                                               bbox_pred.size(0), 1)
            factors.append(factor)
factors = torch.cat(factors, 0)
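        # ``factors`` now has shape (bs * num_queries, 4) and matches the
        # flattened ``bbox_preds``/``bbox_targets`` below.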
# DETR regress the relative position of boxes (cxcywh) in the image,
# thus the learning target is normalized by the image size. So here
# we need to re-scale them for calculating IoU loss
        bbox_preds = bbox_preds.reshape(-1, 4)
        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
        if self.round_coord:
            # Snap the rescaled predictions to pixel borders; the rescaled
            # targets below are left un-rounded.
            bboxes = adjust_bbox_to_pixel(bboxes)
        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors

        # regression IoU loss, GIoU loss by default
loss_iou = self.loss_iou(
bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)
# regression L1 loss
loss_bbox = self.loss_bbox(
bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
return loss_cls, loss_bbox, loss_iou
def _predict_by_feat_single(self,
cls_score: Tensor,
bbox_pred: Tensor,
img_meta: dict,
rescale: bool = True) -> InstanceData:
"""Transform outputs from the last decoder layer into bbox predictions
for each image.
Args:
cls_score (Tensor): Box score logits from the last decoder layer
for each image. Shape [num_queries, cls_out_channels].
bbox_pred (Tensor): Sigmoid outputs from the last decoder layer
for each image, with coordinate format (cx, cy, w, h) and
shape [num_queries, 4].
img_meta (dict): Image meta info.
rescale (bool): If True, return boxes in original image
space. Default True.
Returns:
:obj:`InstanceData`: Detection results of each image
after the post process.
Each item usually contains following keys.
- scores (Tensor): Classification scores, has a shape
(num_instance, )
- labels (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes (Tensor): Has a shape (num_instances, 4),
the last dimension 4 arrange as (x1, y1, x2, y2).
"""
assert len(cls_score) == len(bbox_pred) # num_queries
max_per_img = self.test_cfg.get('max_per_img', len(cls_score))
img_shape = img_meta['img_shape']
# exclude background
if self.loss_cls.use_sigmoid:
cls_score = cls_score.sigmoid()
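            # For sigmoid classification, top-k is taken over the flattened
            # (num_queries * num_classes) scores, so each index encodes both
            # the query (index // num_classes) and the class
            # (index % num_classes).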
scores, indexes = cls_score.view(-1).topk(max_per_img)
det_labels = indexes % self.num_classes
bbox_index = indexes // self.num_classes
bbox_pred = bbox_pred[bbox_index]
else:
scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1)
scores, bbox_index = scores.topk(max_per_img)
bbox_pred = bbox_pred[bbox_index]
det_labels = det_labels[bbox_index]
det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred)
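        # Denormalize from [0, 1] to the network input resolution and clamp
        # to the image boundary.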
det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0]
det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1])
det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0])
        if rescale:
            # Rescale to the original image space when a scale factor is
            # available; some pipelines may not set ``scale_factor``.
            if img_meta.get('scale_factor') is not None:
                det_bboxes /= det_bboxes.new_tensor(
                    img_meta['scale_factor']).repeat((1, 2))
results = InstanceData()
results.bboxes = det_bboxes
results.scores = scores
results.labels = det_labels
return results
"""
import matplotlib.pyplot as plt
# 绘制直方图
bboxes_gt
bboxes_gt_np = bboxes_gt.detach().cpu().numpy()
result = bbox_preds.detach().cpu().numpy()
result = (bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1])
result = result.detach().cpu().numpy()
plt.hist(result, bins=50, edgecolor='black')
plt.xlabel('Result')
plt.ylabel('Frequency')
plt.title('Histogram of Result')
plt.show()
""" |