|
|
|
from typing import Dict, List, Tuple |
|
|
|
import torch |
|
from mmengine.structures import InstanceData |
|
from torch import Tensor |
|
import torch.nn.functional as F |
|
from mmdet.registry import MODELS |
|
from mmdet.structures import SampleList |
|
from mmdet.structures.bbox import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh |
|
from mmdet.utils import InstanceList, OptInstanceList, reduce_mean |
|
from ..utils import multi_apply |
|
from .deformable_detr_head import DeformableDETRHead |
|
from .dino_head import DINOHead |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def adjust_bbox_to_pixel(bboxes: Tensor):
    """Snap boxes outward to integer coordinates.

    The first two entries of the last dimension are rounded down with
    ``torch.floor`` and every remaining entry is rounded up with
    ``torch.ceil``. For boxes laid out as ``(x1, y1, x2, y2)`` this gives
    the smallest integer-aligned box that encloses each input box.

    Both roundings are piecewise constant, so no useful gradient flows
    through the returned tensor.

    Args:
        bboxes (Tensor): Boxes with their coordinates on the last
            dimension, e.g. shape (4,), (N, 4) or (B, N, 4). The input is
            left unmodified.

    Returns:
        Tensor: A new tensor with the same shape as ``bboxes`` holding the
        snapped coordinates.
    """
    # Index the last dimension with an ellipsis so any number of leading
    # (batch) dimensions is supported, not only 2-D input.
    top_left = torch.floor(bboxes[..., :2])
    bottom_right = torch.ceil(bboxes[..., 2:])
    return torch.cat((top_left, bottom_right), dim=-1)
|
|
|
def adjust_bbox_to_pixelV2(bboxes: Tensor):
    """Round box corners outward onto the integer grid.

    Every value is first rounded down with ``torch.floor``; the entries
    from index 2 onward along the last dimension are then overwritten with
    their ``torch.ceil`` value. For ``(N, 4)`` input the result is the same
    as that of ``adjust_bbox_to_pixel``.

    Args:
        bboxes (Tensor): Boxes with their coordinates on the last
            dimension, e.g. shape (4,), (N, 4) or (B, N, 4). The input is
            left unmodified.

    Returns:
        Tensor: A new tensor with the same shape as ``bboxes`` holding the
        rounded coordinates.
    """
    # torch.floor allocates a fresh tensor, so the in-place write below
    # never touches the caller's data.
    adjusted_bboxes = torch.floor(bboxes)
    # Ellipsis indexing addresses the last dimension whatever the rank is,
    # so 1-D boxes and batched boxes are handled as well as (N, 4) input.
    adjusted_bboxes[..., 2:] = torch.ceil(bboxes[..., 2:])
    return adjusted_bboxes
|
|
|
@MODELS.register_module()
class DINOSTHead(DINOHead):
    """DINO head whose IoU loss is evaluated on pixel-snapped boxes.

    The class overrides ``loss_by_feat_single`` and
    ``_predict_by_feat_single``; everything else is inherited from
    ``DINOHead``.

    Args:
        *args: Positional arguments forwarded to ``DINOHead.__init__``.
        round_coord (bool): Stored as ``self.round_coord``. Defaults to
            False. NOTE(review): the flag is not read anywhere in this
            class; the snapping in ``loss_by_feat_single`` is applied
            unconditionally -- confirm whether it was meant to be gated.
        **kwargs: Keyword arguments forwarded to ``DINOHead.__init__``.
    """

    def __init__(self, *args, round_coord: bool = False, **kwargs) -> None:
        # The flag is stored before the parent constructor runs; keep this
        # order.
        self.round_coord = round_coord
        super().__init__(*args, **kwargs)

    def loss_by_feat_single(self, cls_scores: Tensor, bbox_preds: Tensor,
                            batch_gt_instances: InstanceList,
                            batch_img_metas: List[dict]) -> Tuple[Tensor]:
        """Compute the losses of one decoder layer.

        Args:
            cls_scores (Tensor): Classification logits of one decoder layer
                for all images, shape (bs, num_queries, cls_out_channels).
            bbox_preds (Tensor): Sigmoid box outputs of the same layer in
                normalized (cx, cy, w, h) format, shape
                (bs, num_queries, 4).
            batch_gt_instances (list[:obj:`InstanceData`]): Ground-truth
                instances per image. It usually includes ``bboxes`` and
                ``labels`` attributes.
            batch_img_metas (list[dict]): Meta information of each image;
                ``img_shape`` is read here and must unpack to (h, w).

        Returns:
            Tuple[Tensor]: ``loss_cls``, ``loss_bbox`` and ``loss_iou``, in
            that order.
        """
        num_imgs = cls_scores.size(0)
        img_indices = range(num_imgs)
        per_img_scores = [cls_scores[idx] for idx in img_indices]
        per_img_boxes = [bbox_preds[idx] for idx in img_indices]

        (labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, num_total_pos,
         num_total_neg) = self.get_targets(per_img_scores, per_img_boxes,
                                           batch_gt_instances,
                                           batch_img_metas)
        labels, label_weights, bbox_targets, bbox_weights = (
            torch.cat(per_img, 0)
            for per_img in (labels_list, label_weights_list,
                            bbox_targets_list, bbox_weights_list))

        # ---- classification loss ----
        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
        # Normaliser: positives count fully, negatives are weighted by
        # bg_cls_weight.
        cls_avg_factor = (
            num_total_pos * 1.0 + num_total_neg * self.bg_cls_weight)
        if self.sync_cls_avg_factor:
            cls_avg_factor = reduce_mean(
                cls_scores.new_tensor([cls_avg_factor]))
        cls_avg_factor = max(cls_avg_factor, 1)
        loss_cls = self.loss_cls(
            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)

        # Normaliser for both box losses: the positive count passed through
        # reduce_mean and clamped to at least 1 to avoid dividing by zero.
        num_total_pos = torch.clamp(
            reduce_mean(loss_cls.new_tensor([num_total_pos])),
            min=1).item()

        def _per_query_scale(img_meta: dict, img_preds: Tensor) -> Tensor:
            # One (w, h, w, h) row per query of this image.
            img_h, img_w = img_meta['img_shape']
            scale = img_preds.new_tensor([img_w, img_h, img_w, img_h])
            return scale.repeat(img_preds.size(0), 1)

        factors = torch.cat([
            _per_query_scale(img_meta, img_preds)
            for img_meta, img_preds in zip(batch_img_metas, bbox_preds)
        ], 0)

        # ---- IoU loss on absolute (x1, y1, x2, y2) boxes ----
        bbox_preds = bbox_preds.reshape(-1, 4)
        # Predictions are scaled to pixels and snapped outward to integer
        # coordinates; the targets are scaled but left unsnapped.
        # NOTE(review): floor/ceil are piecewise constant, so this loss
        # presumably passes no gradient back to bbox_preds -- confirm that
        # this is intended.
        bboxes = adjust_bbox_to_pixel(
            bbox_cxcywh_to_xyxy(bbox_preds) * factors)
        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors
        loss_iou = self.loss_iou(
            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)

        # ---- regression loss on the normalized (cx, cy, w, h) boxes ----
        loss_bbox = self.loss_bbox(
            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
        return loss_cls, loss_bbox, loss_iou

    def _predict_by_feat_single(self,
                                cls_score: Tensor,
                                bbox_pred: Tensor,
                                img_meta: dict,
                                rescale: bool = True) -> InstanceData:
        """Turn the last decoder layer's outputs of one image into boxes.

        Args:
            cls_score (Tensor): Classification logits of the last decoder
                layer for one image, shape [num_queries, cls_out_channels].
            bbox_pred (Tensor): Sigmoid box outputs of the last decoder
                layer for one image in (cx, cy, w, h) format, shape
                [num_queries, 4].
            img_meta (dict): Image meta info. ``img_shape`` is required;
                ``scale_factor`` is used when present and not None.
            rescale (bool): If True, boxes are divided by
                ``img_meta['scale_factor']`` when that entry is available.
                Default True.

        Returns:
            :obj:`InstanceData`: Detection results of the image with the
            following attributes.

            - scores (Tensor): Classification scores, shape
              (num_instances, ).
            - labels (Tensor): Labels of bboxes, shape (num_instances, ).
            - bboxes (Tensor): Shape (num_instances, 4), the last
              dimension arranged as (x1, y1, x2, y2).
        """
        assert len(cls_score) == len(bbox_pred)
        max_per_img = self.test_cfg.get('max_per_img', len(cls_score))
        img_shape = img_meta['img_shape']
        img_h, img_w = img_shape[0], img_shape[1]

        if not self.loss_cls.use_sigmoid:
            # Softmax scores: drop the last channel, take the best class
            # per query, then keep the top-scoring queries.
            probs = F.softmax(cls_score, dim=-1)[..., :-1]
            scores, det_labels = probs.max(-1)
            scores, bbox_index = scores.topk(max_per_img)
            bbox_pred = bbox_pred[bbox_index]
            det_labels = det_labels[bbox_index]
        else:
            # Sigmoid scores: rank every (query, class) pair jointly and
            # decode the flat index back into a class and a query index.
            cls_score = cls_score.sigmoid()
            scores, indexes = cls_score.view(-1).topk(max_per_img)
            det_labels = indexes % self.num_classes
            bbox_index = indexes // self.num_classes
            bbox_pred = bbox_pred[bbox_index]

        det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred)
        # Scale to pixels and clip to the image: x columns by width, y
        # columns by height.
        det_bboxes[:, 0::2] = (det_bboxes[:, 0::2] * img_w).clamp(
            min=0, max=img_w)
        det_bboxes[:, 1::2] = (det_bboxes[:, 1::2] * img_h).clamp(
            min=0, max=img_h)

        # A missing scale factor is tolerated: the boxes are then returned
        # without rescaling.
        if rescale and img_meta.get('scale_factor') is not None:
            det_bboxes /= det_bboxes.new_tensor(
                img_meta['scale_factor']).repeat((1, 2))

        results = InstanceData()
        results.bboxes = det_bboxes
        results.scores = scores
        results.labels = det_labels
        return results
|
|
|
|
|
""" |
|
import matplotlib.pyplot as plt |
|
|
|
# Plot the histogram |
|
bboxes_gt |
|
bboxes_gt_np = bboxes_gt.detach().cpu().numpy() |
|
result = bbox_preds.detach().cpu().numpy() |
|
result = (bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1]) |
|
result = result.detach().cpu().numpy() |
|
plt.hist(result, bins=50, edgecolor='black') |
|
plt.xlabel('Result') |
|
plt.ylabel('Frequency') |
|
plt.title('Histogram of Result') |
|
plt.show() |
|
""" |