Spaces:

scfive
/

samewind

Build error

samewind / mmdet /models /dense_heads /dino_st_head.py

scfive

Resolve README.md conflict and continue rebase

e8f2571 11 days ago

9.08 kB

	# Copyright (c) OpenMMLab. All rights reserved.
	from typing import Dict, List, Tuple

	import torch
	from mmengine.structures import InstanceData
	from torch import Tensor
	import torch.nn.functional as F
	from mmdet.registry import MODELS
	from mmdet.structures import SampleList
	from mmdet.structures.bbox import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh
	from mmdet.utils import InstanceList, OptInstanceList, reduce_mean
	from ..utils import multi_apply
	from .deformable_detr_head import DeformableDETRHead
	from .dino_head import DINOHead

	# def bbox_cxcywh_to_xyxy_1pixel(bbox: Tensor, factors: Tensor) -> Tensor:
	# """Convert bbox coordinates from (cx, cy, w, h) to (x1, y1, x2, y2).
	#
	# Args:
	# bbox (Tensor): Shape (n, 4) for bboxes.
	#
	# Returns:
	# Tensor: Converted bboxes.
	#
	# """
	# bbox1 = bbox*factors
	# cx, cy, w, h = bbox.split((1, 1, 1, 1), dim=-1)
	#
	# bbox_new = [(cx - 0.5 * w), (cy - 0.5 * h), (cx + 0.5 * w), (cy + 0.5 * h)]
	# return torch.cat(bbox_new, dim=-1)

	def adjust_bbox_to_pixel(bboxes: Tensor) -> Tensor:
	    """Snap boxes outwards onto the integer pixel grid.

	    The first two entries of the last dimension are rounded down and every
	    remaining entry is rounded up, so an ``(x1, y1, x2, y2)`` box grows to
	    the smallest integer-aligned box that contains it.

	    Args:
	        bboxes (Tensor): Boxes with their coordinates on the last
	            dimension, e.g. shape (4,), (n, 4) or (bs, n, 4). The input
	            tensor is left untouched.

	    Returns:
	        Tensor: A new tensor with the same shape as ``bboxes``.
	    """
	    # Round everything down first; this yields the top-left corner.
	    adjusted_bboxes = torch.floor(bboxes)
	    # Overwrite the bottom-right corner with the rounded-up values.
	    # ``...`` always addresses the coordinate axis, whereas ``[:, 2:]``
	    # would slice the query axis of a batched (bs, n, 4) input and fail
	    # outright on a single (4,) box.
	    adjusted_bboxes[..., 2:] = torch.ceil(bboxes[..., 2:])
	    return adjusted_bboxes

	def adjust_bbox_to_pixelV2(bboxes: Tensor):
	    """Return a copy of ``bboxes`` with integer-aligned corners.

	    Columns 0 and 1 (the top-left corner) are rounded down and all columns
	    from index 2 onwards (the bottom-right corner) are rounded up. For 2-D
	    input this produces the same result as ``adjust_bbox_to_pixel``.

	    Args:
	        bboxes (Tensor): Boxes indexed as ``bboxes[:, k]``; the visible
	            caller-side convention is (n, 4) in (x1, y1, x2, y2) order.

	    Returns:
	        Tensor: A new tensor; ``bboxes`` itself is not modified.
	    """
	    # Top-left corner: round down.
	    top_left = torch.floor(bboxes[:, :2])
	    # Bottom-right corner: round up.
	    bottom_right = torch.ceil(bboxes[:, 2:])
	    return torch.cat((top_left, bottom_right), dim=1)

	@MODELS.register_module()
	class DINOSTHead(DINOHead):
	    """DINO head that computes its IoU loss on pixel-snapped predictions.

	    Two methods are overridden here:

	    - ``loss_by_feat_single`` expands every predicted box outwards to
	      integer pixel coordinates (``adjust_bbox_to_pixel``) before the IoU
	      loss is evaluated; the L1 loss still uses the normalized boxes.
	    - ``_predict_by_feat_single`` rescales detections only when
	      ``img_meta`` carries a ``scale_factor`` instead of requiring it.

	    Args:
	        *args: Positional arguments forwarded to ``DINOHead``.
	        round_coord (bool): Stored as ``self.round_coord``. Defaults to
	            False. NOTE(review): nothing in this class reads the flag; the
	            pixel snapping in ``loss_by_feat_single`` is unconditional.
	            Confirm whether it was meant to gate that step.
	        **kwargs: Keyword arguments forwarded to ``DINOHead``.
	    """

	    def __init__(self,
	                 *args,
	                 round_coord: bool = False,
	                 **kwargs) -> None:
	        self.round_coord = round_coord
	        # Forward the caller's arguments unchanged. Passing
	        # ``args, *kwargs`` instead would hand the parent one tuple followed
	        # by the keyword *names* as positional arguments.
	        super().__init__(*args, **kwargs)

	    def loss_by_feat_single(self, cls_scores: Tensor, bbox_preds: Tensor,
	                            batch_gt_instances: InstanceList,
	                            batch_img_metas: List[dict]) -> Tuple[Tensor]:
	        """Loss function for outputs from a single decoder layer of a single
	        feature level.

	        Predicted boxes are converted to absolute (x1, y1, x2, y2) pixels
	        and snapped outwards to integer coordinates before the IoU loss;
	        ground-truth boxes are only rescaled, not snapped.

	        Args:
	            cls_scores (Tensor): Box score logits from a single decoder layer
	                for all images, has shape (bs, num_queries, cls_out_channels).
	            bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
	                for all images, with normalized coordinate (cx, cy, w, h) and
	                shape (bs, num_queries, 4).
	            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
	                gt_instance. It usually includes ``bboxes`` and ``labels``
	                attributes.
	            batch_img_metas (list[dict]): Meta information of each image, e.g.,
	                image size, scaling factor, etc. ``img_shape`` must unpack
	                into exactly ``(img_h, img_w)``.

	        Returns:
	            Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and
	            `loss_iou`.
	        """
	        num_imgs = cls_scores.size(0)
	        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
	        bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
	        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
	                                           batch_gt_instances,
	                                           batch_img_metas)
	        (labels_list, label_weights_list, bbox_targets_list,
	         bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets
	        labels = torch.cat(labels_list, 0)
	        label_weights = torch.cat(label_weights_list, 0)
	        bbox_targets = torch.cat(bbox_targets_list, 0)
	        bbox_weights = torch.cat(bbox_weights_list, 0)

	        # classification loss
	        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
	        # construct weighted avg_factor to match with the official DETR repo
	        cls_avg_factor = num_total_pos * 1.0 + \
	            num_total_neg * self.bg_cls_weight
	        if self.sync_cls_avg_factor:
	            cls_avg_factor = reduce_mean(
	                cls_scores.new_tensor([cls_avg_factor]))
	        cls_avg_factor = max(cls_avg_factor, 1)

	        loss_cls = self.loss_cls(
	            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)

	        # Compute the average number of gt boxes across all gpus, for
	        # normalization purposes
	        num_total_pos = loss_cls.new_tensor([num_total_pos])
	        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

	        # construct factors used for rescale bboxes: one (w, h, w, h) row
	        # per query of every image
	        factors = []
	        for img_meta, bbox_pred in zip(batch_img_metas, bbox_preds):
	            img_h, img_w, = img_meta['img_shape']
	            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
	                                           img_h]).unsqueeze(0).repeat(
	                                               bbox_pred.size(0), 1)
	            factors.append(factor)
	        factors = torch.cat(factors, 0)

	        # DETR regress the relative position of boxes (cxcywh) in the image,
	        # thus the learning target is normalized by the image size. So here
	        # we need to re-scale them for calculating IoU loss
	        bbox_preds = bbox_preds.reshape(-1, 4)
	        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
	        # Snap the predictions outwards to whole pixels; only the predicted
	        # boxes are snapped, the targets below keep sub-pixel precision.
	        # NOTE(review): floor/ceil are piecewise constant, so presumably no
	        # gradient reaches bbox_preds through loss_iou - confirm intended.
	        bboxes = adjust_bbox_to_pixel(bboxes)
	        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors

	        # regression IoU loss, defaultly GIoU loss
	        loss_iou = self.loss_iou(
	            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)

	        # regression L1 loss (on the normalized cxcywh boxes)
	        loss_bbox = self.loss_bbox(
	            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
	        return loss_cls, loss_bbox, loss_iou

	    def _predict_by_feat_single(self,
	                                cls_score: Tensor,
	                                bbox_pred: Tensor,
	                                img_meta: dict,
	                                rescale: bool = True) -> InstanceData:
	        """Transform outputs from the last decoder layer into bbox predictions
	        for each image.

	        Args:
	            cls_score (Tensor): Box score logits from the last decoder layer
	                for each image. Shape [num_queries, cls_out_channels].
	            bbox_pred (Tensor): Sigmoid outputs from the last decoder layer
	                for each image, with coordinate format (cx, cy, w, h) and
	                shape [num_queries, 4].
	            img_meta (dict): Image meta info. ``img_shape`` is read as
	                (height, width); ``scale_factor`` is optional.
	            rescale (bool): If True, return boxes in original image
	                space, provided ``img_meta`` has a ``scale_factor``.
	                Default True.

	        Returns:
	            :obj:`InstanceData`: Detection results of each image
	            after the post process.
	            Each item usually contains following keys.

	            - scores (Tensor): Classification scores, has a shape
	              (num_instance, )
	            - labels (Tensor): Labels of bboxes, has a shape
	              (num_instances, ).
	            - bboxes (Tensor): Has a shape (num_instances, 4),
	              the last dimension 4 arrange as (x1, y1, x2, y2).
	        """
	        assert len(cls_score) == len(bbox_pred)  # num_queries
	        max_per_img = self.test_cfg.get('max_per_img', len(cls_score))
	        img_shape = img_meta['img_shape']
	        # exclude background
	        if self.loss_cls.use_sigmoid:
	            cls_score = cls_score.sigmoid()
	            # top-k over the flattened (query, class) score grid; the flat
	            # index is decoded back into a class id and a query id
	            scores, indexes = cls_score.view(-1).topk(max_per_img)
	            det_labels = indexes % self.num_classes
	            bbox_index = indexes // self.num_classes
	            bbox_pred = bbox_pred[bbox_index]
	        else:
	            # the last softmax channel is dropped before taking the best
	            # class per query
	            scores, det_labels = F.softmax(
	                cls_score, dim=-1)[..., :-1].max(-1)
	            scores, bbox_index = scores.topk(max_per_img)
	            bbox_pred = bbox_pred[bbox_index]
	            det_labels = det_labels[bbox_index]

	        # normalized cxcywh -> absolute xyxy, clipped to the image bounds
	        det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred)
	        det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
	        det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0]
	        det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1])
	        det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0])

	        if rescale:
	            # A missing ``scale_factor`` is tolerated: the boxes are then
	            # returned in the ``img_shape`` coordinate frame instead of
	            # failing an assertion.
	            if img_meta.get('scale_factor') is not None:
	                det_bboxes /= det_bboxes.new_tensor(
	                    img_meta['scale_factor']).repeat((1, 2))

	        results = InstanceData()
	        results.bboxes = det_bboxes
	        results.scores = scores
	        results.labels = det_labels
	        return results


	"""
	import matplotlib.pyplot as plt

	# 绘制直方图
	bboxes_gt
	bboxes_gt_np = bboxes_gt.detach().cpu().numpy()
	result = bbox_preds.detach().cpu().numpy()
	result = (bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1])
	result = result.detach().cpu().numpy()
	plt.hist(result, bins=50, edgecolor='black')
	plt.xlabel('Result')
	plt.ylabel('Frequency')
	plt.title('Histogram of Result')
	plt.show()
	"""