Spaces:

scfive
/

samewind

Build error

samewind / mmdet /models /dense_heads /guided_anchor_head.py

scfive

Resolve README.md conflict and continue rebase

e8f2571 11 days ago

44.1 kB

	# Copyright (c) OpenMMLab. All rights reserved.
	from typing import List, Optional, Tuple

	import torch
	import torch.nn as nn
	from mmcv.ops import DeformConv2d, MaskedConv2d
	from mmengine.model import BaseModule
	from mmengine.structures import InstanceData
	from torch import Tensor

	from mmdet.registry import MODELS, TASK_UTILS
	from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType,
	OptInstanceList)
	from ..layers import multiclass_nms
	from ..task_modules.prior_generators import anchor_inside_flags, calc_region
	from ..task_modules.samplers import PseudoSampler
	from ..utils import images_to_levels, multi_apply, unmap
	from .anchor_head import AnchorHead


	class FeatureAdaption(BaseModule):
	"""Feature Adaption Module.

	Feature Adaption Module is implemented based on DCN v1.
	It uses anchor shape prediction rather than feature map to
	predict offsets of deform conv layer.

	Args:
	in_channels (int): Number of channels in the input feature map.
	out_channels (int): Number of channels in the output feature map.
	kernel_size (int): Deformable conv kernel size. Defaults to 3.
	deform_groups (int): Deformable conv group size. Defaults to 4.
	init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \
	list[dict], optional): Initialization config dict.
	"""

	def __init__(
	self,
	in_channels: int,
	out_channels: int,
	kernel_size: int = 3,
	deform_groups: int = 4,
	init_cfg: MultiConfig = dict(
	type='Normal',
	layer='Conv2d',
	std=0.1,
	override=dict(type='Normal', name='conv_adaption', std=0.01))
	) -> None:
	super().__init__(init_cfg=init_cfg)
	offset_channels = kernel_size * kernel_size * 2
	self.conv_offset = nn.Conv2d(
	2, deform_groups * offset_channels, 1, bias=False)
	self.conv_adaption = DeformConv2d(
	in_channels,
	out_channels,
	kernel_size=kernel_size,
	padding=(kernel_size - 1) // 2,
	deform_groups=deform_groups)
	self.relu = nn.ReLU(inplace=True)

	def forward(self, x: Tensor, shape: Tensor) -> Tensor:
	offset = self.conv_offset(shape.detach())
	x = self.relu(self.conv_adaption(x, offset))
	return x


	@MODELS.register_module()
	class GuidedAnchorHead(AnchorHead):
	"""Guided-Anchor-based head (GA-RPN, GA-RetinaNet, etc.).

	This GuidedAnchorHead will predict high-quality feature guided
	anchors and locations where anchors will be kept in inference.
	There are mainly 3 categories of bounding-boxes.

	- Sampled 9 pairs for target assignment. (approxes)
	- The square boxes where the predicted anchors are based on. (squares)
	- Guided anchors.

	Please refer to https://arxiv.org/abs/1901.03278 for more details.

	Args:
	num_classes (int): Number of classes.
	in_channels (int): Number of channels in the input feature map.
	feat_channels (int): Number of hidden channels. Defaults to 256.
	approx_anchor_generator (:obj:`ConfigDict` or dict): Config dict
	for approx generator
	square_anchor_generator (:obj:`ConfigDict` or dict): Config dict
	for square generator
	anchor_coder (:obj:`ConfigDict` or dict): Config dict for anchor coder
	bbox_coder (:obj:`ConfigDict` or dict): Config dict for bbox coder
	reg_decoded_bbox (bool): If true, the regression loss would be
	applied directly on decoded bounding boxes, converting both
	the predicted boxes and regression targets to absolute
	coordinates format. Defaults to False. It should be `True` when
	using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
	deform_groups: (int): Group number of DCN in FeatureAdaption module.
	Defaults to 4.
	loc_filter_thr (float): Threshold to filter out unconcerned regions.
	Defaults to 0.01.
	loss_loc (:obj:`ConfigDict` or dict): Config of location loss.
	loss_shape (:obj:`ConfigDict` or dict): Config of anchor shape loss.
	loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
	loss_bbox (:obj:`ConfigDict` or dict): Config of bbox regression loss.
	init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \
	list[dict], optional): Initialization config dict.
	"""

	def __init__(
	self,
	num_classes: int,
	in_channels: int,
	feat_channels: int = 256,
	approx_anchor_generator: ConfigType = dict(
	type='AnchorGenerator',
	octave_base_scale=8,
	scales_per_octave=3,
	ratios=[0.5, 1.0, 2.0],
	strides=[4, 8, 16, 32, 64]),
	square_anchor_generator: ConfigType = dict(
	type='AnchorGenerator',
	ratios=[1.0],
	scales=[8],
	strides=[4, 8, 16, 32, 64]),
	anchor_coder: ConfigType = dict(
	type='DeltaXYWHBBoxCoder',
	target_means=[.0, .0, .0, .0],
	target_stds=[1.0, 1.0, 1.0, 1.0]),
	bbox_coder: ConfigType = dict(
	type='DeltaXYWHBBoxCoder',
	target_means=[.0, .0, .0, .0],
	target_stds=[1.0, 1.0, 1.0, 1.0]),
	reg_decoded_bbox: bool = False,
	deform_groups: int = 4,
	loc_filter_thr: float = 0.01,
	train_cfg: OptConfigType = None,
	test_cfg: OptConfigType = None,
	loss_loc: ConfigType = dict(
	type='FocalLoss',
	use_sigmoid=True,
	gamma=2.0,
	alpha=0.25,
	loss_weight=1.0),
	loss_shape: ConfigType = dict(
	type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
	loss_cls: ConfigType = dict(
	type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
	loss_bbox: ConfigType = dict(
	type='SmoothL1Loss', beta=1.0, loss_weight=1.0),
	init_cfg: MultiConfig = dict(
	type='Normal',
	layer='Conv2d',
	std=0.01,
	override=dict(
	type='Normal', name='conv_loc', std=0.01, lbias_prob=0.01))
	) -> None:
	super(AnchorHead, self).__init__(init_cfg=init_cfg)
	self.in_channels = in_channels
	self.num_classes = num_classes
	self.feat_channels = feat_channels
	self.deform_groups = deform_groups
	self.loc_filter_thr = loc_filter_thr

	# build approx_anchor_generator and square_anchor_generator
	assert (approx_anchor_generator['octave_base_scale'] ==
	square_anchor_generator['scales'][0])
	assert (approx_anchor_generator['strides'] ==
	square_anchor_generator['strides'])
	self.approx_anchor_generator = TASK_UTILS.build(
	approx_anchor_generator)
	self.square_anchor_generator = TASK_UTILS.build(
	square_anchor_generator)
	self.approxs_per_octave = self.approx_anchor_generator \
	.num_base_priors[0]

	self.reg_decoded_bbox = reg_decoded_bbox

	# one anchor per location
	self.num_base_priors = self.square_anchor_generator.num_base_priors[0]

	self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
	self.loc_focal_loss = loss_loc['type'] in ['FocalLoss']
	if self.use_sigmoid_cls:
	self.cls_out_channels = self.num_classes
	else:
	self.cls_out_channels = self.num_classes + 1

	# build bbox_coder
	self.anchor_coder = TASK_UTILS.build(anchor_coder)
	self.bbox_coder = TASK_UTILS.build(bbox_coder)

	# build losses
	self.loss_loc = MODELS.build(loss_loc)
	self.loss_shape = MODELS.build(loss_shape)
	self.loss_cls = MODELS.build(loss_cls)
	self.loss_bbox = MODELS.build(loss_bbox)

	self.train_cfg = train_cfg
	self.test_cfg = test_cfg

	if self.train_cfg:
	self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
	# use PseudoSampler when no sampler in train_cfg
	if train_cfg.get('sampler', None) is not None:
	self.sampler = TASK_UTILS.build(
	self.train_cfg['sampler'], default_args=dict(context=self))
	else:
	self.sampler = PseudoSampler()

	self.ga_assigner = TASK_UTILS.build(self.train_cfg['ga_assigner'])
	if train_cfg.get('ga_sampler', None) is not None:
	self.ga_sampler = TASK_UTILS.build(
	self.train_cfg['ga_sampler'],
	default_args=dict(context=self))
	else:
	self.ga_sampler = PseudoSampler()

	self._init_layers()

	def _init_layers(self) -> None:
	"""Initialize layers of the head."""
	self.relu = nn.ReLU(inplace=True)
	self.conv_loc = nn.Conv2d(self.in_channels, 1, 1)
	self.conv_shape = nn.Conv2d(self.in_channels, self.num_base_priors * 2,
	1)
	self.feature_adaption = FeatureAdaption(
	self.in_channels,
	self.feat_channels,
	kernel_size=3,
	deform_groups=self.deform_groups)
	self.conv_cls = MaskedConv2d(
	self.feat_channels, self.num_base_priors * self.cls_out_channels,
	1)
	self.conv_reg = MaskedConv2d(self.feat_channels,
	self.num_base_priors * 4, 1)

	def forward_single(self, x: Tensor) -> Tuple[Tensor]:
	"""Forward feature of a single scale level."""
	loc_pred = self.conv_loc(x)
	shape_pred = self.conv_shape(x)
	x = self.feature_adaption(x, shape_pred)
	# masked conv is only used during inference for speed-up
	if not self.training:
	mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr
	else:
	mask = None
	cls_score = self.conv_cls(x, mask)
	bbox_pred = self.conv_reg(x, mask)
	return cls_score, bbox_pred, shape_pred, loc_pred

	def forward(self, x: List[Tensor]) -> Tuple[List[Tensor]]:
	"""Forward features from the upstream network."""
	return multi_apply(self.forward_single, x)

	def get_sampled_approxs(self,
	featmap_sizes: List[Tuple[int, int]],
	batch_img_metas: List[dict],
	device: str = 'cuda') -> tuple:
	"""Get sampled approxs and inside flags according to feature map sizes.

	Args:
	featmap_sizes (list[tuple]): Multi-level feature map sizes.
	batch_img_metas (list[dict]): Image meta info.
	device (str): device for returned tensors

	Returns:
	tuple: approxes of each image, inside flags of each image
	"""
	num_imgs = len(batch_img_metas)

	# since feature map sizes of all images are the same, we only compute
	# approxes for one time
	multi_level_approxs = self.approx_anchor_generator.grid_priors(
	featmap_sizes, device=device)
	approxs_list = [multi_level_approxs for _ in range(num_imgs)]

	# for each image, we compute inside flags of multi level approxes
	inside_flag_list = []
	for img_id, img_meta in enumerate(batch_img_metas):
	multi_level_flags = []
	multi_level_approxs = approxs_list[img_id]

	# obtain valid flags for each approx first
	multi_level_approx_flags = self.approx_anchor_generator \
	.valid_flags(featmap_sizes,
	img_meta['pad_shape'],
	device=device)

	for i, flags in enumerate(multi_level_approx_flags):
	approxs = multi_level_approxs[i]
	inside_flags_list = []
	for j in range(self.approxs_per_octave):
	split_valid_flags = flags[j::self.approxs_per_octave]
	split_approxs = approxs[j::self.approxs_per_octave, :]
	inside_flags = anchor_inside_flags(
	split_approxs, split_valid_flags,
	img_meta['img_shape'][:2],
	self.train_cfg['allowed_border'])
	inside_flags_list.append(inside_flags)
	# inside_flag for a position is true if any anchor in this
	# position is true
	inside_flags = (
	torch.stack(inside_flags_list, 0).sum(dim=0) > 0)
	multi_level_flags.append(inside_flags)
	inside_flag_list.append(multi_level_flags)
	return approxs_list, inside_flag_list

	def get_anchors(self,
	featmap_sizes: List[Tuple[int, int]],
	shape_preds: List[Tensor],
	loc_preds: List[Tensor],
	batch_img_metas: List[dict],
	use_loc_filter: bool = False,
	device: str = 'cuda') -> tuple:
	"""Get squares according to feature map sizes and guided anchors.

	Args:
	featmap_sizes (list[tuple]): Multi-level feature map sizes.
	shape_preds (list[tensor]): Multi-level shape predictions.
	loc_preds (list[tensor]): Multi-level location predictions.
	batch_img_metas (list[dict]): Image meta info.
	use_loc_filter (bool): Use loc filter or not. Defaults to False
	device (str): device for returned tensors.
	Defaults to `cuda`.

	Returns:
	tuple: square approxs of each image, guided anchors of each image,
	loc masks of each image.
	"""
	num_imgs = len(batch_img_metas)
	num_levels = len(featmap_sizes)

	# since feature map sizes of all images are the same, we only compute
	# squares for one time
	multi_level_squares = self.square_anchor_generator.grid_priors(
	featmap_sizes, device=device)
	squares_list = [multi_level_squares for _ in range(num_imgs)]

	# for each image, we compute multi level guided anchors
	guided_anchors_list = []
	loc_mask_list = []
	for img_id, img_meta in enumerate(batch_img_metas):
	multi_level_guided_anchors = []
	multi_level_loc_mask = []
	for i in range(num_levels):
	squares = squares_list[img_id][i]
	shape_pred = shape_preds[i][img_id]
	loc_pred = loc_preds[i][img_id]
	guided_anchors, loc_mask = self._get_guided_anchors_single(
	squares,
	shape_pred,
	loc_pred,
	use_loc_filter=use_loc_filter)
	multi_level_guided_anchors.append(guided_anchors)
	multi_level_loc_mask.append(loc_mask)
	guided_anchors_list.append(multi_level_guided_anchors)
	loc_mask_list.append(multi_level_loc_mask)
	return squares_list, guided_anchors_list, loc_mask_list

	def _get_guided_anchors_single(
	self,
	squares: Tensor,
	shape_pred: Tensor,
	loc_pred: Tensor,
	use_loc_filter: bool = False) -> Tuple[Tensor]:
	"""Get guided anchors and loc masks for a single level.

	Args:
	squares (tensor): Squares of a single level.
	shape_pred (tensor): Shape predictions of a single level.
	loc_pred (tensor): Loc predictions of a single level.
	use_loc_filter (list[tensor]): Use loc filter or not.
	Defaults to False.

	Returns:
	tuple: guided anchors, location masks
	"""
	# calculate location filtering mask
	loc_pred = loc_pred.sigmoid().detach()
	if use_loc_filter:
	loc_mask = loc_pred >= self.loc_filter_thr
	else:
	loc_mask = loc_pred >= 0.0
	mask = loc_mask.permute(1, 2, 0).expand(-1, -1, self.num_base_priors)
	mask = mask.contiguous().view(-1)
	# calculate guided anchors
	squares = squares[mask]
	anchor_deltas = shape_pred.permute(1, 2, 0).contiguous().view(
	-1, 2).detach()[mask]
	bbox_deltas = anchor_deltas.new_full(squares.size(), 0)
	bbox_deltas[:, 2:] = anchor_deltas
	guided_anchors = self.anchor_coder.decode(
	squares, bbox_deltas, wh_ratio_clip=1e-6)
	return guided_anchors, mask

	def ga_loc_targets(self, batch_gt_instances: InstanceList,
	featmap_sizes: List[Tuple[int, int]]) -> tuple:
	"""Compute location targets for guided anchoring.

	Each feature map is divided into positive, negative and ignore regions.
	- positive regions: target 1, weight 1
	- ignore regions: target 0, weight 0
	- negative regions: target 0, weight 0.1

	Args:
	batch_gt_instances (list[:obj:`InstanceData`]): Batch of
	gt_instance. It usually includes ``bboxes`` and ``labels``
	attributes.
	featmap_sizes (list[tuple]): Multi level sizes of each feature
	maps.

	Returns:
	tuple: Returns a tuple containing location targets.
	"""
	anchor_scale = self.approx_anchor_generator.octave_base_scale
	anchor_strides = self.approx_anchor_generator.strides
	# Currently only supports same stride in x and y direction.
	for stride in anchor_strides:
	assert (stride[0] == stride[1])
	anchor_strides = [stride[0] for stride in anchor_strides]

	center_ratio = self.train_cfg['center_ratio']
	ignore_ratio = self.train_cfg['ignore_ratio']
	img_per_gpu = len(batch_gt_instances)
	num_lvls = len(featmap_sizes)
	r1 = (1 - center_ratio) / 2
	r2 = (1 - ignore_ratio) / 2
	all_loc_targets = []
	all_loc_weights = []
	all_ignore_map = []
	for lvl_id in range(num_lvls):
	h, w = featmap_sizes[lvl_id]
	loc_targets = torch.zeros(
	img_per_gpu,
	1,
	h,
	w,
	device=batch_gt_instances[0].bboxes.device,
	dtype=torch.float32)
	loc_weights = torch.full_like(loc_targets, -1)
	ignore_map = torch.zeros_like(loc_targets)
	all_loc_targets.append(loc_targets)
	all_loc_weights.append(loc_weights)
	all_ignore_map.append(ignore_map)
	for img_id in range(img_per_gpu):
	gt_bboxes = batch_gt_instances[img_id].bboxes
	scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) *
	(gt_bboxes[:, 3] - gt_bboxes[:, 1]))
	min_anchor_size = scale.new_full(
	(1, ), float(anchor_scale * anchor_strides[0]))
	# assign gt bboxes to different feature levels w.r.t. their scales
	target_lvls = torch.floor(
	torch.log2(scale) - torch.log2(min_anchor_size) + 0.5)
	target_lvls = target_lvls.clamp(min=0, max=num_lvls - 1).long()
	for gt_id in range(gt_bboxes.size(0)):
	lvl = target_lvls[gt_id].item()
	# rescaled to corresponding feature map
	gt_ = gt_bboxes[gt_id, :4] / anchor_strides[lvl]
	# calculate ignore regions
	ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
	gt_, r2, featmap_sizes[lvl])
	# calculate positive (center) regions
	ctr_x1, ctr_y1, ctr_x2, ctr_y2 = calc_region(
	gt_, r1, featmap_sizes[lvl])
	all_loc_targets[lvl][img_id, 0, ctr_y1:ctr_y2 + 1,
	ctr_x1:ctr_x2 + 1] = 1
	all_loc_weights[lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
	ignore_x1:ignore_x2 + 1] = 0
	all_loc_weights[lvl][img_id, 0, ctr_y1:ctr_y2 + 1,
	ctr_x1:ctr_x2 + 1] = 1
	# calculate ignore map on nearby low level feature
	if lvl > 0:
	d_lvl = lvl - 1
	# rescaled to corresponding feature map
	gt_ = gt_bboxes[gt_id, :4] / anchor_strides[d_lvl]
	ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
	gt_, r2, featmap_sizes[d_lvl])
	all_ignore_map[d_lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
	ignore_x1:ignore_x2 + 1] = 1
	# calculate ignore map on nearby high level feature
	if lvl < num_lvls - 1:
	u_lvl = lvl + 1
	# rescaled to corresponding feature map
	gt_ = gt_bboxes[gt_id, :4] / anchor_strides[u_lvl]
	ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
	gt_, r2, featmap_sizes[u_lvl])
	all_ignore_map[u_lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
	ignore_x1:ignore_x2 + 1] = 1
	for lvl_id in range(num_lvls):
	# ignore negative regions w.r.t. ignore map
	all_loc_weights[lvl_id][(all_loc_weights[lvl_id] < 0)
	& (all_ignore_map[lvl_id] > 0)] = 0
	# set negative regions with weight 0.1
	all_loc_weights[lvl_id][all_loc_weights[lvl_id] < 0] = 0.1
	# loc average factor to balance loss
	loc_avg_factor = sum(
	[t.size(0) * t.size(-1) * t.size(-2)
	for t in all_loc_targets]) / 200
	return all_loc_targets, all_loc_weights, loc_avg_factor

	def _ga_shape_target_single(self,
	flat_approxs: Tensor,
	inside_flags: Tensor,
	flat_squares: Tensor,
	gt_instances: InstanceData,
	gt_instances_ignore: Optional[InstanceData],
	img_meta: dict,
	unmap_outputs: bool = True) -> tuple:
	"""Compute guided anchoring targets.

	This function returns sampled anchors and gt bboxes directly
	rather than calculates regression targets.

	Args:
	flat_approxs (Tensor): flat approxs of a single image,
	shape (n, 4)
	inside_flags (Tensor): inside flags of a single image,
	shape (n, ).
	flat_squares (Tensor): flat squares of a single image,
	shape (approxs_per_octave * n, 4)
	gt_instances (:obj:`InstanceData`): Ground truth of instance
	annotations. It usually includes ``bboxes`` and ``labels``
	attributes.
	gt_instances_ignore (:obj:`InstanceData`, optional): Instances
	to be ignored during training. It includes ``bboxes`` attribute
	data that is ignored during training and testing.
	img_meta (dict): Meta info of a single image.
	unmap_outputs (bool): unmap outputs or not.

	Returns:
	tuple: Returns a tuple containing shape targets of each image.
	"""
	if not inside_flags.any():
	raise ValueError(
	'There is no valid anchor inside the image boundary. Please '
	'check the image size and anchor sizes, or set '
	'``allowed_border`` to -1 to skip the condition.')
	# assign gt and sample anchors
	num_square = flat_squares.size(0)
	approxs = flat_approxs.view(num_square, self.approxs_per_octave, 4)
	approxs = approxs[inside_flags, ...]
	squares = flat_squares[inside_flags, :]

	pred_instances = InstanceData()
	pred_instances.priors = squares
	pred_instances.approxs = approxs

	assign_result = self.ga_assigner.assign(
	pred_instances=pred_instances,
	gt_instances=gt_instances,
	gt_instances_ignore=gt_instances_ignore)
	sampling_result = self.ga_sampler.sample(
	assign_result=assign_result,
	pred_instances=pred_instances,
	gt_instances=gt_instances)

	bbox_anchors = torch.zeros_like(squares)
	bbox_gts = torch.zeros_like(squares)
	bbox_weights = torch.zeros_like(squares)

	pos_inds = sampling_result.pos_inds
	neg_inds = sampling_result.neg_inds
	if len(pos_inds) > 0:
	bbox_anchors[pos_inds, :] = sampling_result.pos_bboxes
	bbox_gts[pos_inds, :] = sampling_result.pos_gt_bboxes
	bbox_weights[pos_inds, :] = 1.0

	# map up to original set of anchors
	if unmap_outputs:
	num_total_anchors = flat_squares.size(0)
	bbox_anchors = unmap(bbox_anchors, num_total_anchors, inside_flags)
	bbox_gts = unmap(bbox_gts, num_total_anchors, inside_flags)
	bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)

	return (bbox_anchors, bbox_gts, bbox_weights, pos_inds, neg_inds,
	sampling_result)

	def ga_shape_targets(self,
	approx_list: List[List[Tensor]],
	inside_flag_list: List[List[Tensor]],
	square_list: List[List[Tensor]],
	batch_gt_instances: InstanceList,
	batch_img_metas: List[dict],
	batch_gt_instances_ignore: OptInstanceList = None,
	unmap_outputs: bool = True) -> tuple:
	"""Compute guided anchoring targets.

	Args:
	approx_list (list[list[Tensor]]): Multi level approxs of each
	image.
	inside_flag_list (list[list[Tensor]]): Multi level inside flags
	of each image.
	square_list (list[list[Tensor]]): Multi level squares of each
	image.
	batch_gt_instances (list[:obj:`InstanceData`]): Batch of
	gt_instance. It usually includes ``bboxes`` and ``labels``
	attributes.
	batch_img_metas (list[dict]): Meta information of each image, e.g.,
	image size, scaling factor, etc.
	batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
	Batch of gt_instances_ignore. It includes ``bboxes`` attribute
	data that is ignored during training and testing.
	Defaults to None.
	unmap_outputs (bool): unmap outputs or not. Defaults to None.

	Returns:
	tuple: Returns a tuple containing shape targets.
	"""
	num_imgs = len(batch_img_metas)
	assert len(approx_list) == len(inside_flag_list) == len(
	square_list) == num_imgs
	# anchor number of multi levels
	num_level_squares = [squares.size(0) for squares in square_list[0]]
	# concat all level anchors and flags to a single tensor
	inside_flag_flat_list = []
	approx_flat_list = []
	square_flat_list = []
	for i in range(num_imgs):
	assert len(square_list[i]) == len(inside_flag_list[i])
	inside_flag_flat_list.append(torch.cat(inside_flag_list[i]))
	approx_flat_list.append(torch.cat(approx_list[i]))
	square_flat_list.append(torch.cat(square_list[i]))

	# compute targets for each image
	if batch_gt_instances_ignore is None:
	batch_gt_instances_ignore = [None for _ in range(num_imgs)]
	(all_bbox_anchors, all_bbox_gts, all_bbox_weights, pos_inds_list,
	neg_inds_list, sampling_results_list) = multi_apply(
	self._ga_shape_target_single,
	approx_flat_list,
	inside_flag_flat_list,
	square_flat_list,
	batch_gt_instances,
	batch_gt_instances_ignore,
	batch_img_metas,
	unmap_outputs=unmap_outputs)
	# sampled anchors of all images
	avg_factor = sum(
	[results.avg_factor for results in sampling_results_list])
	# split targets to a list w.r.t. multiple levels
	bbox_anchors_list = images_to_levels(all_bbox_anchors,
	num_level_squares)
	bbox_gts_list = images_to_levels(all_bbox_gts, num_level_squares)
	bbox_weights_list = images_to_levels(all_bbox_weights,
	num_level_squares)
	return (bbox_anchors_list, bbox_gts_list, bbox_weights_list,
	avg_factor)

	def loss_shape_single(self, shape_pred: Tensor, bbox_anchors: Tensor,
	bbox_gts: Tensor, anchor_weights: Tensor,
	avg_factor: int) -> Tensor:
	"""Compute shape loss in single level."""
	shape_pred = shape_pred.permute(0, 2, 3, 1).contiguous().view(-1, 2)
	bbox_anchors = bbox_anchors.contiguous().view(-1, 4)
	bbox_gts = bbox_gts.contiguous().view(-1, 4)
	anchor_weights = anchor_weights.contiguous().view(-1, 4)
	bbox_deltas = bbox_anchors.new_full(bbox_anchors.size(), 0)
	bbox_deltas[:, 2:] += shape_pred
	# filter out negative samples to speed-up weighted_bounded_iou_loss
	inds = torch.nonzero(
	anchor_weights[:, 0] > 0, as_tuple=False).squeeze(1)
	bbox_deltas_ = bbox_deltas[inds]
	bbox_anchors_ = bbox_anchors[inds]
	bbox_gts_ = bbox_gts[inds]
	anchor_weights_ = anchor_weights[inds]
	pred_anchors_ = self.anchor_coder.decode(
	bbox_anchors_, bbox_deltas_, wh_ratio_clip=1e-6)
	loss_shape = self.loss_shape(
	pred_anchors_, bbox_gts_, anchor_weights_, avg_factor=avg_factor)
	return loss_shape

	def loss_loc_single(self, loc_pred: Tensor, loc_target: Tensor,
	loc_weight: Tensor, avg_factor: float) -> Tensor:
	"""Compute location loss in single level."""
	loss_loc = self.loss_loc(
	loc_pred.reshape(-1, 1),
	loc_target.reshape(-1).long(),
	loc_weight.reshape(-1),
	avg_factor=avg_factor)
	return loss_loc

	def loss_by_feat(
	self,
	cls_scores: List[Tensor],
	bbox_preds: List[Tensor],
	shape_preds: List[Tensor],
	loc_preds: List[Tensor],
	batch_gt_instances: InstanceList,
	batch_img_metas: List[dict],
	batch_gt_instances_ignore: OptInstanceList = None) -> dict:
	"""Calculate the loss based on the features extracted by the detection
	head.

	Args:
	cls_scores (list[Tensor]): Box scores for each scale level
	has shape (N, num_anchors * num_classes, H, W).
	bbox_preds (list[Tensor]): Box energies / deltas for each scale
	level with shape (N, num_anchors * 4, H, W).
	shape_preds (list[Tensor]): shape predictions for each scale
	level with shape (N, 1, H, W).
	loc_preds (list[Tensor]): location predictions for each scale
	level with shape (N, num_anchors * 2, H, W).
	batch_gt_instances (list[:obj:`InstanceData`]): Batch of
	gt_instance. It usually includes ``bboxes`` and ``labels``
	attributes.
	batch_img_metas (list[dict]): Meta information of each image, e.g.,
	image size, scaling factor, etc.
	batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
	Batch of gt_instances_ignore. It includes ``bboxes`` attribute
	data that is ignored during training and testing.
	Defaults to None.

	Returns:
	dict: A dictionary of loss components.
	"""

	featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
	assert len(featmap_sizes) == self.approx_anchor_generator.num_levels

	device = cls_scores[0].device

	# get loc targets
	loc_targets, loc_weights, loc_avg_factor = self.ga_loc_targets(
	batch_gt_instances, featmap_sizes)

	# get sampled approxes
	approxs_list, inside_flag_list = self.get_sampled_approxs(
	featmap_sizes, batch_img_metas, device=device)
	# get squares and guided anchors
	squares_list, guided_anchors_list, _ = self.get_anchors(
	featmap_sizes,
	shape_preds,
	loc_preds,
	batch_img_metas,
	device=device)

	# get shape targets
	shape_targets = self.ga_shape_targets(approxs_list, inside_flag_list,
	squares_list, batch_gt_instances,
	batch_img_metas)
	(bbox_anchors_list, bbox_gts_list, anchor_weights_list,
	ga_avg_factor) = shape_targets

	# get anchor targets
	cls_reg_targets = self.get_targets(
	guided_anchors_list,
	inside_flag_list,
	batch_gt_instances,
	batch_img_metas,
	batch_gt_instances_ignore=batch_gt_instances_ignore)
	(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
	avg_factor) = cls_reg_targets

	# anchor number of multi levels
	num_level_anchors = [
	anchors.size(0) for anchors in guided_anchors_list[0]
	]
	# concat all level anchors to a single tensor
	concat_anchor_list = []
	for i in range(len(guided_anchors_list)):
	concat_anchor_list.append(torch.cat(guided_anchors_list[i]))
	all_anchor_list = images_to_levels(concat_anchor_list,
	num_level_anchors)

	# get classification and bbox regression losses
	losses_cls, losses_bbox = multi_apply(
	self.loss_by_feat_single,
	cls_scores,
	bbox_preds,
	all_anchor_list,
	labels_list,
	label_weights_list,
	bbox_targets_list,
	bbox_weights_list,
	avg_factor=avg_factor)

	# get anchor location loss
	losses_loc = []
	for i in range(len(loc_preds)):
	loss_loc = self.loss_loc_single(
	loc_preds[i],
	loc_targets[i],
	loc_weights[i],
	avg_factor=loc_avg_factor)
	losses_loc.append(loss_loc)

	# get anchor shape loss
	losses_shape = []
	for i in range(len(shape_preds)):
	loss_shape = self.loss_shape_single(
	shape_preds[i],
	bbox_anchors_list[i],
	bbox_gts_list[i],
	anchor_weights_list[i],
	avg_factor=ga_avg_factor)
	losses_shape.append(loss_shape)

	return dict(
	loss_cls=losses_cls,
	loss_bbox=losses_bbox,
	loss_shape=losses_shape,
	loss_loc=losses_loc)

	def predict_by_feat(self,
	cls_scores: List[Tensor],
	bbox_preds: List[Tensor],
	shape_preds: List[Tensor],
	loc_preds: List[Tensor],
	batch_img_metas: List[dict],
	cfg: OptConfigType = None,
	rescale: bool = False) -> InstanceList:
	"""Transform a batch of output features extracted from the head into
	bbox results.

	Args:
	cls_scores (list[Tensor]): Classification scores for all
	scale levels, each is a 4D-tensor, has shape
	(batch_size, num_priors * num_classes, H, W).
	bbox_preds (list[Tensor]): Box energies / deltas for all
	scale levels, each is a 4D-tensor, has shape
	(batch_size, num_priors * 4, H, W).
	shape_preds (list[Tensor]): shape predictions for each scale
	level with shape (N, 1, H, W).
	loc_preds (list[Tensor]): location predictions for each scale
	level with shape (N, num_anchors * 2, H, W).
	batch_img_metas (list[dict], Optional): Batch image meta info.
	Defaults to None.
	cfg (ConfigDict, optional): Test / postprocessing
	configuration, if None, test_cfg would be used.
	Defaults to None.
	rescale (bool): If True, return boxes in original image space.
	Defaults to False.

	Returns:
	list[:obj:`InstanceData`]: Object detection results of each image
	after the post process. Each item usually contains following keys.

	- scores (Tensor): Classification scores, has a shape
	(num_instance, )
	- labels (Tensor): Labels of bboxes, has a shape (num_instances, ).
	- bboxes (Tensor): Has a shape (num_instances, 4), the last
	dimension 4 arrange as (x1, y1, x2, y2).
	"""
	assert len(cls_scores) == len(bbox_preds) == len(shape_preds) == len(
	loc_preds)
	num_levels = len(cls_scores)
	featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
	device = cls_scores[0].device
	# get guided anchors
	_, guided_anchors, loc_masks = self.get_anchors(
	featmap_sizes,
	shape_preds,
	loc_preds,
	batch_img_metas,
	use_loc_filter=not self.training,
	device=device)
	result_list = []
	for img_id in range(len(batch_img_metas)):
	cls_score_list = [
	cls_scores[i][img_id].detach() for i in range(num_levels)
	]
	bbox_pred_list = [
	bbox_preds[i][img_id].detach() for i in range(num_levels)
	]
	guided_anchor_list = [
	guided_anchors[img_id][i].detach() for i in range(num_levels)
	]
	loc_mask_list = [
	loc_masks[img_id][i].detach() for i in range(num_levels)
	]
	proposals = self._predict_by_feat_single(
	cls_scores=cls_score_list,
	bbox_preds=bbox_pred_list,
	mlvl_anchors=guided_anchor_list,
	mlvl_masks=loc_mask_list,
	img_meta=batch_img_metas[img_id],
	cfg=cfg,
	rescale=rescale)
	result_list.append(proposals)
	return result_list

	def _predict_by_feat_single(self,
	cls_scores: List[Tensor],
	bbox_preds: List[Tensor],
	mlvl_anchors: List[Tensor],
	mlvl_masks: List[Tensor],
	img_meta: dict,
	cfg: ConfigType,
	rescale: bool = False) -> InstanceData:
	"""Transform a single image's features extracted from the head into
	bbox results.

	Args:
	cls_scores (list[Tensor]): Box scores from all scale
	levels of a single image, each item has shape
	(num_priors * num_classes, H, W).
	bbox_preds (list[Tensor]): Box energies / deltas from
	all scale levels of a single image, each item has shape
	(num_priors * 4, H, W).
	mlvl_anchors (list[Tensor]): Each element in the list is
	the anchors of a single level in feature pyramid. it has
	shape (num_priors, 4).
	mlvl_masks (list[Tensor]): Each element in the list is location
	masks of a single level.
	img_meta (dict): Image meta info.
	cfg (:obj:`ConfigDict` or dict): Test / postprocessing
	configuration, if None, test_cfg would be used.
	rescale (bool): If True, return boxes in original image space.
	Defaults to False.

	Returns:
	:obj:`InstanceData`: Detection results of each image
	after the post process.
	Each item usually contains following keys.

	- scores (Tensor): Classification scores, has a shape
	(num_instance, )
	- labels (Tensor): Labels of bboxes, has a shape (num_instances, ).
	- bboxes (Tensor): Has a shape (num_instances, 4), the last
	dimension 4 arrange as (x1, y1, x2, y2).
	"""
	cfg = self.test_cfg if cfg is None else cfg
	assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
	mlvl_bbox_preds = []
	mlvl_valid_anchors = []
	mlvl_scores = []
	for cls_score, bbox_pred, anchors, mask in zip(cls_scores, bbox_preds,
	mlvl_anchors,
	mlvl_masks):
	assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
	# if no location is kept, end.
	if mask.sum() == 0:
	continue
	# reshape scores and bbox_pred
	cls_score = cls_score.permute(1, 2,
	0).reshape(-1, self.cls_out_channels)
	if self.use_sigmoid_cls:
	scores = cls_score.sigmoid()
	else:
	scores = cls_score.softmax(-1)
	bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
	# filter scores, bbox_pred w.r.t. mask.
	# anchors are filtered in get_anchors() beforehand.
	scores = scores[mask, :]
	bbox_pred = bbox_pred[mask, :]
	if scores.dim() == 0:
	anchors = anchors.unsqueeze(0)
	scores = scores.unsqueeze(0)
	bbox_pred = bbox_pred.unsqueeze(0)
	# filter anchors, bbox_pred, scores w.r.t. scores
	nms_pre = cfg.get('nms_pre', -1)
	if nms_pre > 0 and scores.shape[0] > nms_pre:
	if self.use_sigmoid_cls:
	max_scores, _ = scores.max(dim=1)
	else:
	# remind that we set FG labels to [0, num_class-1]
	# since mmdet v2.0
	# BG cat_id: num_class
	max_scores, _ = scores[:, :-1].max(dim=1)
	_, topk_inds = max_scores.topk(nms_pre)
	anchors = anchors[topk_inds, :]
	bbox_pred = bbox_pred[topk_inds, :]
	scores = scores[topk_inds, :]

	mlvl_bbox_preds.append(bbox_pred)
	mlvl_valid_anchors.append(anchors)
	mlvl_scores.append(scores)

	mlvl_bbox_preds = torch.cat(mlvl_bbox_preds)
	mlvl_anchors = torch.cat(mlvl_valid_anchors)
	mlvl_scores = torch.cat(mlvl_scores)
	mlvl_bboxes = self.bbox_coder.decode(
	mlvl_anchors, mlvl_bbox_preds, max_shape=img_meta['img_shape'])

	if rescale:
	assert img_meta.get('scale_factor') is not None
	mlvl_bboxes /= mlvl_bboxes.new_tensor(
	img_meta['scale_factor']).repeat((1, 2))

	if self.use_sigmoid_cls:
	# Add a dummy background class to the backend when using sigmoid
	# remind that we set FG labels to [0, num_class-1] since mmdet v2.0
	# BG cat_id: num_class
	padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
	mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
	# multi class NMS
	det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores,
	cfg.score_thr, cfg.nms,
	cfg.max_per_img)

	results = InstanceData()
	results.bboxes = det_bboxes[:, :-1]
	results.scores = det_bboxes[:, -1]
	results.labels = det_labels
	return results