Spaces:

scfive
/

samewind

Build error

samewind / mmdet /models /task_modules /coders /delta_xywh_bbox_coder.py

scfive

Resolve README.md conflict and continue rebase

e8f2571 12 days ago

17.4 kB

	# Copyright (c) OpenMMLab. All rights reserved.
	import warnings
	from typing import Optional, Sequence, Union

	import numpy as np
	import torch
	from torch import Tensor

	from mmdet.registry import TASK_UTILS
	from mmdet.structures.bbox import BaseBoxes, HorizontalBoxes, get_box_tensor
	from .base_bbox_coder import BaseBBoxCoder


	@TASK_UTILS.register_module()
	class DeltaXYWHBBoxCoder(BaseBBoxCoder):
	    """Delta XYWH BBox coder.

	    Following the practice in `R-CNN <https://arxiv.org/abs/1311.2524>`_,
	    this coder encodes bbox (x1, y1, x2, y2) into delta (dx, dy, dw, dh) and
	    decodes delta (dx, dy, dw, dh) back to original bbox (x1, y1, x2, y2).

	    Args:
	        target_means (Sequence[float]): Denormalizing means of target for
	            delta coordinates.
	        target_stds (Sequence[float]): Denormalizing standard deviation of
	            target for delta coordinates.
	        clip_border (bool, optional): Whether to clip the objects outside
	            the border of the image. Defaults to True.
	        add_ctr_clamp (bool): Whether to add center clamp. When added, the
	            predicted box is clamped if its center is too far away from
	            the original anchor's center. Only used by YOLOF.
	            Defaults to False.
	        ctr_clamp (int): The maximum pixel shift to clamp. Only used by
	            YOLOF. Defaults to 32.
	        **kwargs: Forwarded unchanged to the base class constructor.
	    """

	    def __init__(self,
	                 target_means: Sequence[float] = (0., 0., 0., 0.),
	                 target_stds: Sequence[float] = (1., 1., 1., 1.),
	                 clip_border: bool = True,
	                 add_ctr_clamp: bool = False,
	                 ctr_clamp: int = 32,
	                 **kwargs) -> None:
	        super().__init__(**kwargs)
	        self.means = target_means
	        self.stds = target_stds
	        self.clip_border = clip_border
	        self.add_ctr_clamp = add_ctr_clamp
	        self.ctr_clamp = ctr_clamp

	    def encode(self, bboxes: Union[Tensor, BaseBoxes],
	               gt_bboxes: Union[Tensor, BaseBoxes]) -> Tensor:
	        """Get box regression transformation deltas that can be used to
	        transform the ``bboxes`` into the ``gt_bboxes``.

	        Args:
	            bboxes (torch.Tensor or :obj:`BaseBoxes`): Source boxes,
	                e.g., object proposals.
	            gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): Target of the
	                transformation, e.g., ground-truth boxes.

	        Returns:
	            torch.Tensor: Box transformation deltas.
	        """
	        bboxes = get_box_tensor(bboxes)
	        gt_bboxes = get_box_tensor(gt_bboxes)
	        # Both inputs must describe the same number of 4-coordinate boxes.
	        assert bboxes.size(0) == gt_bboxes.size(0)
	        assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
	        return bbox2delta(bboxes, gt_bboxes, self.means, self.stds)

	    def decode(
	        self,
	        bboxes: Union[Tensor, BaseBoxes],
	        pred_bboxes: Tensor,
	        max_shape: Optional[Union[Sequence[int], Tensor,
	                                  Sequence[Sequence[int]]]] = None,
	        wh_ratio_clip: Optional[float] = 16 / 1000
	    ) -> Union[Tensor, BaseBoxes]:
	        """Apply transformation `pred_bboxes` to `bboxes`.

	        Args:
	            bboxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes. Shape
	                (B, N, 4) or (N, 4).
	            pred_bboxes (Tensor): Encoded offsets with respect to each roi.
	                Has shape (B, N, num_classes * 4) or (B, N, 4) or
	                (N, num_classes * 4) or (N, 4). Note N = num_anchors * W * H
	                when rois is a grid of anchors. Offset encoding follows
	                [1]_.
	            max_shape (Sequence[int] or torch.Tensor or Sequence[
	                Sequence[int]], optional): Maximum bounds for boxes,
	                specifies (H, W, C) or (H, W). If bboxes shape is
	                (B, N, 4), then the max_shape should be a
	                Sequence[Sequence[int]] and the length of max_shape should
	                also be B.
	            wh_ratio_clip (float, optional): The allowed ratio between
	                width and height.

	        Returns:
	            Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes.
	        """
	        bboxes = get_box_tensor(bboxes)
	        assert pred_bboxes.size(0) == bboxes.size(0)
	        if pred_bboxes.ndim == 3:
	            assert pred_bboxes.size(1) == bboxes.size(1)

	        exporting = torch.onnx.is_in_onnx_export()
	        if pred_bboxes.ndim == 2 and not exporting:
	            # single image decode
	            decoded_bboxes = delta2bbox(bboxes, pred_bboxes, self.means,
	                                        self.stds, max_shape, wh_ratio_clip,
	                                        self.clip_border, self.add_ctr_clamp,
	                                        self.ctr_clamp)
	        else:
	            if pred_bboxes.ndim == 3 and not exporting:
	                warnings.warn(
	                    'DeprecationWarning: onnx_delta2bbox is deprecated '
	                    'in the case of batch decoding and non-ONNX, '
	                    'please use “delta2bbox” instead. In order to improve '
	                    'the decoding speed, the batch function will no '
	                    'longer be supported. ')
	            decoded_bboxes = onnx_delta2bbox(bboxes, pred_bboxes, self.means,
	                                             self.stds, max_shape,
	                                             wh_ratio_clip, self.clip_border,
	                                             self.add_ctr_clamp,
	                                             self.ctr_clamp)

	        # `use_box_type` is not set in this class; presumably it is provided
	        # by the base coder -- confirm against BaseBBoxCoder.
	        if self.use_box_type:
	            # The trailing space keeps the two literals from fusing into
	            # "boxeshave" when Python concatenates them.
	            assert decoded_bboxes.size(-1) == 4, \
	                ('Cannot wrap decoded boxes with box type when decoded boxes '
	                 'have shape of (N, num_classes * 4)')
	            decoded_bboxes = HorizontalBoxes(decoded_bboxes)
	        return decoded_bboxes


	def bbox2delta(
	    proposals: Tensor,
	    gt: Tensor,
	    means: Sequence[float] = (0., 0., 0., 0.),
	    stds: Sequence[float] = (1., 1., 1., 1.)
	) -> Tensor:
	    """Encode ground-truth boxes as normalized deltas relative to proposals.

	    For every proposal/gt pair the center offset is expressed in units of
	    the proposal's width/height and the size change as a log-ratio. The
	    result is then shifted by ``means`` and scaled by ``stds``.
	    This is the inverse function of :func:`delta2bbox`.

	    Args:
	        proposals (Tensor): Boxes to be transformed, shape (N, ..., 4).
	        gt (Tensor): Gt bboxes to be used as base, shape (N, ..., 4).
	        means (Sequence[float]): Denormalizing means for delta coordinates.
	        stds (Sequence[float]): Denormalizing standard deviation for delta
	            coordinates.

	    Returns:
	        Tensor: Deltas with the same leading shape as the inputs and a last
	        dimension of 4 holding dx, dy, dw, dh.
	    """
	    assert proposals.size() == gt.size()

	    def _center_and_size(boxes: Tensor):
	        # Split corner-format boxes into center (cx, cy) and extent (w, h).
	        left, top = boxes[..., 0], boxes[..., 1]
	        right, bottom = boxes[..., 2], boxes[..., 3]
	        return ((left + right) * 0.5, (top + bottom) * 0.5,
	                right - left, bottom - top)

	    src_cx, src_cy, src_w, src_h = _center_and_size(proposals.float())
	    dst_cx, dst_cy, dst_w, dst_h = _center_and_size(gt.float())

	    components = [
	        (dst_cx - src_cx) / src_w,
	        (dst_cy - src_cy) / src_h,
	        torch.log(dst_w / src_w),
	        torch.log(dst_h / src_h),
	    ]
	    deltas = torch.stack(components, dim=-1)

	    # Normalize in place; `deltas` is a freshly stacked tensor, so no
	    # caller-owned storage is modified.
	    shift = deltas.new_tensor(means).unsqueeze(0)
	    scale = deltas.new_tensor(stds).unsqueeze(0)
	    deltas -= shift
	    deltas /= scale

	    return deltas


	def delta2bbox(rois: Tensor,
	               deltas: Tensor,
	               means: Sequence[float] = (0., 0., 0., 0.),
	               stds: Sequence[float] = (1., 1., 1., 1.),
	               max_shape: Optional[Union[Sequence[int], Tensor,
	                                         Sequence[Sequence[int]]]] = None,
	               wh_ratio_clip: float = 16 / 1000,
	               clip_border: bool = True,
	               add_ctr_clamp: bool = False,
	               ctr_clamp: int = 32) -> Tensor:
	    """Decode deltas into corner-format boxes relative to base boxes.

	    The deltas are first de-normalized with ``stds`` and ``means``, then
	    used to move the center and rescale the width/height of each roi.
	    This is the inverse function of :func:`bbox2delta`.

	    Args:
	        rois (Tensor): Boxes to be transformed. Has shape (N, 4).
	        deltas (Tensor): Encoded offsets relative to each roi.
	            Has shape (N, num_classes * 4) or (N, 4). Note
	            N = num_base_anchors * W * H, when rois is a grid of
	            anchors. Offset encoding follows [1]_.
	        means (Sequence[float]): Denormalizing means for delta coordinates.
	            Default (0., 0., 0., 0.).
	        stds (Sequence[float]): Denormalizing standard deviation for delta
	            coordinates. Default (1., 1., 1., 1.).
	        max_shape (tuple[int, int]): Maximum bounds for boxes, specifies
	            (H, W). Only the first two entries are read. Default None.
	        wh_ratio_clip (float): Bound on the size change; the log-scale
	            deltas are limited to ``abs(log(wh_ratio_clip))``. Default
	            16 / 1000.
	        clip_border (bool, optional): Whether to clip the boxes to
	            ``max_shape``. Has no effect when ``max_shape`` is None.
	            Default True.
	        add_ctr_clamp (bool): Whether to add center clamp. When set to True,
	            the center shift is limited to ``ctr_clamp`` pixels in each
	            direction and the log-scale deltas are only bounded from
	            above. Only used by YOLOF. Default False.
	        ctr_clamp (int): The maximum pixel shift to clamp. Only used by
	            YOLOF. Default 32.

	    Returns:
	        Tensor: Boxes with shape (N, num_classes * 4) or (N, 4), where 4
	        represent tl_x, tl_y, br_x, br_y. When ``deltas`` has zero rows it
	        is returned unchanged.

	    References:
	        .. [1] https://arxiv.org/abs/1311.2524

	    Example:
	        >>> rois = torch.Tensor([[ 0.,  0.,  1.,  1.],
	        >>>                      [ 0.,  0.,  1.,  1.],
	        >>>                      [ 0.,  0.,  1.,  1.],
	        >>>                      [ 5.,  5.,  5.,  5.]])
	        >>> deltas = torch.Tensor([[  0.,   0.,   0.,   0.],
	        >>>                        [  1.,   1.,   1.,   1.],
	        >>>                        [  0.,   0.,   2.,  -1.],
	        >>>                        [ 0.7, -1.9, -0.5,  0.3]])
	        >>> delta2bbox(rois, deltas, max_shape=(32, 32, 3))
	        tensor([[0.0000, 0.0000, 1.0000, 1.0000],
	                [0.1409, 0.1409, 2.8591, 2.8591],
	                [0.0000, 0.3161, 4.1945, 0.6839],
	                [5.0000, 5.0000, 5.0000, 5.0000]])
	    """
	    box_count = deltas.size(0)
	    class_count = deltas.size(1) // 4
	    if box_count == 0:
	        return deltas

	    # One row per (box, class) pair.
	    flat_deltas = deltas.reshape(-1, 4)

	    mean_row = flat_deltas.new_tensor(means).view(1, -1)
	    std_row = flat_deltas.new_tensor(stds).view(1, -1)
	    raw = flat_deltas * std_row + mean_row

	    ctr_offset = raw[:, :2]
	    log_scale = raw[:, 2:]

	    # Tile each roi once per class so it lines up with `flat_deltas`.
	    tiled_rois = rois.repeat(1, class_count).reshape(-1, 4)
	    top_left = tiled_rois[:, :2]
	    bottom_right = tiled_rois[:, 2:]
	    roi_ctr = (top_left + bottom_right) * 0.5
	    roi_size = bottom_right - top_left

	    ctr_shift = roi_size * ctr_offset

	    log_limit = np.abs(np.log(wh_ratio_clip))
	    if not add_ctr_clamp:
	        log_scale = log_scale.clamp(min=-log_limit, max=log_limit)
	    else:
	        ctr_shift = torch.clamp(ctr_shift, max=ctr_clamp, min=-ctr_clamp)
	        log_scale = torch.clamp(log_scale, max=log_limit)

	    new_ctr = roi_ctr + ctr_shift
	    half_extent = roi_size * log_scale.exp() * 0.5
	    decoded = torch.cat([new_ctr - half_extent, new_ctr + half_extent],
	                        dim=-1)

	    if clip_border and max_shape is not None:
	        # x coordinates are bounded by W (index 1), y by H (index 0).
	        decoded[..., 0::2].clamp_(min=0, max=max_shape[1])
	        decoded[..., 1::2].clamp_(min=0, max=max_shape[0])

	    return decoded.reshape(box_count, -1)


	def onnx_delta2bbox(rois: Tensor,
	                    deltas: Tensor,
	                    means: Sequence[float] = (0., 0., 0., 0.),
	                    stds: Sequence[float] = (1., 1., 1., 1.),
	                    max_shape: Optional[Union[Sequence[int], Tensor,
	                                              Sequence[Sequence[int]]]] = None,
	                    wh_ratio_clip: float = 16 / 1000,
	                    clip_border: Optional[bool] = True,
	                    add_ctr_clamp: bool = False,
	                    ctr_clamp: int = 32) -> Tensor:
	    """Decode deltas into corner-format boxes, with batch/ONNX support.

	    Works on the trailing dimension only, so ``rois`` and ``deltas`` may
	    carry a leading batch dimension. The deltas are de-normalized with
	    ``stds`` and ``means`` and then used to move the center and rescale
	    the width/height of each roi.
	    This is the inverse function of :func:`bbox2delta`.

	    Args:
	        rois (Tensor): Boxes to be transformed. Has shape (N, 4) or
	            (B, N, 4).
	        deltas (Tensor): Encoded offsets with respect to each roi.
	            Has shape (B, N, num_classes * 4) or (B, N, 4) or
	            (N, num_classes * 4) or (N, 4). Note N = num_anchors * W * H
	            when rois is a grid of anchors. Offset encoding follows [1]_.
	        means (Sequence[float]): Denormalizing means for delta coordinates.
	            Default (0., 0., 0., 0.).
	        stds (Sequence[float]): Denormalizing standard deviation for delta
	            coordinates. Default (1., 1., 1., 1.).
	        max_shape (Sequence[int] or torch.Tensor or Sequence[
	            Sequence[int]], optional): Maximum bounds for boxes, specifies
	            (H, W, C) or (H, W). If rois shape is (B, N, 4), then
	            the max_shape should be a Sequence[Sequence[int]]
	            and the length of max_shape should also be B. Default None.
	        wh_ratio_clip (float): Bound on the size change; the log-scale
	            deltas are limited to ``abs(log(wh_ratio_clip))``.
	            Default 16 / 1000.
	        clip_border (bool, optional): Whether to clip the boxes to
	            ``max_shape``. Has no effect when ``max_shape`` is None.
	            Default True.
	        add_ctr_clamp (bool): Whether to add center clamp. When added, the
	            center shift is limited to ``ctr_clamp`` pixels in each
	            direction and the log-scale deltas are only bounded from
	            above. Only used by YOLOF. Default False.
	        ctr_clamp (int): The maximum pixel shift to clamp. Only used by
	            YOLOF. Default 32.

	    Returns:
	        Tensor: Boxes with shape (B, N, num_classes * 4) or (B, N, 4) or
	        (N, num_classes * 4) or (N, 4), where 4 represent
	        tl_x, tl_y, br_x, br_y.

	    References:
	        .. [1] https://arxiv.org/abs/1311.2524

	    Example:
	        >>> rois = torch.Tensor([[ 0.,  0.,  1.,  1.],
	        >>>                      [ 0.,  0.,  1.,  1.],
	        >>>                      [ 0.,  0.,  1.,  1.],
	        >>>                      [ 5.,  5.,  5.,  5.]])
	        >>> deltas = torch.Tensor([[  0.,   0.,   0.,   0.],
	        >>>                        [  1.,   1.,   1.,   1.],
	        >>>                        [  0.,   0.,   2.,  -1.],
	        >>>                        [ 0.7, -1.9, -0.5,  0.3]])
	        >>> onnx_delta2bbox(rois, deltas, max_shape=(32, 32, 3))
	        tensor([[0.0000, 0.0000, 1.0000, 1.0000],
	                [0.1409, 0.1409, 2.8591, 2.8591],
	                [0.0000, 0.3161, 4.1945, 0.6839],
	                [5.0000, 5.0000, 5.0000, 5.0000]])
	    """
	    # Tile the 4-element mean/std so they cover every class slot.
	    mean_row = deltas.new_tensor(means).view(1, -1)
	    mean_row = mean_row.repeat(1, deltas.size(-1) // 4)
	    std_row = deltas.new_tensor(stds).view(1, -1)
	    std_row = std_row.repeat(1, deltas.size(-1) // 4)
	    raw = deltas * std_row + mean_row

	    # Every 4th channel belongs to the same delta component.
	    shift_x = raw[..., 0::4]
	    shift_y = raw[..., 1::4]
	    log_w = raw[..., 2::4]
	    log_h = raw[..., 3::4]

	    left, top = rois[..., 0], rois[..., 1]
	    right, bottom = rois[..., 2], rois[..., 3]
	    # Center of each roi, broadcast over the class dimension.
	    ctr_x = ((left + right) * 0.5).unsqueeze(-1).expand_as(shift_x)
	    ctr_y = ((top + bottom) * 0.5).unsqueeze(-1).expand_as(shift_y)
	    # Width/height of each roi, broadcast over the class dimension.
	    roi_w = (right - left).unsqueeze(-1).expand_as(log_w)
	    roi_h = (bottom - top).unsqueeze(-1).expand_as(log_h)

	    move_x = roi_w * shift_x
	    move_y = roi_h * shift_y

	    log_limit = np.abs(np.log(wh_ratio_clip))
	    if not add_ctr_clamp:
	        log_w = log_w.clamp(min=-log_limit, max=log_limit)
	        log_h = log_h.clamp(min=-log_limit, max=log_limit)
	    else:
	        move_x = torch.clamp(move_x, max=ctr_clamp, min=-ctr_clamp)
	        move_y = torch.clamp(move_y, max=ctr_clamp, min=-ctr_clamp)
	        log_w = torch.clamp(log_w, max=log_limit)
	        log_h = torch.clamp(log_h, max=log_limit)

	    # Rescale the roi extents with the exponentiated size deltas.
	    new_w = roi_w * log_w.exp()
	    new_h = roi_h * log_h.exp()
	    # Shift the roi centers.
	    new_cx = ctr_x + move_x
	    new_cy = ctr_y + move_y
	    # Center/size back to top-left, bottom-right corners.
	    left = new_cx - new_w * 0.5
	    top = new_cy - new_h * 0.5
	    right = new_cx + new_w * 0.5
	    bottom = new_cy + new_h * 0.5

	    decoded = torch.stack([left, top, right, bottom],
	                          dim=-1).view(deltas.size())

	    if not clip_border or max_shape is None:
	        return decoded

	    if torch.onnx.is_in_onnx_export():
	        # Clip with dynamic `min` and `max` for onnx.
	        # NOTE(review): this lazy import targets `mmdet.core.export`;
	        # confirm that path exists in the installed mmdet version.
	        from mmdet.core.export import dynamic_clip_for_onnx
	        left, top, right, bottom = dynamic_clip_for_onnx(
	            left, top, right, bottom, max_shape)
	        return torch.stack([left, top, right, bottom],
	                           dim=-1).view(deltas.size())

	    if not isinstance(max_shape, torch.Tensor):
	        max_shape = left.new_tensor(max_shape)
	    # Keep only (H, W); a trailing channel count is dropped.
	    max_shape = max_shape[..., :2].type_as(left)
	    if max_shape.ndim == 2:
	        # Per-image bounds require batched boxes of matching batch size.
	        assert decoded.ndim == 3
	        assert max_shape.size(0) == decoded.size(0)

	    lower = left.new_tensor(0)
	    # Repeat (H, W) across the last dim, then flip so the bounds line up
	    # with the (x, y, x, y, ...) coordinate layout as (W, H, W, H, ...).
	    upper = torch.cat(
	        [max_shape] * (deltas.size(-1) // 2),
	        dim=-1).flip(-1).unsqueeze(-2)
	    decoded = torch.where(decoded < lower, lower, decoded)
	    decoded = torch.where(decoded > upper, upper, decoded)

	    return decoded