pose_demo_01

Sleeping

App Files Files Community

pose_demo_01 / easy_ViTPose /vit_models /backbone /vit.py

Maksym-Lysyi

initial commit

e3641b1 11 months ago

raw

history blame contribute delete

14.8 kB


	import math
	import warnings

	from itertools import repeat
	import collections.abc

	import torch
	from functools import partial
	import torch.nn as nn
	import torch.nn.functional as F
	import torch.utils.checkpoint as checkpoint
	from torch import Tensor

	# from timm.models.layers import drop_path, to_2tuple, trunc_normal_

	# from .base_backbone import BaseBackbone

	def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
	    """Stochastic depth: randomly zero whole samples of ``x`` along dim 0.

	    One Bernoulli draw is made per index of the first dimension and broadcast
	    over all remaining dimensions, so a sample is either kept entirely or
	    dropped entirely.

	    Args:
	        x: input tensor; its first dimension is treated as the per-sample axis.
	        drop_prob: probability of zeroing a sample.
	        training: when False the input is returned untouched.
	        scale_by_keep: when True (and the keep probability is positive) the
	            surviving samples are divided by the keep probability.

	    Returns:
	        ``x`` itself when disabled, otherwise ``x`` multiplied by the mask.
	    """
	    # Nothing to do outside training or when no dropping is requested.
	    if not training or drop_prob == 0.:
	        return x

	    survive_prob = 1 - drop_prob
	    # Shape (N, 1, 1, ...) so the mask broadcasts over tensors of any rank.
	    mask_shape = (x.shape[0], *([1] * (x.ndim - 1)))
	    mask = x.new_empty(mask_shape).bernoulli_(survive_prob)
	    if scale_by_keep and survive_prob > 0.0:
	        mask.div_(survive_prob)
	    return x * mask

	def _ntuple(n):
	    """Build a converter that normalizes its argument to a tuple.

	    The returned ``parse(x)`` behaves as follows:

	    * a non-string iterable is materialized with ``tuple(x)`` (its length is
	      NOT checked against ``n``);
	    * anything else, including ``str``, is repeated ``n`` times.

	    Args:
	        n: number of repetitions used for scalar inputs.

	    Returns:
	        A one-argument callable producing a tuple.
	    """
	    def parse(x):
	        # Strings are iterable but are treated as scalars to be repeated.
	        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
	            # Always hand back a real tuple so lists and one-shot iterators
	            # become indexable and immutable, as the helper's name promises.
	            return tuple(x)
	        return tuple(repeat(x, n))
	    return parse


	# Ready-made converters for the common arities, plus the factory itself.
	to_1tuple = _ntuple(1)
	to_2tuple = _ntuple(2)
	to_3tuple = _ntuple(3)
	to_4tuple = _ntuple(4)
	to_ntuple = _ntuple

	def _trunc_normal_(tensor, mean, std, a, b):
	    """Fill ``tensor`` in place with truncated-normal samples (no autograd guard).

	    Sampling uses the inverse-CDF method: draw uniformly between the CDF
	    values of the two cutoffs, map through ``erfinv``, then shift/scale and
	    clamp to ``[a, b]``.
	    Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
	    """
	    sqrt_two = math.sqrt(2.)

	    def std_normal_cdf(z):
	        # Cumulative distribution function of the standard normal.
	        return (1. + math.erf(z / sqrt_two)) / 2.

	    mean_too_low = mean < a - 2 * std
	    mean_too_high = mean > b + 2 * std
	    if mean_too_low or mean_too_high:
	        # stacklevel=2 attributes the warning to this helper's caller.
	        warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
	                      "The distribution of values may be incorrect.",
	                      stacklevel=2)

	    # CDF values of the lower and upper cutoffs.
	    cdf_lo = std_normal_cdf((a - mean) / std)
	    cdf_hi = std_normal_cdf((b - mean) / std)

	    # Uniform in [2*lo - 1, 2*hi - 1] -> erfinv gives a truncated standard
	    # normal (up to sqrt(2)) -> rescale to (mean, std) -> clamp for safety.
	    tensor.uniform_(2 * cdf_lo - 1, 2 * cdf_hi - 1)
	    tensor.erfinv_().mul_(std * sqrt_two).add_(mean)
	    tensor.clamp_(min=a, max=b)
	    return tensor


	def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
	    # type: (Tensor, float, float, float, float) -> Tensor
	    r"""Fill ``tensor`` in place from a normal distribution truncated to ``[a, b]``.

	    Values follow :math:`\mathcal{N}(\text{mean}, \text{std}^2)` restricted to
	    :math:`[a, b]`. The sampler works best when
	    :math:`a \leq \text{mean} \leq b`; a warning is emitted when the mean lies
	    more than two standard deviations outside the interval.

	    NOTE: the bounds ``a`` and ``b`` are applied to the already shifted and
	    scaled distribution, so they must be expressed in the same units as
	    ``mean`` and ``std``.

	    Args:
	        tensor: an n-dimensional `torch.Tensor`, modified in place
	        mean: the mean of the normal distribution
	        std: the standard deviation of the normal distribution
	        a: the minimum cutoff value
	        b: the maximum cutoff value

	    Returns:
	        The same ``tensor`` object that was passed in.

	    Examples:
	        >>> w = torch.empty(3, 5)
	        >>> trunc_normal_(w)
	    """
	    # Initialization must not be recorded by autograd.
	    with torch.no_grad():
	        filled = _trunc_normal_(tensor, mean, std, a, b)
	    return filled

	class DropPath(nn.Module):
	    """Module wrapper around ``drop_path`` (stochastic depth per sample).

	    Args:
	        drop_prob: probability of dropping a sample's residual branch.
	            ``None`` (the default) or ``0`` disables dropping entirely.
	    """
	    def __init__(self, drop_prob=None):
	        super(DropPath, self).__init__()
	        self.drop_prob = drop_prob

	    def forward(self, x):
	        # Identity when no probability is configured or when evaluating.
	        # Guarding here matters: the default ``drop_prob=None`` would
	        # otherwise reach ``1 - None`` inside drop_path in training mode.
	        if not self.drop_prob or not self.training:
	            return x
	        return drop_path(x, self.drop_prob, self.training)

	    def extra_repr(self):
	        # Shown inside the module's repr, e.g. ``DropPath(p=0.1)``.
	        return 'p={}'.format(self.drop_prob)

	class Mlp(nn.Module):
	    """Feed-forward sub-network: Linear -> activation -> Linear -> Dropout.

	    Args:
	        in_features: size of the last dimension of the input.
	        hidden_features: width of the hidden layer; falls back to
	            ``in_features`` when falsy.
	        out_features: size of the output; falls back to ``in_features``
	            when falsy.
	        act_layer: zero-argument factory for the activation module.
	        drop: dropout probability applied to the final output only.
	    """

	    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
	        super().__init__()
	        if not out_features:
	            out_features = in_features
	        if not hidden_features:
	            hidden_features = in_features

	        self.fc1 = nn.Linear(in_features, hidden_features)
	        self.act = act_layer()
	        self.fc2 = nn.Linear(hidden_features, out_features)
	        self.drop = nn.Dropout(drop)

	    def forward(self, x):
	        hidden = self.act(self.fc1(x))
	        # Dropout is applied once, after the second projection.
	        return self.drop(self.fc2(hidden))

	class Attention(nn.Module):
	    """Multi-head self-attention with a fused q/k/v projection.

	    Args:
	        dim: size of the last dimension of the input and of the output.
	        num_heads: number of attention heads.
	        qkv_bias: whether the fused q/k/v projection has a bias term.
	        qk_scale: multiplier applied to the queries; when falsy,
	            ``head_dim ** -0.5`` is used.
	        attn_drop: dropout probability on the attention weights.
	        proj_drop: dropout probability on the projected output.
	        attn_head_dim: per-head width; overrides ``dim // num_heads`` when
	            given.
	    """

	    def __init__(
	            self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
	            proj_drop=0., attn_head_dim=None,):
	        super().__init__()
	        self.num_heads = num_heads
	        self.dim = dim

	        per_head = dim // num_heads
	        if attn_head_dim is not None:
	            per_head = attn_head_dim
	        inner_dim = per_head * self.num_heads

	        self.scale = qk_scale if qk_scale else per_head ** -0.5

	        # A single linear layer produces queries, keys and values at once.
	        self.qkv = nn.Linear(dim, inner_dim * 3, bias=qkv_bias)

	        self.attn_drop = nn.Dropout(attn_drop)
	        self.proj = nn.Linear(inner_dim, dim)
	        self.proj_drop = nn.Dropout(proj_drop)

	    def forward(self, x):
	        batch, tokens, _ = x.shape

	        # (B, N, 3*H*D) -> (3, B, H, N, D)
	        packed = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, -1)
	        packed = packed.permute(2, 0, 3, 1, 4)
	        # Explicit indexing instead of tuple-unpacking the tensor keeps
	        # TorchScript happy.
	        q = packed[0]
	        k = packed[1]
	        v = packed[2]

	        # Scaled dot-product scores, normalized over the key axis.
	        scores = (q * self.scale) @ k.transpose(-2, -1)
	        weights = self.attn_drop(scores.softmax(dim=-1))

	        # (B, H, N, D) -> (B, N, H*D), then project back to ``dim``.
	        merged = (weights @ v).transpose(1, 2).reshape(batch, tokens, -1)
	        return self.proj_drop(self.proj(merged))

	class Block(nn.Module):
	    """Transformer encoder layer with pre-normalization.

	    Computes ``x + drop_path(attn(norm1(x)))`` followed by
	    ``x + drop_path(mlp(norm2(x)))``.

	    Args:
	        dim: token embedding width.
	        num_heads: number of attention heads.
	        mlp_ratio: hidden width of the MLP as a multiple of ``dim``.
	        qkv_bias, qk_scale, attn_drop, attn_head_dim: forwarded to ``Attention``.
	        drop: dropout probability for the attention projection and the MLP.
	        drop_path: stochastic-depth probability; values <= 0 disable it.
	        act_layer: activation factory forwarded to ``Mlp``.
	        norm_layer: factory called with ``dim`` to build each norm layer.
	    """

	    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None,
	                 drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU,
	                 norm_layer=nn.LayerNorm, attn_head_dim=None
	                 ):
	        super().__init__()

	        attention_options = dict(
	            num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
	            attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim,
	        )
	        self.norm1 = norm_layer(dim)
	        self.attn = Attention(dim, **attention_options)

	        # The same stochastic-depth module is shared by both residual branches.
	        if drop_path > 0.:
	            self.drop_path = DropPath(drop_path)
	        else:
	            self.drop_path = nn.Identity()

	        self.norm2 = norm_layer(dim)
	        self.mlp = Mlp(
	            in_features=dim,
	            hidden_features=int(dim * mlp_ratio),
	            act_layer=act_layer,
	            drop=drop,
	        )

	    def forward(self, x):
	        attn_branch = self.attn(self.norm1(x))
	        x = x + self.drop_path(attn_branch)
	        mlp_branch = self.mlp(self.norm2(x))
	        return x + self.drop_path(mlp_branch)


	class PatchEmbed(nn.Module):
	    """Image to patch embedding via a single strided convolution.

	    Args:
	        img_size: nominal input size (int or pair), used only to derive the
	            stored patch-grid bookkeeping attributes.
	        patch_size: convolution kernel size (int or pair).
	        in_chans: number of input channels.
	        embed_dim: number of output channels (token width).
	        ratio: divides the stride and scales the stored patch grid.
	    """

	    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, ratio=1):
	        super().__init__()
	        img_size = to_2tuple(img_size)
	        patch_size = to_2tuple(patch_size)

	        # Patch-grid extent along dim 0 and dim 1 of the nominal image size.
	        grid_0 = img_size[0] // patch_size[0]
	        grid_1 = img_size[1] // patch_size[1]

	        self.patch_shape = (int(grid_0 * ratio), int(grid_1 * ratio))
	        self.origin_patch_shape = (int(grid_0), int(grid_1))
	        self.img_size = img_size
	        self.patch_size = patch_size
	        self.num_patches = grid_1 * grid_0 * (ratio ** 2)

	        # Stride is taken from the first patch dimension only; the padding
	        # evaluates to 2 for ratio=1 and 4 for ratio=2.
	        conv_stride = patch_size[0] // ratio
	        conv_padding = 4 + 2 * (ratio // 2 - 1)
	        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size,
	                              stride=conv_stride, padding=conv_padding)

	    def forward(self, x):
	        """Return ``(tokens, (Hp, Wp))`` with tokens shaped ``(B, Hp*Wp, C)``."""
	        feat = self.proj(x)
	        batch, channels, Hp, Wp = feat.shape
	        # Collapse the spatial grid into one token axis, channels last.
	        tokens = feat.view(batch, channels, Hp * Wp).transpose(1, 2)
	        return tokens, (Hp, Wp)


	class HybridEmbed(nn.Module):
	    """CNN feature-map embedding.

	    Runs a backbone, takes the last element of its output, flattens the
	    spatial dimensions and linearly projects the channels to ``embed_dim``.

	    Args:
	        backbone: ``nn.Module`` whose output is indexed with ``[-1]``.
	        img_size: input size (int or pair) used for the probing forward pass.
	        feature_size: spatial size of the backbone's last feature map. When
	            None it is measured by running the backbone on a zero image;
	            otherwise the channel count is read from
	            ``backbone.feature_info.channels()[-1]``.
	        in_chans: channel count of the probing image.
	        embed_dim: output width of the projection.
	    """

	    def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768):
	        super().__init__()
	        assert isinstance(backbone, nn.Module)
	        img_size = to_2tuple(img_size)
	        self.img_size = img_size
	        self.backbone = backbone

	        if feature_size is not None:
	            feature_size = to_2tuple(feature_size)
	            feature_dim = self.backbone.feature_info.channels()[-1]
	        else:
	            # Probe the backbone once, without gradients and in eval mode,
	            # then restore whatever mode it was in before.
	            with torch.no_grad():
	                was_training = backbone.training
	                if was_training:
	                    backbone.eval()
	                probe = torch.zeros(1, in_chans, img_size[0], img_size[1])
	                last_map = self.backbone(probe)[-1]
	                feature_size = last_map.shape[-2:]
	                feature_dim = last_map.shape[1]
	                backbone.train(was_training)

	        self.num_patches = feature_size[0] * feature_size[1]
	        self.proj = nn.Linear(feature_dim, embed_dim)

	    def forward(self, x):
	        """Return the projected tokens only (no patch-grid shape)."""
	        last_map = self.backbone(x)[-1]
	        tokens = last_map.flatten(2).transpose(1, 2)
	        return self.proj(tokens)


	class ViT(nn.Module):
	    """Vision Transformer backbone that returns a 2-D feature map.

	    The input image is embedded into patch tokens, a learned position
	    embedding is added, the tokens pass through ``depth`` transformer blocks
	    and an optional final norm, and the result is reshaped to
	    ``(B, embed_dim, Hp, Wp)``.

	    Args:
	        img_size, patch_size, in_chans, ratio: forwarded to ``PatchEmbed``.
	        num_classes: stored on the instance; not otherwise used here.
	        embed_dim: token width.
	        depth: number of transformer blocks.
	        num_heads, mlp_ratio, qkv_bias, qk_scale: forwarded to every block.
	        drop_rate, attn_drop_rate: dropout probabilities forwarded to blocks.
	        drop_path_rate: final stochastic-depth rate; block ``i`` gets the
	            ``i``-th value of a linear ramp from 0 to this rate.
	        hybrid_backbone: when given, ``HybridEmbed`` replaces ``PatchEmbed``.
	            NOTE(review): ``HybridEmbed.forward`` in this file returns a bare
	            tensor while ``forward`` below unpacks ``(tokens, (Hp, Wp))``, so
	            this path looks unusable as written - confirm before relying on it.
	        norm_layer: factory for norm layers; defaults to LayerNorm(eps=1e-6).
	        use_checkpoint: run each block through activation checkpointing
	            whenever gradients are enabled.
	        frozen_stages: ``>= 0`` freezes the patch embedding and
	            ``blocks[1 .. frozen_stages]``.
	        last_norm: apply a final norm layer (otherwise identity).
	        patch_padding: stored and passed on to a parent ``init_weights`` if
	            one exists.
	        freeze_attn: freeze every block's attention and first norm.
	        freeze_ffn: freeze the position embedding, the patch embedding and
	            every block's MLP and second norm.
	    """

	    def __init__(self,
	                 img_size=224, patch_size=16, in_chans=3, num_classes=80, embed_dim=768, depth=12,
	                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
	                 drop_path_rate=0., hybrid_backbone=None, norm_layer=None, use_checkpoint=False,
	                 frozen_stages=-1, ratio=1, last_norm=True,
	                 patch_padding='pad', freeze_attn=False, freeze_ffn=False,
	                 ):
	        super(ViT, self).__init__()

	        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
	        self.num_classes = num_classes
	        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
	        self.frozen_stages = frozen_stages
	        self.use_checkpoint = use_checkpoint
	        self.patch_padding = patch_padding
	        self.freeze_attn = freeze_attn
	        self.freeze_ffn = freeze_ffn
	        self.depth = depth

	        if hybrid_backbone is not None:
	            self.patch_embed = HybridEmbed(
	                hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim)
	        else:
	            self.patch_embed = PatchEmbed(
	                img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, ratio=ratio)
	        num_patches = self.patch_embed.num_patches

	        # One extra slot is kept because the pretraining model has a class token.
	        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))

	        # Stochastic depth decay rule: linear ramp over the blocks.
	        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]

	        self.blocks = nn.ModuleList([
	            Block(
	                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
	                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
	            )
	            for i in range(depth)])

	        self.last_norm = norm_layer(embed_dim) if last_norm else nn.Identity()

	        if self.pos_embed is not None:
	            trunc_normal_(self.pos_embed, std=.02)

	        self._freeze_stages()

	    @staticmethod
	    def _freeze_module(module):
	        """Switch ``module`` to eval mode and disable gradients for its parameters."""
	        module.eval()
	        for param in module.parameters():
	            param.requires_grad = False

	    def _freeze_stages(self):
	        """Freeze parameters according to the ``frozen_stages``/``freeze_*`` flags."""
	        if self.frozen_stages >= 0:
	            self._freeze_module(self.patch_embed)

	        # NOTE(review): indices start at 1, so blocks[0] is never frozen by
	        # ``frozen_stages`` and ``frozen_stages == depth`` indexes past the
	        # end of ``blocks``. Kept as-is to preserve existing behavior -
	        # confirm whether blocks[i - 1] was intended.
	        for i in range(1, self.frozen_stages + 1):
	            self._freeze_module(self.blocks[i])

	        if self.freeze_attn:
	            for blk in self.blocks[:self.depth]:
	                self._freeze_module(blk.attn)
	                self._freeze_module(blk.norm1)

	        if self.freeze_ffn:
	            self.pos_embed.requires_grad = False
	            self._freeze_module(self.patch_embed)
	            for blk in self.blocks[:self.depth]:
	                self._freeze_module(blk.mlp)
	                self._freeze_module(blk.norm2)

	    def init_weights(self, pretrained=None):
	        """Initialize the weights in backbone.

	        Args:
	            pretrained (str, optional): Path to pre-trained weights.
	                Defaults to None.

	        Raises:
	            NotImplementedError: if ``pretrained`` is given but no parent
	                class provides an ``init_weights`` loader.
	        """
	        # This class derives directly from nn.Module, which has no
	        # ``init_weights``; calling it unconditionally raised AttributeError.
	        # Delegate only when a parent class actually provides a loader.
	        parent_init = getattr(super(), 'init_weights', None)
	        if callable(parent_init):
	            parent_init(pretrained, patch_padding=self.patch_padding)
	        elif pretrained is not None:
	            raise NotImplementedError(
	                'Loading pretrained weights through init_weights() is not '
	                'supported by this backbone; load a checkpoint with '
	                'load_state_dict() instead.')

	        if pretrained is None:
	            def _init_weights(m):
	                if isinstance(m, nn.Linear):
	                    trunc_normal_(m.weight, std=.02)
	                    if m.bias is not None:
	                        nn.init.constant_(m.bias, 0)
	                elif isinstance(m, nn.LayerNorm):
	                    nn.init.constant_(m.bias, 0)
	                    nn.init.constant_(m.weight, 1.0)

	            self.apply(_init_weights)

	    def get_num_layers(self):
	        """Return the number of transformer blocks."""
	        return len(self.blocks)

	    @torch.jit.ignore
	    def no_weight_decay(self):
	        """Return parameter names that should be excluded from weight decay."""
	        return {'pos_embed', 'cls_token'}

	    def forward(self, x):
	        """Return a feature map of shape ``(B, embed_dim, Hp, Wp)``."""
	        B = x.shape[0]
	        x, (Hp, Wp) = self.patch_embed(x)

	        if self.pos_embed is not None:
	            # fit for multiple GPU training
	            # The class-token slot (index 0) is added to every patch token;
	            # per the original note it is zero for sin-cos embeddings, so it
	            # makes no difference there.
	            x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1]

	        for blk in self.blocks:
	            # Checkpointing only pays off when a backward pass can follow,
	            # so plain inference takes the direct path.
	            if self.use_checkpoint and torch.is_grad_enabled():
	                x = checkpoint.checkpoint(blk, x, use_reentrant=False)
	            else:
	                x = blk(x)

	        x = self.last_norm(x)
	        # (B, N, C) -> (B, C, Hp, Wp)
	        x = x.permute(0, 2, 1).view(B, -1, Hp, Wp).contiguous()
	        return x

	    def train(self, mode=True):
	        """Convert the model into training mode, keeping frozen parts frozen."""
	        super().train(mode)
	        # nn.Module.train() flips every submodule, so re-apply the freezes.
	        self._freeze_stages()