# Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# This work is made available under the Nvidia Source Code License-NC.
# To view a copy of this license, check out LICENSE.md
"""Utils for the few shot vid2vid model."""
import random
import numpy as np
import torch
import torch.nn.functional as F
def resample(image, flow):
r"""Resamples an image using the provided flow.
Args:
image (NxCxHxW tensor) : Image to resample.
flow (Nx2xHxW tensor) : Optical flow to resample the image.
Returns:
output (NxCxHxW tensor) : Resampled image.
"""
assert flow.shape[1] == 2
b, c, h, w = image.size()
grid = get_grid(b, (h, w))
flow = torch.cat([flow[:, 0:1, :, :] / ((w - 1.0) / 2.0),
flow[:, 1:2, :, :] / ((h - 1.0) / 2.0)], dim=1)
final_grid = (grid + flow).permute(0, 2, 3, 1)
try:
output = F.grid_sample(image, final_grid, mode='bilinear',
padding_mode='border', align_corners=True)
except Exception:
output = F.grid_sample(image, final_grid, mode='bilinear',
padding_mode='border')
return output
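
# A minimal usage sketch for `resample` (shapes and device are assumptions,
# not taken from this repo): a zero flow field leaves the image unchanged up
# to interpolation error, because the sampling grid is not displaced.
#
#     image = torch.randn(1, 3, 64, 64, device='cuda')
#     flow = torch.zeros(1, 2, 64, 64, device='cuda')
#     warped = resample(image, flow)  # approximately equal to `image`
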
def get_grid(batchsize, size, minval=-1.0, maxval=1.0):
r"""Get a grid ranging [-1, 1] of 2D/3D coordinates.
Args:
batchsize (int) : Batch size.
size (tuple) : (height, width) or (depth, height, width).
minval (float) : minimum value in returned grid.
maxval (float) : maximum value in returned grid.
Returns:
        t_grid (4D or 5D tensor) : Grid of coordinates.
"""
if len(size) == 2:
rows, cols = size
elif len(size) == 3:
deps, rows, cols = size
else:
raise ValueError('Dimension can only be 2 or 3.')
x = torch.linspace(minval, maxval, cols)
x = x.view(1, 1, 1, cols)
x = x.expand(batchsize, 1, rows, cols)
y = torch.linspace(minval, maxval, rows)
y = y.view(1, 1, rows, 1)
y = y.expand(batchsize, 1, rows, cols)
t_grid = torch.cat([x, y], dim=1)
if len(size) == 3:
z = torch.linspace(minval, maxval, deps)
z = z.view(1, 1, deps, 1, 1)
z = z.expand(batchsize, 1, deps, rows, cols)
t_grid = t_grid.unsqueeze(2).expand(batchsize, 2, deps, rows, cols)
t_grid = torch.cat([t_grid, z], dim=1)
t_grid.requires_grad = False
return t_grid.to('cuda')
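
# Illustrative shape check for `get_grid` (an assumption, not repo code):
# `get_grid(2, (4, 8))` returns a 2x2x4x8 CUDA tensor whose channel 0 holds x
# coordinates and channel 1 holds y coordinates, both spanning [-1, 1]. Note
# that the returned grid is always placed on the GPU by this function.
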
def pick_image(images, idx):
r"""Pick the image among images according to idx.
Args:
images (B x N x C x H x W tensor or list of tensors) : N images.
idx (B tensor) : indices to select.
Returns:
image (B x C x H x W) : Selected images.
"""
if type(images) == list:
return [pick_image(r, idx) for r in images]
if idx is None:
return images[:, 0]
elif type(idx) == int:
return images[:, idx]
idx = idx.long().view(-1, 1, 1, 1, 1)
image = images.gather(1, idx.expand_as(images)[:, 0:1])[:, 0]
return image
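
# Illustrative use of `pick_image` (shapes and indices are assumptions): given
# N candidate reference images per batch entry, gather one image per entry.
#
#     images = torch.randn(4, 3, 3, 32, 32)  # B=4 entries, N=3 candidates
#     idx = torch.tensor([0, 2, 1, 0])
#     picked = pick_image(images, idx)       # 4 x 3 x 32 x 32
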
def crop_face_from_data(cfg, is_inference, data):
r"""Crop the face regions in input data and resize to the target size.
This is for training face datasets.
Args:
cfg (obj): Data configuration.
is_inference (bool): Is doing inference or not.
data (dict): Input data.
Returns:
data (dict): Cropped data.
"""
label = data['label'] if 'label' in data else None
image = data['images']
landmarks = data['landmarks-dlib68_xy']
ref_labels = data['few_shot_label'] if 'few_shot_label' in data else None
ref_images = data['few_shot_images']
ref_landmarks = data['few_shot_landmarks-dlib68_xy']
img_size = image.shape[-2:]
h, w = cfg.output_h_w.split(',')
h, w = int(h), int(w)
    # When doing inference, we need to sync common attributes like the crop
    # coordinates between different workers, so all workers crop the same
    # region.
if 'common_attr' in data and 'crop_coords' in data['common_attr']:
# Has been computed before, reusing the previous one.
crop_coords, ref_crop_coords = data['common_attr']['crop_coords']
else:
        # This is the first frame, so we need to compute the bbox.
ref_crop_coords, scale = get_face_bbox_for_data(
ref_landmarks[0], img_size, None, is_inference)
crop_coords, _ = get_face_bbox_for_data(
landmarks[0], img_size, scale, is_inference)
# Crop the images according to the bbox and resize them to target size.
label, image = crop_and_resize([label, image], crop_coords, (h, w))
ref_labels, ref_images = crop_and_resize([ref_labels, ref_images],
ref_crop_coords, (h, w))
data['images'], data['few_shot_images'] = image, ref_images
if label is not None:
data['label'], data['few_shot_label'] = label, ref_labels
if is_inference:
if 'common_attr' not in data:
data['common_attr'] = dict()
data['common_attr']['crop_coords'] = crop_coords, ref_crop_coords
return data
def get_face_bbox_for_data(keypoints, orig_img_size, scale, is_inference):
r"""Get the bbox coordinates for face region.
Args:
keypoints (Nx2 tensor): Facial landmarks.
        orig_img_size (int tuple): Height and width of the input image.
scale (float): When training, randomly scale the crop size for
augmentation.
is_inference (bool): Is doing inference or not.
Returns:
crop_coords (list of int): bbox for face region.
        scale (float): Also returns the scale to ensure reference and target
            frames are cropped using the same scale.
"""
min_y, max_y = int(keypoints[:, 1].min()), int(keypoints[:, 1].max())
min_x, max_x = int(keypoints[:, 0].min()), int(keypoints[:, 0].max())
x_cen, y_cen = (min_x + max_x) // 2, (min_y + max_y) // 2
H, W = orig_img_size
w = h = (max_x - min_x)
if not is_inference:
# During training, randomly jitter the cropping position by offset
# amount for augmentation.
offset_max = 0.2
offset = [np.random.uniform(-offset_max, offset_max),
np.random.uniform(-offset_max, offset_max)]
# Also augment the crop size.
if scale is None:
scale_max = 0.2
scale = [np.random.uniform(1 - scale_max, 1 + scale_max),
np.random.uniform(1 - scale_max, 1 + scale_max)]
w *= scale[0]
h *= scale[1]
x_cen += int(offset[0] * w)
y_cen += int(offset[1] * h)
# Get the cropping coordinates.
x_cen = max(w, min(W - w, x_cen))
y_cen = max(h * 1.25, min(H - h * 0.75, y_cen))
min_x = x_cen - w
min_y = y_cen - h * 1.25
max_x = min_x + w * 2
max_y = min_y + h * 2
crop_coords = [min_y, max_y, min_x, max_x]
return [int(x) for x in crop_coords], scale
def crop_person_from_data(cfg, is_inference, data):
r"""Crop the person regions in data and resize to the target size.
This is for training full body datasets.
Args:
cfg (obj): Data configuration.
is_inference (bool): Is doing inference or not.
data (dict): Input data.
Returns:
data (dict): Cropped data.
"""
label = data['label']
image = data['images']
use_few_shot = 'few_shot_label' in data
if use_few_shot:
ref_labels = data['few_shot_label']
ref_images = data['few_shot_images']
img_size = image.shape[-2:]
output_h, output_w = cfg.output_h_w.split(',')
output_h, output_w = int(output_h), int(output_w)
output_aspect_ratio = output_w / output_h
if 'human_instance_maps' in data:
# Remove other people in the DensePose map except for the current
# target.
label = remove_other_ppl(label, data['human_instance_maps'])
if use_few_shot:
ref_labels = remove_other_ppl(ref_labels,
data['few_shot_human_instance_maps'])
# Randomly jitter the crop position by offset amount for augmentation.
offset = ref_offset = None
if not is_inference:
offset = np.random.randn(2) * 0.05
offset = np.minimum(1, np.maximum(-1, offset))
ref_offset = np.random.randn(2) * 0.02
ref_offset = np.minimum(1, np.maximum(-1, ref_offset))
# Randomly scale the crop size for augmentation.
# Final cropped size = person height * scale.
scale = ref_scale = 1.5
if not is_inference:
scale = min(2, max(1, scale + np.random.randn() * 0.05))
ref_scale = min(2, max(1, ref_scale + np.random.randn() * 0.02))
    # When doing inference, we need to sync common attributes like the crop
    # coordinates between different workers, so all workers crop the same
    # region.
if 'common_attr' in data:
# Has been computed before, reusing the previous one.
crop_coords, ref_crop_coords = data['common_attr']['crop_coords']
else:
        # This is the first frame, so we need to compute the bbox.
crop_coords = get_person_bbox_for_data(label, img_size, scale,
output_aspect_ratio, offset)
if use_few_shot:
ref_crop_coords = get_person_bbox_for_data(
ref_labels, img_size, ref_scale,
output_aspect_ratio, ref_offset)
else:
ref_crop_coords = None
# Crop the images according to the bbox and resize them to target size.
label = crop_and_resize(label, crop_coords, (output_h, output_w), 'nearest')
image = crop_and_resize(image, crop_coords, (output_h, output_w))
if use_few_shot:
ref_labels = crop_and_resize(ref_labels, ref_crop_coords,
(output_h, output_w), 'nearest')
ref_images = crop_and_resize(ref_images, ref_crop_coords,
(output_h, output_w))
data['label'], data['images'] = label, image
if use_few_shot:
data['few_shot_label'], data['few_shot_images'] = ref_labels, ref_images
if 'human_instance_maps' in data:
del data['human_instance_maps']
if 'few_shot_human_instance_maps' in data:
del data['few_shot_human_instance_maps']
if is_inference:
data['common_attr'] = dict()
data['common_attr']['crop_coords'] = crop_coords, ref_crop_coords
return data
def get_person_bbox_for_data(pose_map, orig_img_size, scale=1.5,
crop_aspect_ratio=1, offset=None):
r"""Get the bbox (pixel coordinates) to crop for person body region.
Args:
pose_map (NxCxHxW tensor): Input pose map.
        orig_img_size (int tuple): Height and width of the input image.
scale (float): When training, randomly scale the crop size for
augmentation.
        crop_aspect_ratio (float): Output aspect ratio.
offset (list of float): Offset for crop position.
Returns:
crop_coords (list of int): bbox for body region.
"""
H, W = orig_img_size
assert pose_map.dim() == 4
nonzero_indices = (pose_map[:, :3] > 0).nonzero(as_tuple=False)
if nonzero_indices.size(0) == 0:
bw = int(H * crop_aspect_ratio // 2)
return [0, H, W // 2 - bw, W // 2 + bw]
y_indices, x_indices = nonzero_indices[:, 2], nonzero_indices[:, 3]
y_min, y_max = y_indices.min().item(), y_indices.max().item()
x_min, x_max = x_indices.min().item(), x_indices.max().item()
y_cen = int(y_min + y_max) // 2
x_cen = int(x_min + x_max) // 2
y_len = y_max - y_min
x_len = x_max - x_min
# bh, bw: half of height / width of final cropped size.
bh = int(min(H, max(H // 2, y_len * scale))) // 2
bh = max(bh, int(x_len * scale / crop_aspect_ratio) // 2)
bw = int(bh * crop_aspect_ratio)
# Randomly offset the cropped position for augmentation.
if offset is not None:
x_cen += int(offset[0] * bw)
y_cen += int(offset[1] * bh)
x_cen = max(bw, min(W - bw, x_cen))
y_cen = max(bh, min(H - bh, y_cen))
return [(y_cen - bh), (y_cen + bh), (x_cen - bw), (x_cen + bw)]
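
# Illustrative fallback behavior of `get_person_bbox_for_data` (values are
# assumptions): a pose map with no positive pixels in its first 3 channels
# yields a centered crop spanning the full image height.
#
#     pose = torch.zeros(1, 6, 256, 192) - 1            # all "background"
#     box = get_person_bbox_for_data(pose, (256, 192), crop_aspect_ratio=0.75)
#     # box == [0, 256, 0, 192]  (y_min, y_max, x_min, x_max)
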
def crop_and_resize(img, coords, size=None, method='bilinear'):
r"""Crop the image using the given coordinates and resize to target size.
Args:
img (tensor or list of tensors): Input image.
coords (list of int): Pixel coordinates to crop.
size (list of int): Output size.
method (str): Interpolation method.
Returns:
img (tensor or list of tensors): Output image.
"""
if isinstance(img, list):
return [crop_and_resize(x, coords, size, method) for x in img]
if img is None:
return None
min_y, max_y, min_x, max_x = coords
img = img[:, :, min_y:max_y, min_x:max_x]
if size is not None:
if method == 'nearest':
img = F.interpolate(img, size=size, mode=method)
else:
img = F.interpolate(img, size=size, mode=method,
align_corners=False)
return img
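
# Illustrative use of `crop_and_resize` (shapes are assumptions): crop a
# 100 x 80 region and resize it to a 256 x 192 output resolution.
#
#     img = torch.randn(1, 3, 512, 512)
#     out = crop_and_resize(img, [10, 110, 20, 100], size=(256, 192))
#     # out.shape == (1, 3, 256, 192)
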
def remove_other_ppl(labels, densemasks):
r"""Remove other people in the label map except for the current target
by looking at the id in the densemask map.
Args:
labels (NxCxHxW tensor): Input labels.
densemasks (Nx1xHxW tensor): Densemask maps.
Returns:
labels (NxCxHxW tensor): Output labels.
"""
densemasks = densemasks[:, 0:1] * 255
for idx in range(labels.shape[0]):
label, densemask = labels[idx], densemasks[idx]
        # Use the OpenPose result to find the person id in the densemask map
        # that has the most overlap with the target person.
openpose = label[3:]
valid = (openpose[0] > 0) | (openpose[1] > 0) | (openpose[2] > 0)
dp_valid = densemask[valid.unsqueeze(0)]
if dp_valid.shape[0]:
ind = np.bincount(dp_valid).argmax()
# Remove all other people that have different indices.
label = label * (densemask == ind).float()
labels[idx] = label
return labels
def select_object(data, obj_indices=None):
r"""Select the object/person in the dict according to the object index.
Currently it's used to select the target person in OpenPose dict.
Args:
data (dict): Input data.
obj_indices (list of int): Indices for the objects to select.
Returns:
data (dict): Output data.
"""
op_keys = ['poses-openpose', 'captions-clip']
for op_key in op_keys:
if op_key in data:
for i in range(len(data[op_key])):
# data[op_key] is a list of dicts for different frames.
# people = data[op_key][i]['people']
people = data[op_key][i]
# "people" is a list of people dicts found by OpenPose. We will
# use the obj_index to get the target person from the list, and
# write it back to the dict.
# data[op_key][i]['people'] = [people[obj_indices[i]]]
if obj_indices is not None:
data[op_key][i] = people[obj_indices[i]]
else:
if op_key == 'poses-openpose':
data[op_key][i] = people[0]
else:
idx = random.randint(0, len(people) - 1)
data[op_key][i] = people[idx]
return data
def concat_frames(prev, now, n_frames):
r"""Concat previous and current frames and only keep the latest $(n_frames).
If concatenated frames are longer than $(n_frames), drop the oldest one.
Args:
prev (NxTxCxHxW tensor): Tensor for previous frames.
now (NxCxHxW tensor): Tensor for current frame.
n_frames (int): Max number of frames to store.
Returns:
result (NxTxCxHxW tensor): Updated tensor.
"""
now = now.unsqueeze(1)
if prev is None:
return now
if prev.shape[1] == n_frames:
prev = prev[:, 1:]
return torch.cat([prev, now], dim=1)
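
# Illustrative use of `concat_frames` (shapes are assumptions): maintain a
# rolling buffer that keeps at most the 2 most recent frames.
#
#     buf = None
#     for _ in range(3):
#         frame = torch.randn(1, 3, 64, 64)
#         buf = concat_frames(buf, frame, n_frames=2)
#     # buf.shape == (1, 2, 3, 64, 64); the oldest frame was dropped.
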
def combine_fg_mask(fg_mask, ref_fg_mask, has_fg):
r"""Get the union of target and reference foreground masks.
Args:
fg_mask (tensor): Foreground mask for target image.
ref_fg_mask (tensor): Foreground mask for reference image.
has_fg (bool): Whether the image can be classified into fg/bg.
Returns:
output (tensor or int): Combined foreground mask.
"""
return ((fg_mask > 0) | (ref_fg_mask > 0)).float() if has_fg else 1
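
# Illustrative use of `combine_fg_mask` (values are assumptions): the result
# is 1 wherever either mask is positive; with has_fg=False, the scalar 1 acts
# as an all-foreground mask in later multiplications.
#
#     combine_fg_mask(torch.tensor([[0., 1.]]), torch.tensor([[1., 0.]]),
#                     has_fg=True)  # tensor([[1., 1.]])
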
def get_fg_mask(densepose_map, has_fg):
r"""Obtain the foreground mask for pose sequences, which only includes
the human. This is done by looking at the body part map from DensePose.
Args:
densepose_map (NxCxHxW tensor): DensePose map.
has_fg (bool): Whether data has foreground or not.
Returns:
mask (Nx1xHxW tensor): fg mask.
"""
if type(densepose_map) == list:
return [get_fg_mask(label, has_fg) for label in densepose_map]
if not has_fg or densepose_map is None:
return 1
if densepose_map.dim() == 5:
densepose_map = densepose_map[:, 0]
# Get the body part map from DensePose.
mask = densepose_map[:, 2:3]
# Make the mask slightly larger.
mask = torch.nn.MaxPool2d(15, padding=7, stride=1)(mask)
mask = (mask > -1).float()
return mask
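
# Illustrative use of `get_fg_mask` (shapes and values are assumptions): the
# part channel of a normalized DensePose map is -1 on background, so any
# pixel above -1 (plus a max-pool dilation margin) becomes foreground.
#
#     densepose = -torch.ones(1, 3, 64, 64)
#     densepose[:, 2, 20:40, 20:40] = 0.5       # a square "person" region
#     fg = get_fg_mask(densepose, has_fg=True)  # 1 x 1 x 64 x 64 dilated mask
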
def get_part_mask(densepose_map):
r"""Obtain mask of different body parts of humans. This is done by
looking at the body part map from DensePose.
Args:
densepose_map (NxCxHxW tensor): DensePose map.
Returns:
mask (NxKxHxW tensor): Body part mask, where K is the number of parts.
"""
# Groups of body parts. Each group contains IDs of body part labels in
# DensePose. The 9 groups here are: background, torso, hands, feet,
# upper legs, lower legs, upper arms, lower arms, head.
part_groups = [[0], [1, 2], [3, 4], [5, 6], [7, 9, 8, 10], [11, 13, 12, 14],
[15, 17, 16, 18], [19, 21, 20, 22], [23, 24]]
n_parts = len(part_groups)
need_reshape = densepose_map.dim() == 4
if need_reshape:
bo, t, h, w = densepose_map.size()
densepose_map = densepose_map.view(-1, h, w)
b, h, w = densepose_map.size()
part_map = (densepose_map / 2 + 0.5) * 24
assert (part_map >= 0).all() and (part_map < 25).all()
mask = torch.cuda.ByteTensor(b, n_parts, h, w).fill_(0)
for i in range(n_parts):
for j in part_groups[i]:
# Account for numerical errors.
mask[:, i] = mask[:, i] | (
(part_map > j - 0.1) & (part_map < j + 0.1)).byte()
if need_reshape:
mask = mask.view(bo, t, -1, h, w)
return mask.float()
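
# Illustrative use of `get_part_mask` (shapes are assumptions): the input is
# the part-ID channel with IDs 0-24 rescaled to [-1, 1], and the output
# regroups those 25 IDs into the 9 part groups listed above.
#
#     part_channel = -torch.ones(2, 16, 16, device='cuda')  # all background
#     masks = get_part_mask(part_channel)  # 2 x 9 x 16 x 16; masks[:, 0] == 1
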
def get_face_mask(densepose_map):
r"""Obtain mask of faces.
Args:
densepose_map (3D or 4D tensor): DensePose map.
Returns:
mask (3D or 4D tensor): Face mask.
"""
need_reshape = densepose_map.dim() == 4
if need_reshape:
bo, t, h, w = densepose_map.size()
densepose_map = densepose_map.view(-1, h, w)
b, h, w = densepose_map.size()
part_map = (densepose_map / 2 + 0.5) * 24
assert (part_map >= 0).all() and (part_map < 25).all()
if densepose_map.is_cuda:
mask = torch.cuda.ByteTensor(b, h, w).fill_(0)
else:
mask = torch.ByteTensor(b, h, w).fill_(0)
for j in [23, 24]:
mask = mask | ((part_map > j - 0.1) & (part_map < j + 0.1)).byte()
if need_reshape:
mask = mask.view(bo, t, h, w)
return mask.float()
def extract_valid_pose_labels(pose_map, pose_type, remove_face_labels,
do_remove=True):
r"""Remove some labels (e.g. face regions) in the pose map if necessary.
Args:
pose_map (3D, 4D or 5D tensor): Input pose map.
pose_type (str): 'both' or 'open'.
remove_face_labels (bool): Whether to remove labels for the face region.
        do_remove (bool): Whether to actually remove the face labels.
Returns:
pose_map (3D, 4D or 5D tensor): Output pose map.
"""
if pose_map is None:
return pose_map
if type(pose_map) == list:
return [extract_valid_pose_labels(p, pose_type, remove_face_labels,
do_remove) for p in pose_map]
orig_dim = pose_map.dim()
assert (orig_dim >= 3 and orig_dim <= 5)
if orig_dim == 3:
pose_map = pose_map.unsqueeze(0).unsqueeze(0)
elif orig_dim == 4:
pose_map = pose_map.unsqueeze(0)
if pose_type == 'open':
# If input is only openpose, remove densepose part.
pose_map = pose_map[:, :, 3:]
elif remove_face_labels and do_remove:
# Remove face part for densepose input.
densepose, openpose = pose_map[:, :, :3], pose_map[:, :, 3:]
face_mask = get_face_mask(pose_map[:, :, 2]).unsqueeze(2)
pose_map = torch.cat([densepose * (1 - face_mask) - face_mask,
openpose], dim=2)
if orig_dim == 3:
pose_map = pose_map[0, 0]
elif orig_dim == 4:
pose_map = pose_map[0]
return pose_map
def normalize_faces(keypoints, ref_keypoints,
dist_scale_x=None, dist_scale_y=None):
r"""Normalize face keypoints w.r.t. the reference face keypoints.
Args:
keypoints (Kx2 numpy array): target facial keypoints.
ref_keypoints (Kx2 numpy array): reference facial keypoints.
Returns:
keypoints (Kx2 numpy array): normalized facial keypoints.
"""
if keypoints.shape[0] == 68:
central_keypoints = [8]
add_upper_face = False
part_list = [[0, 16], [1, 15], [2, 14], [3, 13], [4, 12],
[5, 11], [6, 10], [7, 9, 8],
[17, 26], [18, 25], [19, 24], [20, 23], [21, 22],
[27], [28], [29], [30], [31, 35], [32, 34], [33],
[36, 45], [37, 44], [38, 43], [39, 42], [40, 47], [41, 46],
[48, 54], [49, 53], [50, 52], [51], [55, 59], [56, 58],
[57],
[60, 64], [61, 63], [62], [65, 67], [66]
]
if add_upper_face:
part_list += [[68, 82], [69, 81], [70, 80], [71, 79], [72, 78],
[73, 77], [74, 76, 75]]
elif keypoints.shape[0] == 126:
central_keypoints = [16]
part_list = [[i] for i in range(126)]
else:
raise ValueError('Input keypoints type not supported.')
face_cen = np.mean(keypoints[central_keypoints, :], axis=0)
ref_face_cen = np.mean(ref_keypoints[central_keypoints, :], axis=0)
def get_mean_dists(pts, face_cen):
r"""Get the mean xy distances of keypoints wrt face center."""
mean_dists_x, mean_dists_y = [], []
pts_cen = np.mean(pts, axis=0)
for p, pt in enumerate(pts):
mean_dists_x.append(np.linalg.norm(pt - pts_cen))
mean_dists_y.append(np.linalg.norm(pts_cen - face_cen))
mean_dist_x = sum(mean_dists_x) / len(mean_dists_x) + 1e-3
mean_dist_y = sum(mean_dists_y) / len(mean_dists_y) + 1e-3
return mean_dist_x, mean_dist_y
if dist_scale_x is None:
dist_scale_x, dist_scale_y = [None] * len(part_list), \
[None] * len(part_list)
for i, pts_idx in enumerate(part_list):
pts = keypoints[pts_idx]
if dist_scale_x[i] is None:
ref_pts = ref_keypoints[pts_idx]
mean_dist_x, mean_dist_y = get_mean_dists(pts, face_cen)
ref_dist_x, ref_dist_y = get_mean_dists(ref_pts, ref_face_cen)
dist_scale_x[i] = ref_dist_x / mean_dist_x
dist_scale_y[i] = ref_dist_y / mean_dist_y
pts_cen = np.mean(pts, axis=0)
pts = (pts - pts_cen) * dist_scale_x[i] + \
(pts_cen - face_cen) * dist_scale_y[i] + face_cen
keypoints[pts_idx] = pts
return keypoints, [dist_scale_x, dist_scale_y]
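
# Illustrative call pattern for `normalize_faces` (variable names are
# assumptions): compute the per-part scales on the first frame and reuse them
# for later frames of the same video. Note the keypoints array is modified in
# place, hence the copies.
#
#     kp, scales = normalize_faces(target_kp.copy(), ref_kp)
#     kp_next, _ = normalize_faces(next_kp.copy(), ref_kp,
#                                  dist_scale_x=scales[0],
#                                  dist_scale_y=scales[1])
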
def crop_face_from_output(data_cfg, image, input_label, crop_smaller=0):
r"""Crop out the face region of the image (and resize if necessary to feed
into generator/discriminator).
Args:
data_cfg (obj): Data configuration.
image (NxC1xHxW tensor or list of tensors): Image to crop.
input_label (NxC2xHxW tensor): Input label map.
        crop_smaller (int): Number of pixels by which to shrink the cropped
            region on each side.
Returns:
output (NxC1xHxW tensor or list of tensors): Cropped image.
"""
if type(image) == list:
return [crop_face_from_output(data_cfg, im, input_label, crop_smaller)
for im in image]
output = None
face_size = image.shape[-2] // 32 * 8
for i in range(input_label.size(0)):
ys, ye, xs, xe = get_face_bbox_for_output(data_cfg,
input_label[i:i + 1],
crop_smaller=crop_smaller)
output_i = F.interpolate(image[i:i + 1, -3:, ys:ye, xs:xe],
size=(face_size, face_size), mode='bilinear',
align_corners=True)
# output_i = image[i:i + 1, -3:, ys:ye, xs:xe]
output = torch.cat([output, output_i]) if i != 0 else output_i
return output
def get_face_bbox_for_output(data_cfg, pose, crop_smaller=0):
r"""Get pixel coordinates of the face bounding box.
Args:
data_cfg (obj): Data configuration.
pose (NxCxHxW tensor): Pose label map.
        crop_smaller (int): Number of pixels by which to shrink the cropped
            region on each side.
Returns:
output (list of int): Face bbox.
"""
if pose.dim() == 3:
pose = pose.unsqueeze(0)
elif pose.dim() == 5:
pose = pose[-1, -1:]
_, _, h, w = pose.size()
use_openpose = 'pose_maps-densepose' not in data_cfg.input_labels
if use_openpose: # Use openpose face keypoints to identify face region.
for input_type in data_cfg.input_types:
if 'poses-openpose' in input_type:
num_ch = input_type['poses-openpose'].num_channels
if num_ch > 3:
face = (pose[:, -1] > 0).nonzero(as_tuple=False)
else:
raise ValueError('Not implemented yet.')
else: # Use densepose labels.
face = (pose[:, 2] > 0.9).nonzero(as_tuple=False)
ylen = xlen = h // 32 * 8
if face.size(0):
y, x = face[:, 1], face[:, 2]
ys, ye = y.min().item(), y.max().item()
xs, xe = x.min().item(), x.max().item()
if use_openpose:
xc, yc = (xs + xe) // 2, (ys * 3 + ye * 2) // 5
ylen = int((xe - xs) * 2.5)
else:
xc, yc = (xs + xe) // 2, (ys + ye) // 2
ylen = int((ye - ys) * 1.25)
ylen = xlen = min(w, max(32, ylen))
yc = max(ylen // 2, min(h - 1 - ylen // 2, yc))
xc = max(xlen // 2, min(w - 1 - xlen // 2, xc))
else:
yc = h // 4
xc = w // 2
ys, ye = yc - ylen // 2, yc + ylen // 2
xs, xe = xc - xlen // 2, xc + xlen // 2
if crop_smaller != 0: # Crop slightly smaller region inside face.
ys += crop_smaller
xs += crop_smaller
ye -= crop_smaller
xe -= crop_smaller
return [ys, ye, xs, xe]
def crop_hand_from_output(data_cfg, image, input_label):
r"""Crop out the hand region of the image.
Args:
data_cfg (obj): Data configuration.
image (NxC1xHxW tensor or list of tensors): Image to crop.
input_label (NxC2xHxW tensor): Input label map.
Returns:
output (NxC1xHxW tensor or list of tensors): Cropped image.
"""
if type(image) == list:
return [crop_hand_from_output(data_cfg, im, input_label)
for im in image]
output = None
for i in range(input_label.size(0)):
coords = get_hand_bbox_for_output(data_cfg, input_label[i:i + 1])
if coords:
for coord in coords:
ys, ye, xs, xe = coord
output_i = image[i:i + 1, -3:, ys:ye, xs:xe]
output = torch.cat([output, output_i]) \
if output is not None else output_i
return output
def get_hand_bbox_for_output(data_cfg, pose):
r"""Get coordinates of the hand bounding box.
Args:
data_cfg (obj): Data configuration.
pose (NxCxHxW tensor): Pose label map.
Returns:
output (list of int): Hand bbox.
"""
if pose.dim() == 3:
pose = pose.unsqueeze(0)
elif pose.dim() == 5:
pose = pose[-1, -1:]
_, _, h, w = pose.size()
ylen = xlen = h // 64 * 8
coords = []
colors = [[0.95, 0.5, 0.95], [0.95, 0.95, 0.5]]
for i, color in enumerate(colors):
if pose.shape[1] > 6: # Using one-hot encoding for openpose.
idx = -3 if i == 0 else -2
hand = (pose[:, idx] == 1).nonzero(as_tuple=False)
else:
raise ValueError('Not implemented yet.')
if hand.size(0):
y, x = hand[:, 1], hand[:, 2]
ys, ye, xs, xe = y.min().item(), y.max().item(), \
x.min().item(), x.max().item()
xc, yc = (xs + xe) // 2, (ys + ye) // 2
yc = max(ylen // 2, min(h - 1 - ylen // 2, yc))
xc = max(xlen // 2, min(w - 1 - xlen // 2, xc))
ys, ye, xs, xe = yc - ylen // 2, yc + ylen // 2, \
xc - xlen // 2, xc + xlen // 2
coords.append([ys, ye, xs, xe])
return coords
def pre_process_densepose(pose_cfg, pose_map, is_infer=False):
r"""Pre-process the DensePose part of input label map.
Args:
pose_cfg (obj): Pose data configuration.
pose_map (NxCxHxW tensor): Pose label map.
is_infer (bool): Is doing inference.
Returns:
pose_map (NxCxHxW tensor): Processed pose label map.
"""
part_map = pose_map[:, :, 2] * 255 # should be within [0-24]
assert (part_map >= 0).all() and (part_map < 25).all()
    # Randomly drop some body parts during training.
if not is_infer:
random_drop_prob = getattr(pose_cfg, 'random_drop_prob', 0)
else:
random_drop_prob = 0
if random_drop_prob > 0:
densepose_map = pose_map[:, :, :3]
for part_id in range(1, 25):
if (random.random() < random_drop_prob):
part_mask = abs(part_map - part_id) < 0.1
densepose_map[part_mask.unsqueeze(2).expand_as(
densepose_map)] = 0
pose_map[:, :, :3] = densepose_map
    # Renormalize the DensePose part channel so the part IDs 0-24 span [0, 1].
pose_map[:, :, 2] = pose_map[:, :, 2] * (255 / 24)
# Normalize from [0, 1] to [-1, 1].
pose_map = pose_map * 2 - 1
return pose_map
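
# Worked example of the renormalization above (values are illustrative): a
# pixel storing part ID 12 enters as 12 / 255 ~= 0.047, becomes 12 / 24 = 0.5
# after multiplying by 255 / 24, and lands at 0.0 after the final `* 2 - 1`
# mapping to the [-1, 1] output range.
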
def random_roll(tensors):
r"""Randomly roll the input tensors along x and y dimensions. Also randomly
flip the tensors.
Args:
tensors (list of 4D tensors): Input tensors.
Returns:
output (list of 4D tensors): Rolled tensors.
"""
h, w = tensors[0].shape[2:]
ny = np.random.choice([np.random.randint(h//16),
h-np.random.randint(h//16)])
nx = np.random.choice([np.random.randint(w//16),
w-np.random.randint(w//16)])
flip = np.random.rand() > 0.5
return [roll(t, ny, nx, flip) for t in tensors]
def roll(t, ny, nx, flip=False):
r"""Roll and flip the tensor by specified amounts.
Args:
t (4D tensor): Input tensor.
ny (int): Amount to roll along y dimension.
nx (int): Amount to roll along x dimension.
flip (bool): Whether to flip input.
Returns:
t (4D tensor): Output tensor.
"""
t = torch.cat([t[:, :, -ny:], t[:, :, :-ny]], dim=2)
t = torch.cat([t[:, :, :, -nx:], t[:, :, :, :-nx]], dim=3)
if flip:
t = torch.flip(t, dims=[3])
return t
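
# Illustrative use of `roll` (values are assumptions): rolling by (ny, nx)
# wraps the last ny rows and last nx columns around to the front.
#
#     t = torch.arange(16.).view(1, 1, 4, 4)
#     rolled = roll(t, ny=1, nx=1)
#     # rolled[0, 0, 0, 0] == 15.  (bottom-right pixel wrapped to top-left)
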
def detach(output):
r"""Detach tensors in the dict.
Args:
output (dict): Output dict.
Returns:
output (dict): Detached output dict.
"""
if type(output) == dict:
new_dict = dict()
for k, v in output.items():
new_dict[k] = detach(v)
return new_dict
elif type(output) == torch.Tensor:
return output.detach()
return output
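
# Illustrative use of `detach` (shapes are assumptions): detach every tensor
# in a (possibly nested) dict of network outputs so it can be stored without
# keeping the autograd graph alive.
#
#     out = {'fake_images': torch.randn(1, 3, 4, 4, requires_grad=True)}
#     out = detach(out)
#     # out['fake_images'].requires_grad is now False
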