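# Gradio demo for GenZoo: reconstruct a 3D mesh from a single input image,
# shown both as a 2D overlay on the image and as a downloadable OBJ file.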
import os

# Select a headless OpenGL backend before pyrender is imported,
# unless the caller has already chosen one.
if "PYOPENGL_PLATFORM" not in os.environ:
    os.environ["PYOPENGL_PLATFORM"] = "egl"

import math

import cv2
import gradio as gr
import numpy as np
import pyrender
import torch
import trimesh

from src.datasets.vitdet_dataset import ViTDetDataset
from src.models import load_hmr2

# Color of the mesh
LIGHT_BLUE = (0.65098039, 0.74117647, 0.85882353)
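

# pyrender has no built-in weak-perspective camera, so one is defined here by
# subclassing pyrender.Camera and supplying the projection matrix directly.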
class WeakPerspectiveCamera(pyrender.Camera):
    def __init__(
        self,
        scale,
        translation,
        znear=10.0,
        zfar=1000.0,
        name=None,
    ):
        super(WeakPerspectiveCamera, self).__init__(
            znear=znear,
            zfar=zfar,
            name=name,
        )
        self.scale = scale
        self.translation = translation

    def get_projection_matrix(self, width=None, height=None):
        # Weak perspective: scale x/y, translate in the image plane,
        # and flatten depth (no perspective divide).
        P = np.eye(4)
        P[0, 0] = self.scale[0]
        P[1, 1] = self.scale[1]
        P[0, 3] = self.translation[0] * self.scale[0]
        P[1, 3] = -self.translation[1] * self.scale[1]
        P[2, 2] = -0.1
        return P
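

# Offscreen renderer with a persistent scene lit by three directional lights;
# mesh and camera nodes are added per render call and removed afterwards.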
class Renderer:
    def __init__(self, faces, resolution=(1024, 1024), orig_img=False):
        self.resolution = resolution
        self.faces = faces
        self.orig_img = orig_img
        self.renderer = pyrender.OffscreenRenderer(
            viewport_width=self.resolution[0],
            viewport_height=self.resolution[1],
            point_size=1.0,
        )
        self.scene = pyrender.Scene(bg_color=[0.0, 0.0, 0.0, 0.0], ambient_light=(0.3, 0.3, 0.3))
        light = pyrender.DirectionalLight(color=[1.0, 1.0, 1.0], intensity=0.8)
        # Add the light three times, from different directions.
        light_pose = np.eye(4)
        light_pose[:3, 3] = [0, -1, 1]
        self.scene.add(light, pose=light_pose)
        light_pose[:3, 3] = [0, 1, 1]
        self.scene.add(light, pose=light_pose)
        light_pose[:3, 3] = [1, 1, 2]
        self.scene.add(light, pose=light_pose)
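
    # Render one mesh under a weak-perspective camera; the mesh and camera
    # nodes are transient and removed once the frame is drawn.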
    def render(self, verts, cam, color=LIGHT_BLUE, znear=1.0, zfar=10000.0):
        mesh = trimesh.Trimesh(vertices=verts, faces=self.faces, process=False)
        # Rotate 180 degrees about the x-axis (flips y and z) to match the
        # renderer's camera convention.
        Rx = trimesh.transformations.rotation_matrix(math.radians(180), [1, 0, 0])
        mesh.apply_transform(Rx)
        sx, sy, tx, ty = cam
        camera = WeakPerspectiveCamera(scale=[sx, sy], translation=[tx, ty], znear=znear, zfar=zfar)
        material = pyrender.MetallicRoughnessMaterial(
            metallicFactor=0.0, alphaMode="OPAQUE", baseColorFactor=color
        )
        mesh = pyrender.Mesh.from_trimesh(mesh, material=material, smooth=True)
        mesh_node = self.scene.add(mesh, "mesh")
        camera_pose = np.eye(4)
        cam_node = self.scene.add(camera, pose=camera_pose)
        render_flags = pyrender.RenderFlags.RGBA
        rgb, depth = self.renderer.render(self.scene, flags=render_flags)
        self.scene.remove_node(mesh_node)
        self.scene.remove_node(cam_node)
        return rgb, depth
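

# Write the mesh to "out_mesh.obj" in the working directory so the Gradio
# Model3D output can load it.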
def create_temp_obj(vertices, faces):
    mesh = trimesh.Trimesh(
        vertices=vertices,
        faces=faces,
        vertex_colors=np.tile(np.array(LIGHT_BLUE + (1.0,)), (len(vertices), 1)),
    )
    temp_path = os.path.join(os.getcwd(), "out_mesh.obj")
    mesh.export(temp_path)
    return temp_path
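

# Letterbox helper: resize the image to fit a 1024x1024 canvas while
# preserving aspect ratio, then center it with zero padding.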
def resize_and_pad(img):
    original_type = img.dtype
    img_to_process = img.copy()
    h, w = img_to_process.shape[:2]
    target_size = 1024
    # Scale so the longer side becomes target_size.
    scale = min(target_size / w, target_size / h)
    new_w = int(w * scale)
    new_h = int(h * scale)
    resized = cv2.resize(img_to_process, (new_w, new_h), interpolation=cv2.INTER_AREA)
    if len(img.shape) == 3:
        canvas = np.zeros((target_size, target_size, img.shape[2]), dtype=original_type)
    else:
        canvas = np.zeros((target_size, target_size), dtype=original_type)
    # Center the resized image on the square canvas.
    x_offset = (target_size - new_w) // 2
    y_offset = (target_size - new_h) // 2
    canvas[y_offset : y_offset + new_h, x_offset : x_offset + new_w] = resized
    return canvas
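

# Gradio callback: pad the input to a square, run the model, render the
# predicted mesh over the image, and export the mesh as an OBJ.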
def process_image(input_image):
    img = resize_and_pad(input_image["composite"])
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    hmr2, hmr_2_cfg = load_hmr2()
    device = torch.device("cpu")
    hmr2 = hmr2.to(device)
    hmr2.eval()
    # Use the full image as the detection box.
    bbox = [0, 0, img.shape[1], img.shape[0]]
    dataset = ViTDetDataset(hmr_2_cfg, img, np.array([bbox]))
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)
    batch = next(iter(dataloader))
    with torch.inference_mode():
        out = hmr2(batch)
    pred_verts = hmr2.smpl(
        **{k: v.float() for k, v in out["pred_smpl_params"].items()}, pose2rot=False
    ).vertices[0]
    scale, tx, ty = out["scale"], out["tx"], out["ty"]
    # Negate x and y for the OBJ export, to orient the mesh in the 3D viewer.
    obj_verts = pred_verts.detach().cpu().numpy()
    obj_verts[:, 1] = -obj_verts[:, 1]
    obj_verts[:, 0] = -obj_verts[:, 0]
    obj_path = create_temp_obj(obj_verts, hmr2.smpl.faces)
    if str(device) == "cpu":
        pred_verts = pred_verts * torch.tensor([-1, -1, 1])[None]
    renderer = Renderer(hmr2.smpl.faces, resolution=(img.shape[1], img.shape[0]))
    factor = 2.0
    rendered, depth = renderer.render(
        pred_verts.detach().cpu().numpy(),
        (scale * factor, scale * factor, tx / scale, ty / scale),
    )
    # Alpha-composite the rendered mesh over the input image.
    rendered_float = rendered.astype(np.float32) / 255.0
    out_img_float = img.astype(np.float32) / 255.0
    mask = rendered_float[:, :, 3]
    mask = np.stack([mask] * 3, axis=-1)
    rendered_rgb = rendered_float[:, :, :3]
    mesh_overlay = out_img_float * (1 - mask) + rendered_rgb * mask
    mesh_overlay = (mesh_overlay * 255).astype(np.uint8)
    return cv2.cvtColor(mesh_overlay, cv2.COLOR_RGB2BGR), obj_path
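

# Build the Gradio interface: an image-editor input and two outputs
# (the mesh-overlay image and the interactive 3D model).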
iface = gr.Interface(
    fn=process_image,
    analytics_enabled=False,
    inputs=gr.ImageEditor(
        sources=("upload", "clipboard"),
        brush=False,
        eraser=False,
        crop_size="1:1",
        layers=False,
        placeholder="Upload an image or select from the examples.",
    ),
    outputs=[
        gr.Image(label="Mesh overlay"),
        gr.Model3D(
            clear_color=[0.0, 0.0, 0.0, 0.0],
            label="3D Model",
            display_mode="point_cloud",
        ),
    ],
    title="GenZoo",
description=""" | |
# Generative Zoo | |
https://genzoo.is.tue.mpg.de | |
## Usage | |
1. **Input**: Select an example image or upload your own. | |
2. **Processing**: Crop the image to a square. | |
3. **Output**: | |
- 2D mesh overlay on the original image | |
- Interactive 3D model visualization | |
The demo is provided for non-commercial purposes, and its use is governed by the [LICENSE](https://genzoo.is.tue.mpg.de/license.html). \n | |
We thank the authors of [Humans in 4D: Reconstructing and Tracking Humans with Transformers](https://shubham-goel.github.io/4dhumans/) from which we borrowed components. | |
""", | |
    examples=[
        "gradio_example_images/000014.png",
        "gradio_example_images/000018.png",
        "gradio_example_images/000247.png",
        "gradio_example_images/000315.png",
        "gradio_example_images/001114.png",
    ],
)

iface.launch()