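# Pre-process the ScanNet++ dataset for use with DUSt3R.
#
# For every scene listed under the precomputed-pairs directory, this script
# undistorts and rescales the selected DSLR and iPhone images, renders a
# metric depth map for each view from the aligned scene mesh, and saves
# per-scene and aggregated metadata (intrinsics, camera-to-world poses,
# image pairs).
#
# Example usage (paths are placeholders):
#   python3 preprocess_scannetpp.py --scannetpp_dir /path/to/scannetpp \
#       --precomputed_pairs /path/to/scannetpp_pairs --pyopengl-platform egl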
import argparse
import json
import os
import os.path as osp
import re

import cv2
import dust3r.utils.geometry as geometry
import numpy as np
import PIL.Image as Image
import pyrender
import trimesh
import trimesh.exchange.ply
from dust3r.datasets.utils.cropping import rescale_image_depthmap
from scipy.spatial.transform import Rotation
from tqdm import tqdm

inv = np.linalg.inv
norm = np.linalg.norm

# DSLR frames are named like "DSC01234.JPG", iPhone frames like "frame_000123.jpg"
REGEXPR_DSLR = re.compile(r"^DSC(?P<frameid>\d+)\.JPG$")
REGEXPR_IPHONE = re.compile(r"frame_(?P<frameid>\d+)\.jpg$")

DEBUG_VIZ = None  # set to a non-None value to enable matplotlib visualizations
if DEBUG_VIZ is not None:
    import matplotlib.pyplot as plt  # noqa: F401

# 4x4 change-of-basis between the OpenGL and OpenCV camera conventions
# (the y and z camera axes are flipped)
OPENGL_TO_OPENCV = np.float32(
    [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]]
)


def get_parser():
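    """Build the command-line interface for this preprocessing script."""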
    parser = argparse.ArgumentParser()
    parser.add_argument("--scannetpp_dir", required=True)
    parser.add_argument("--precomputed_pairs", required=True)
    parser.add_argument("--output_dir", default="data/scannetpp_processed")
    parser.add_argument(
        "--target_resolution",
        default=920,
        type=int,
        help="target resolution of the longer image side, in pixels",
    )
    parser.add_argument(
        "--pyopengl-platform",
        type=str,
        default="",
        help="value for the PYOPENGL_PLATFORM environment variable (e.g. egl)",
    )
    return parser


def pose_from_qwxyz_txyz(elems):
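    """Convert a COLMAP (qw, qx, qy, qz, tx, ty, tz) world-to-camera pose
    into a 4x4 camera-to-world matrix."""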
    qw, qx, qy, qz, tx, ty, tz = map(float, elems)
    pose = np.eye(4)
    # scipy expects quaternions in (x, y, z, w) order
    pose[:3, :3] = Rotation.from_quat((qx, qy, qz, qw)).as_matrix()
    pose[:3, 3] = (tx, ty, tz)
    return np.linalg.inv(pose)  # world-to-camera -> camera-to-world


def get_frame_number(name, cam_type="dslr"):
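    """Extract the frame id from an image filename of the given camera type."""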
    if cam_type == "dslr":
        regex_expr = REGEXPR_DSLR
    elif cam_type == "iphone":
        regex_expr = REGEXPR_IPHONE
    else:
        raise NotImplementedError(f"wrong {cam_type=} for get_frame_number")
    matches = re.match(regex_expr, name)
    return matches["frameid"]


def load_sfm(sfm_dir, cam_type="dslr"):
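    """Parse a COLMAP text model (cameras.txt, images.txt, points3D.txt).

    Returns (img_idx, img_infos, points3D, observations): image name -> COLMAP
    id, image id -> per-image info (intrinsics, pose, sparse 2D points),
    3D point id -> coordinates, and image id -> (point3d_idx, point2d_idx)
    observations.
    """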
    # load cameras.txt: CAMERA_ID, MODEL, WIDTH, HEIGHT, PARAMS[]
    with open(osp.join(sfm_dir, "cameras.txt"), "r") as f:
        raw = f.read().splitlines()[3:]  # skip the 3 header comment lines

    intrinsics = {}
    for camera in tqdm(raw, position=1, leave=False):
        camera = camera.split(" ")
        intrinsics[int(camera[0])] = [camera[1]] + [float(cam) for cam in camera[2:]]

    # load images.txt: two lines per image, the first with the pose and name,
    # the second with the sparse 2D keypoints
    with open(os.path.join(sfm_dir, "images.txt"), "r") as f:
        raw = f.read().splitlines()
        raw = [line for line in raw if not line.startswith("#")]

    img_idx = {}
    img_infos = {}
    for image, points in tqdm(
        zip(raw[0::2], raw[1::2]), total=len(raw) // 2, position=1, leave=False
    ):
        image = image.split(" ")
        points = points.split(" ")

        idx = image[0]
        img_name = image[-1]
        assert img_name not in img_idx, "duplicate db image: " + img_name
        img_idx[img_name] = idx

        # keypoints are stored as (x, y, point3d_id) triplets;
        # an id of -1 means the keypoint was not triangulated
        current_points2D = {
            int(i): (float(x), float(y))
            for i, x, y in zip(points[2::3], points[0::3], points[1::3])
            if i != "-1"
        }
        img_infos[idx] = dict(
            intrinsics=intrinsics[int(image[-2])],
            path=img_name,
            frame_id=get_frame_number(img_name, cam_type),
            cam_to_world=pose_from_qwxyz_txyz(image[1:-2]),
            sparse_pts2d=current_points2D,
        )

    # load points3D.txt: POINT3D_ID, X, Y, Z, R, G, B, ERROR, TRACK[]
    # where TRACK is a list of (IMAGE_ID, POINT2D_IDX) pairs
    with open(os.path.join(sfm_dir, "points3D.txt"), "r") as f:
        raw = f.read().splitlines()
        raw = [line for line in raw if not line.startswith("#")]

    points3D = {}
    observations = {idx: [] for idx in img_infos.keys()}
    for point in tqdm(raw, position=1, leave=False):
        point = point.split()
        point_3d_idx = int(point[0])
        points3D[point_3d_idx] = tuple(map(float, point[1:4]))
        if len(point) > 8:
            for idx, point_2d_idx in zip(point[8::2], point[9::2]):
                observations[idx].append((point_3d_idx, int(point_2d_idx)))

    return img_idx, img_infos, points3D, observations


def subsample_img_infos(img_infos, num_images, allowed_name_subset=None):
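    """Keep at most num_images entries, evenly spaced in frame order,
    optionally restricted to allowed_name_subset beforehand."""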
    img_infos_val = [(idx, val) for idx, val in img_infos.items()]
    if allowed_name_subset is not None:
        img_infos_val = [
            (idx, val)
            for idx, val in img_infos_val
            if val["path"] in allowed_name_subset
        ]

    if len(img_infos_val) > num_images:
        img_infos_val = sorted(img_infos_val, key=lambda x: x[1]["frame_id"])
        kept_idx = (
            np.round(np.linspace(0, len(img_infos_val) - 1, num_images))
            .astype(int)
            .tolist()
        )
        img_infos_val = [img_infos_val[idx] for idx in kept_idx]
    return {idx: val for idx, val in img_infos_val}


def undistort_images(intrinsics, rgb, mask):
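    """Undistort an image and its mask, given COLMAP intrinsics
    [model, width, height, fx, fy, cx, cy, *distortion].

    Returns (width, height, new_K, undistorted_image, undistorted_mask),
    with new_K in the COLMAP convention.
    """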
    camera_type = intrinsics[0]

    width = int(intrinsics[1])
    height = int(intrinsics[2])
    fx = intrinsics[3]
    fy = intrinsics[4]
    cx = intrinsics[5]
    cy = intrinsics[6]
    distortion = np.array(intrinsics[7:])

    K = np.zeros([3, 3])
    K[0, 0] = fx
    K[0, 2] = cx
    K[1, 1] = fy
    K[1, 2] = cy
    K[2, 2] = 1

    # COLMAP and OpenCV pixel-center conventions differ by half a pixel
    K = geometry.colmap_to_opencv_intrinsics(K)
    if camera_type == "OPENCV_FISHEYE":
        assert len(distortion) == 4

        new_K = cv2.fisheye.estimateNewCameraMatrixForUndistortRectify(
            K,
            distortion,
            (width, height),
            np.eye(3),
            balance=0.0,
        )
        # re-center the principal point
        new_K[0, 2] = width / 2.0
        new_K[1, 2] = height / 2.0

        map1, map2 = cv2.fisheye.initUndistortRectifyMap(
            K, distortion, np.eye(3), new_K, (width, height), cv2.CV_32FC1
        )
    else:
        new_K, _ = cv2.getOptimalNewCameraMatrix(
            K, distortion, (width, height), 1, (width, height), True
        )
        map1, map2 = cv2.initUndistortRectifyMap(
            K, distortion, np.eye(3), new_K, (width, height), cv2.CV_32FC1
        )

    undistorted_image = cv2.remap(
        rgb,
        map1,
        map2,
        interpolation=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_REFLECT_101,
    )
    # mask pixels mapped from outside the source image are filled with 255
    undistorted_mask = cv2.remap(
        mask,
        map1,
        map2,
        interpolation=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=255,
    )
    new_K = geometry.opencv_to_colmap_intrinsics(new_K)
    return width, height, new_K, undistorted_image, undistorted_mask


def process_scenes(root, pairsdir, output_dir, target_resolution):
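    """Process every scene listed in <pairsdir>/scene_list.json: undistort and
    rescale the selected DSLR and iPhone images, render a depth map per view
    from the aligned scene mesh, and save per-scene and aggregated metadata.
    """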
    os.makedirs(output_dir, exist_ok=True)

    # near/far clipping planes for depth rendering, in meters
    znear = 0.05
    zfar = 20.0

    listfile = osp.join(pairsdir, "scene_list.json")
    with open(listfile, "r") as f:
        scenes = json.load(f)

    # a single offscreen renderer is reused for every image;
    # its viewport is resized per view below
    renderer = pyrender.OffscreenRenderer(0, 0)
    for scene in tqdm(scenes, position=0, leave=True):
        data_dir = os.path.join(root, "data", scene)
        dir_dslr = os.path.join(data_dir, "dslr")
        dir_iphone = os.path.join(data_dir, "iphone")
        dir_scans = os.path.join(data_dir, "scans")

        assert (
            os.path.isdir(data_dir)
            and os.path.isdir(dir_dslr)
            and os.path.isdir(dir_iphone)
            and os.path.isdir(dir_scans)
        )

        # skip scenes that have already been processed
        output_dir_scene = os.path.join(output_dir, scene)
        scene_metadata_path = osp.join(output_dir_scene, "scene_metadata.npz")
        if osp.isfile(scene_metadata_path):
            continue

        pairs_dir_scene = os.path.join(pairsdir, scene)
        pairs_dir_scene_selected_pairs = os.path.join(
            pairs_dir_scene, "selected_pairs.npz"
        )
        assert osp.isfile(pairs_dir_scene_selected_pairs)
        selected_npz = np.load(pairs_dir_scene_selected_pairs)
        selection, pairs = selected_npz["selection"], selected_npz["pairs"]

        output_dir_scene_rgb = os.path.join(output_dir_scene, "images")
        output_dir_scene_depth = os.path.join(output_dir_scene, "depth")
        os.makedirs(output_dir_scene_rgb, exist_ok=True)
        os.makedirs(output_dir_scene_depth, exist_ok=True)

        ply_path = os.path.join(dir_scans, "mesh_aligned_0.05.ply")

        sfm_dir_dslr = os.path.join(dir_dslr, "colmap")
        rgb_dir_dslr = os.path.join(dir_dslr, "resized_images")
        mask_dir_dslr = os.path.join(dir_dslr, "resized_anon_masks")

        sfm_dir_iphone = os.path.join(dir_iphone, "colmap")
        rgb_dir_iphone = os.path.join(dir_iphone, "rgb")
        mask_dir_iphone = os.path.join(dir_iphone, "rgb_masks")

        # load the aligned scene mesh used for depth rendering
        with open(ply_path, "rb") as f:
            mesh_kwargs = trimesh.exchange.ply.load_ply(f)
        mesh_scene = trimesh.Trimesh(**mesh_kwargs)

        # load the COLMAP reconstructions of both camera types
        img_idx_dslr, img_infos_dslr, points3D_dslr, observations_dslr = load_sfm(
            sfm_dir_dslr, cam_type="dslr"
        )
        dslr_paths = {
            "in_colmap": sfm_dir_dslr,
            "in_rgb": rgb_dir_dslr,
            "in_mask": mask_dir_dslr,
        }

        (
            img_idx_iphone,
            img_infos_iphone,
            points3D_iphone,
            observations_iphone,
        ) = load_sfm(sfm_dir_iphone, cam_type="iphone")
        iphone_paths = {
            "in_colmap": sfm_dir_iphone,
            "in_rgb": rgb_dir_iphone,
            "in_mask": mask_dir_iphone,
        }

        mesh = pyrender.Mesh.from_trimesh(mesh_scene, smooth=False)
        pyrender_scene = pyrender.Scene()
        pyrender_scene.add(mesh)

        # split the selection by camera type, based on the filename prefix
        selection_dslr = [
            imgname + ".JPG" for imgname in selection if imgname.startswith("DSC")
        ]
        selection_iphone = [
            imgname + ".jpg" for imgname in selection if imgname.startswith("frame_")
        ]

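        # process the DSLR and iPhone captures with the same pipeline:
        # undistort, rescale and save each image, then render and save its depth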
        for selection_cam, img_idx, img_infos, paths_data in [
            (selection_dslr, img_idx_dslr, img_infos_dslr, dslr_paths),
            (selection_iphone, img_idx_iphone, img_infos_iphone, iphone_paths),
        ]:
            rgb_dir = paths_data["in_rgb"]
            mask_dir = paths_data["in_mask"]
            for imgname in tqdm(selection_cam, position=1, leave=False):
                imgidx = img_idx[imgname]
                img_infos_idx = img_infos[imgidx]
                rgb = np.array(Image.open(os.path.join(rgb_dir, img_infos_idx["path"])))
                mask = np.array(
                    Image.open(
                        os.path.join(mask_dir, img_infos_idx["path"][:-3] + "png")
                    )
                )

                _, _, K, rgb, mask = undistort_images(
                    img_infos_idx["intrinsics"], rgb, mask
                )

                # rescale the image, mask and intrinsics to the target resolution
                # (rescale_image_depthmap expects OpenCV-convention intrinsics)
                intrinsics = geometry.colmap_to_opencv_intrinsics(K)
                image, mask, intrinsics = rescale_image_depthmap(
                    rgb,
                    mask,
                    intrinsics,
                    (target_resolution, target_resolution * 3.0 / 4),
                )
                W, H = image.size
                intrinsics = geometry.opencv_to_colmap_intrinsics(intrinsics)

                img_infos_idx["intrinsics"] = intrinsics
                rgb_outpath = os.path.join(
                    output_dir_scene_rgb, img_infos_idx["path"][:-3] + "jpg"
                )
                image.save(rgb_outpath)

                depth_outpath = os.path.join(
                    output_dir_scene_depth, img_infos_idx["path"][:-3] + "png"
                )

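                # render the view's depth map from the scene mesh, using its
                # intrinsics and camera-to-world pose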
                renderer.viewport_width, renderer.viewport_height = W, H
                fx, fy, cx, cy = (
                    intrinsics[0, 0],
                    intrinsics[1, 1],
                    intrinsics[0, 2],
                    intrinsics[1, 2],
                )
                camera = pyrender.camera.IntrinsicsCamera(
                    fx, fy, cx, cy, znear=znear, zfar=zfar
                )
                # pyrender expects camera poses in the OpenGL convention
                camera_node = pyrender_scene.add(
                    camera, pose=img_infos_idx["cam_to_world"] @ OPENGL_TO_OPENCV
                )

                depth = renderer.render(
                    pyrender_scene, flags=pyrender.RenderFlags.DEPTH_ONLY
                )
                # remove the camera so the scene only contains the mesh
                pyrender_scene.remove_node(camera_node)

                # store depth in millimeters as a 16-bit png
                depth = (depth * 1000).astype("uint16")

                # invalidate depth wherever the mask is not fully valid (255)
                depth_mask = mask < 255
                depth[depth_mask] = 0
                Image.fromarray(depth).save(depth_outpath)

        # gather intrinsics and camera-to-world poses in selection order
        trajectories = []
        intrinsics = []
        for imgname in selection:
            if imgname.startswith("DSC"):
                imgidx = img_idx_dslr[imgname + ".JPG"]
                img_infos_idx = img_infos_dslr[imgidx]
            elif imgname.startswith("frame_"):
                imgidx = img_idx_iphone[imgname + ".jpg"]
                img_infos_idx = img_infos_iphone[imgidx]
            else:
                raise ValueError("invalid image name")

            intrinsics.append(img_infos_idx["intrinsics"])
            trajectories.append(img_infos_idx["cam_to_world"])

        intrinsics = np.stack(intrinsics, axis=0)
        trajectories = np.stack(trajectories, axis=0)

        np.savez(
            scene_metadata_path,
            trajectories=trajectories,
            intrinsics=intrinsics,
            images=selection,
            pairs=pairs,
        )

        del img_infos
        del pyrender_scene

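    # aggregate all per-scene metadata into a single all_metadata.npz,
    # re-indexing images and pairs into one global index space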
    scene_data = {}
    for scene_subdir in scenes:
        scene_metadata_path = osp.join(output_dir, scene_subdir, "scene_metadata.npz")
        with np.load(scene_metadata_path) as data:
            trajectories = data["trajectories"]
            intrinsics = data["intrinsics"]
            images = data["images"]
            pairs = data["pairs"]
        scene_data[scene_subdir] = {
            "trajectories": trajectories,
            "intrinsics": intrinsics,
            "images": images,
            "pairs": pairs,
        }

    offset = 0
    counts = []
    scenes = []
    sceneids = []
    images = []
    intrinsics = []
    trajectories = []
    pairs = []
    for scene_idx, (scene_subdir, data) in enumerate(scene_data.items()):
        num_imgs = data["images"].shape[0]
        img_pairs = data["pairs"]

        scenes.append(scene_subdir)
        sceneids.extend([scene_idx] * num_imgs)

        images.append(data["images"])
        intrinsics.append(data["intrinsics"])
        trajectories.append(data["trajectories"])

        # shift per-scene image indices into the global index space
        img_pairs[:, 0:2] += offset
        pairs.append(img_pairs)
        counts.append(offset)

        offset += num_imgs

    images = np.concatenate(images, axis=0)
    intrinsics = np.concatenate(intrinsics, axis=0)
    trajectories = np.concatenate(trajectories, axis=0)
    pairs = np.concatenate(pairs, axis=0)
    np.savez(
        osp.join(output_dir, "all_metadata.npz"),
        counts=counts,
        scenes=scenes,
        sceneids=sceneids,
        images=images,
        intrinsics=intrinsics,
        trajectories=trajectories,
        pairs=pairs,
    )
    print("all done")


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    # optionally select the PyOpenGL backend (e.g. egl) before any rendering
    if args.pyopengl_platform.strip():
        os.environ["PYOPENGL_PLATFORM"] = args.pyopengl_platform
    process_scenes(
        args.scannetpp_dir,
        args.precomputed_pairs,
        args.output_dir,
        args.target_resolution,
    )