In [None]:
##### Co3D_Multiview
import rootutils
rootutils.setup_root("/path/to/fast3r/fast3r", indicator=".project-root", pythonpath=True)

import numpy as np

from fast3r.dust3r.datasets.co3d_multiview import Co3d_Multiview
from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name
from fast3r.dust3r.utils.image import rgb
from fast3r.dust3r.viz import SceneViz, auto_cam_size
from IPython.display import display


# Co3D training split: 10 views per sample, 224 resolution.
dataset = Co3d_Multiview(
    split="train",
    num_views=10,
    window_degree_range=360,
    num_samples_per_window=100,
    data_scaling=0.9,
    mask_bg='rand',
    ROOT="/path/to/dust3r_data/co3d_50_seqs_per_category_subset_processed",
    resolution=224,
    aug_crop=16,
)

# Alternative configuration kept for reference (40 views, 512 resolution):
# dataset = Co3d_Multiview(
#     split="train", num_views=40, window_degree_range=360, num_samples_per_window=1, mask_bg='rand', ROOT="/path/to/dust3r_data/co3d_all_seqs_per_category_subset_processed", resolution=512, aug_crop=16,
# )

# Visualise one randomly chosen sample; the trailing `break` stops after the first.
# `views` and `view_idx` are left in the notebook namespace for the cells below.
for idx in np.random.permutation(len(dataset)):
    views = dataset[idx]
    assert len(views) == dataset.num_views
    print([view_name(view) for view in views])
    viz = SceneViz()
    poses = [views[view_idx]["camera_pose"] for view_idx in range(dataset.num_views)]
    cam_size = max(auto_cam_size(poses), 1)
    for view_idx in range(dataset.num_views):
        pts3d = views[view_idx]["pts3d"]
        valid_mask = views[view_idx]["valid_mask"]
        colors = rgb(views[view_idx]["img"])
        viz.add_pointcloud(pts3d, colors, valid_mask)
        # Green-to-red ramp over the views. The previous formula
        # (view_idx * 255, (1 - view_idx) * 255, 0) only stays inside 0..255 for
        # two views; for those two views this ramp yields the same colours.
        ramp = view_idx / max(dataset.num_views - 1, 1)
        viz.add_camera(
            pose_c2w=views[view_idx]["camera_pose"],
            focal=views[view_idx]["camera_intrinsics"][0, 0],
            color=(int(255 * ramp), int(255 * (1 - ramp)), 0),
            image=colors,
            cam_size=cam_size,
        )
    display(viz.show(point_size=100, viewer="notebook"))
    break

In [None]:
# Number of samples exposed by the dataset built in the previous cell.
len(dataset)

In [None]:
# Camera pose of the first view of the sample loaded above (the value the
# visualisation loop passes to add_camera as pose_c2w).
views[0]['camera_pose']

In [None]:
# Image shape of one view; relies on `view_idx` left over from the visualisation
# loop, i.e. the last view that loop visited.
views[view_idx]["img"].shape

In [None]:
import numpy as np
import plotly.graph_objects as go
from scipy.linalg import rq
from fast3r.dust3r.datasets.co3d_multiview import Co3d_Multiview
from fast3r.dust3r.utils.image import rgb
from IPython.display import display

# Load dataset
# Rebinds the notebook-level `dataset` that process_views() in this cell reads.
# Same Co3D settings as the first cell, except that data_scaling is not passed.
dataset = Co3d_Multiview(
    split="train", num_views=10, window_degree_range=360, num_samples_per_window=100, mask_bg='rand',
    ROOT="/path/to/dust3r_data/co3d_50_seqs_per_category_subset_processed", resolution=224, aug_crop=16,
)

# Function to estimate the projection matrix using Direct Linear Transformation (DLT)
def estimate_projection_matrix(world_points, image_points):
    """
    Estimate a 3x4 projection matrix P from 3D-2D correspondences with DLT.

    Each correspondence (X, Y, Z) <-> (u, v) contributes two rows to a
    homogeneous system A p = 0; p is taken as the right singular vector of A
    with the smallest singular value, so P is only defined up to scale and sign.

    Parameters:
    world_points (np.ndarray): (N, 3) array of 3D points.
    image_points (np.ndarray): (N, 2) array of matching (u, v) image points.

    Returns:
    np.ndarray: The (3, 4) projection matrix.

    Raises:
    ValueError: If the two arrays differ in length or hold fewer than 6
        correspondences (P has 11 degrees of freedom, 2 equations per point).
    """
    world_points = np.asarray(world_points, dtype=np.float64).reshape(-1, 3)
    image_points = np.asarray(image_points, dtype=np.float64).reshape(-1, 2)
    num_points = world_points.shape[0]
    if image_points.shape[0] != num_points:
        raise ValueError(
            f"Got {num_points} world points but {image_points.shape[0]} image points."
        )
    if num_points < 6:
        raise ValueError(f"DLT needs at least 6 correspondences, got {num_points}.")

    homogeneous = np.hstack([world_points, np.ones((num_points, 1))])
    zeros = np.zeros_like(homogeneous)
    u = image_points[:, :1]
    v = image_points[:, 1:]

    # Interleave the u- and v-equations so rows 2i and 2i+1 belong to point i.
    A = np.empty((2 * num_points, 12))
    A[0::2] = np.hstack([-homogeneous, zeros, u * homogeneous])
    A[1::2] = np.hstack([zeros, -homogeneous, v * homogeneous])

    # Only Vh is needed. full_matrices=False avoids materialising the
    # (2N x 2N) left singular basis, which is gigabytes for N in the thousands.
    _, _, Vh = np.linalg.svd(A, full_matrices=False)
    return Vh[-1, :].reshape(3, 4)

# Function to decompose the projection matrix into intrinsic and extrinsic matrices
def decompose_projection_matrix(P):
    """
    Decompose a projection matrix P ~ K [R | t] into K, R and t.

    P is only known up to a non-zero scale factor, including its sign. The sign
    is fixed first so that the left 3x3 block has a positive determinant, which
    guarantees that R is a proper rotation (det = +1) rather than a reflection.

    Parameters:
    P (np.ndarray): The 3x4 projection matrix.

    Returns:
    tuple: (K, R, t) where K is upper triangular with a non-negative diagonal
        (not normalised; divide by K[2, 2] for pixel units), R is a 3x3
        orthogonal matrix and t is a length-3 translation vector.
    """
    P = np.asarray(P, dtype=np.float64)
    if np.linalg.det(P[:, :3]) < 0:
        P = -P

    # Decompose the left 3x3 block into K @ R using RQ decomposition
    M = P[:, :3]
    K, R = rq(M)

    # Flip axes so the diagonal of K is positive. np.where (rather than np.sign)
    # keeps the flip matrix invertible when a diagonal entry is exactly zero.
    T = np.diag(np.where(np.diag(K) < 0, -1.0, 1.0))
    K = K @ T
    R = T @ R

    # Extract translation vector: K t = P[:, 3]
    t = np.linalg.solve(K, P[:, 3])

    return K, R, t

# Function to plot the cameras as cones in 3D space based on the intrinsic matrix K
def plot_camera_cones(fig, R, t, K, color='blue', scale=0.1):
    """
    Add one camera to an existing Plotly figure, drawn as a single cone.

    The cone tip is placed at -R.T @ t and its axis is R.T @ [0, 0, -1]
    multiplied by K[0, 0] / K[2, 2].

    Parameters:
    fig (plotly.graph_objects.Figure): Figure that receives the cone trace.
    R (np.ndarray): The 3x3 rotation matrix.
    t (np.ndarray): The translation vector.
    K (np.ndarray): The 3x3 intrinsic matrix (fx is read from K[0, 0]).
    color (str): Color used for the whole cone.
    scale (float): Passed to the trace as `sizeref`.
    """
    # Cone apex
    apex = -R.T @ t

    # Cone axis: the -Z unit vector rotated by R.T, then stretched by the focal length
    axis = R.T @ np.array([0, 0, -1])
    axis = axis * (K[0, 0] / K[2, 2])

    # Both colorscale stops use the same color so the cone is uniformly tinted
    uniform_colorscale = [[0, color], [1, color]]

    cone_trace = go.Cone(
        x=[apex[0]],
        y=[apex[1]],
        z=[apex[2]],
        u=[axis[0]],
        v=[axis[1]],
        w=[axis[2]],
        colorscale=uniform_colorscale,
        showscale=False,
        sizemode="absolute",
        sizeref=scale,
        anchor="tip",
        name="Camera Cone"
    )
    fig.add_trace(cone_trace)

# Function to visualize 3D points with RGB colors and estimated camera poses as cones using Plotly
def plot_3d_scene_with_estimated_poses(points_list, colors_list, estimated_poses):
    """
    Show a Plotly figure with one point cloud per view and one cone per camera.

    Parameters:
    points_list (list): Per-view (N, 3) arrays of 3D points.
    colors_list (list): Per-view colour arrays, reshaped here to (N, 3).
    estimated_poses (list): (R, t, K) tuples forwarded to plot_camera_cones.
    """
    fig = go.Figure()

    # One Scatter3d trace per view, coloured by the view's own pixels
    for cloud, cloud_colors in zip(points_list, colors_list):
        xs, ys, zs = cloud[:, 0], cloud[:, 1], cloud[:, 2]
        marker_style = dict(size=2, color=cloud_colors.reshape(-1, 3), colorscale=None, opacity=0.8)
        fig.add_trace(go.Scatter3d(
            x=xs, y=ys, z=zs, mode='markers',
            marker=marker_style,
            name='3D Points'
        ))

    # One cone per estimated camera
    for R, t, K in estimated_poses:
        plot_camera_cones(fig, R, t, K, color='blue', scale=5)

    # Label the axes, keep the data aspect ratio and remove the outer margins
    scene_settings = dict(
        xaxis_title='X',
        yaxis_title='Y',
        zaxis_title='Z',
        aspectmode='data'
    )
    fig.update_layout(scene=scene_settings, margin=dict(r=0, l=0, b=0, t=0))

    fig.show()

# Processing a single batch of views
def process_views(N=5000):
    """
    Estimate a camera for every view of one random sample with DLT and plot it.

    Reads the notebook-level `dataset`. For each view the valid 3D points are
    paired with their pixel coordinates, a projection matrix is estimated and
    decomposed, and all point clouds plus the estimated cameras are plotted.

    Parameters:
    N (int): Maximum number of correspondences used per view; larger views are
        randomly subsampled without replacement.
    """
    min_correspondences = 6  # P has 11 degrees of freedom, 2 equations per point

    for idx in np.random.permutation(len(dataset)):
        views = dataset[idx]

        # Collect all 3D points, RGB colors, and estimated poses for visualization
        points_list = []
        colors_list = []
        estimated_poses = []

        for view_idx in range(dataset.num_views):
            view = views[view_idx]
            pts3d = view["pts3d"]  # (H, W, 3)
            # Take the image size from the data instead of assuming 224x224, so
            # the function keeps working when the dataset resolution changes.
            height, width = pts3d.shape[:2]
            valid_mask_flat = view["valid_mask"].flatten()

            # Keep only the valid 3D points and their colours
            pts3d = pts3d.reshape(-1, 3)[valid_mask_flat]
            img_rgb = rgb(view["img"]).reshape(-1, 3)[valid_mask_flat]

            # (x, y) pixel coordinates in the same row-major order as the flattened maps
            rows, cols = np.indices((height, width))
            pixel_coords = np.stack((cols.ravel(), rows.ravel()), axis=1)
            valid_pixel_coords = pixel_coords[valid_mask_flat]

            # Sample N points to speed up estimation
            if len(pts3d) > N:
                sample_indices = np.random.choice(len(pts3d), N, replace=False)
                pts3d = pts3d[sample_indices]
                img_rgb = img_rgb[sample_indices]
                valid_pixel_coords = valid_pixel_coords[sample_indices]

            points_list.append(pts3d)
            colors_list.append(img_rgb)

            # Too few correspondences make the DLT system under-determined; the
            # points are still plotted but no camera is drawn for this view.
            if len(pts3d) < min_correspondences:
                print(f"View {view_idx} - only {len(pts3d)} valid points, skipping pose estimation")
                continue

            # Estimate the projection matrix and split it into K, R, t
            P = estimate_projection_matrix(pts3d, valid_pixel_coords)
            K, R, t = decompose_projection_matrix(P)

            # Print the estimated K, R, and t
            print(f"View {view_idx} - Intrinsic matrix (K):\n{K}")
            print(f"View {view_idx} - Rotation matrix (R):\n{R}")
            print(f"View {view_idx} - Translation vector (t):\n{t}\n")

            estimated_poses.append((R, t, K))

        # Plot the 3D scene with estimated camera cones
        plot_3d_scene_with_estimated_poses(points_list, colors_list, estimated_poses)

        break  # Process one sample


# Run the process with N point sampling.
# N=10000 overrides the function default of 5000 correspondences per view.
process_views(N=10000) # You can change N for faster/slower performance


In [None]:
# Using Ground Truth Intrinsic Matrix + cv2.solvePnP

import numpy as np
import cv2 # OpenCV for solvePnP
from fast3r.dust3r.datasets.co3d_multiview import Co3d_Multiview
from fast3r.dust3r.utils.image import rgb
from fast3r.dust3r.viz import SceneViz, auto_cam_size
from IPython.display import display

# Load dataset
# Rebinds the notebook-level `dataset` read by process_views() in this cell
# (Co3D train split, 10 views per sample, 224 resolution).
dataset = Co3d_Multiview(
    split="train", num_views=10, window_degree_range=360, num_samples_per_window=100, mask_bg='rand',
    ROOT="/path/to/dust3r_data/co3d_50_seqs_per_category_subset_processed", resolution=224, aug_crop=16,
)

# Function to convert estimated rotation and translation (R, t) into a camera pose (4x4 matrix)
def Rt_to_pose(R, t):
    """Convert rotation matrix and translation vector to a 4x4 camera pose matrix.

    Parameters:
    R (np.ndarray): The 3x3 rotation matrix.
    t (np.ndarray): The translation, given either as a (3, 1) column (as
        returned by cv2.solvePnP) or as a flat (3,) vector.

    Returns:
    np.ndarray: 4x4 matrix [[R, t], [0, 0, 0, 1]].
    """
    pose = np.eye(4)
    pose[:3, :3] = R
    # reshape(3) accepts both (3, 1) and (3,) and rejects anything else
    pose[:3, 3] = np.asarray(t, dtype=np.float64).reshape(3)
    return pose

# Function to invert a 4x4 pose matrix (world-to-camera to camera-to-world)
def invert_pose(pose):
    """Return the inverse of a rigid 4x4 pose [[R, t], [0, 1]].

    Uses the closed form [[R.T, -R.T @ t], [0, 1]], which is only the true
    inverse when the upper-left 3x3 block is orthogonal.
    """
    rotation_t = pose[:3, :3].T
    inverse = np.eye(4)
    inverse[:3, 3] = -rotation_t @ pose[:3, 3]
    inverse[:3, :3] = rotation_t
    return inverse

# Processing a single batch of views
def process_views(N=5000):
    """
    Recover the camera pose of every view of one random sample with cv2.solvePnP
    (using the dataset's intrinsics) and show the result in SceneViz.

    Reads the notebook-level `dataset`.

    Parameters:
    N (int): Maximum number of 3D-2D correspondences per view; larger views are
        randomly subsampled without replacement.

    Raises:
    ValueError: If a view has fewer than 4 valid points.
    RuntimeError: If solvePnP reports failure for a view.
    """
    for idx in np.random.permutation(len(dataset)):
        views = dataset[idx]
        assert len(views) == dataset.num_views
        print([view_name(view) for view in views])

        # Initialize SceneViz for visualization
        viz = SceneViz()

        poses_c2w = []  # one estimated camera-to-world pose per view

        for view_idx in range(dataset.num_views):
            view = views[view_idx]
            pts3d = view["pts3d"]  # (H, W, 3)
            # Image size comes from the data rather than a hard-coded 224x224.
            height, width = pts3d.shape[:2]
            valid_mask_flat = view["valid_mask"].flatten()

            # Valid 3D points and their (x, y) pixel coordinates, same row-major order
            pts3d = pts3d.reshape(-1, 3)[valid_mask_flat]
            rows, cols = np.indices((height, width))
            image_points = np.stack((cols.ravel(), rows.ravel()), axis=1)[valid_mask_flat]

            # Sample N points to speed up estimation
            if len(pts3d) > N:
                sample_indices = np.random.choice(len(pts3d), N, replace=False)
                pts3d = pts3d[sample_indices]
                image_points = image_points[sample_indices]

            # solvePnP is fed float32 inputs throughout
            pts3d = pts3d.astype(np.float32)
            image_points = image_points.astype(np.float32)
            K = np.array(view["camera_intrinsics"], dtype=np.float32)

            # Check if we have at least 4 points
            if len(pts3d) < 4:
                raise ValueError("Not enough points to run solvePnP. Need at least 4.")

            # Solve for the world-to-camera pose; no distortion coefficients
            success, rvec, tvec = cv2.solvePnP(pts3d, image_points, K, None)
            if not success:
                # rvec/tvec are not meaningful when the solver reports failure
                raise RuntimeError(f"solvePnP failed for view {view_idx}")
            R, _ = cv2.Rodrigues(rvec)  # Convert rotation vector to matrix

            # World-to-camera -> camera-to-world
            pose_w2c = Rt_to_pose(R, tvec)
            poses_c2w.append(invert_pose(pose_w2c))

        # Use auto_cam_size to get the camera size for visualization
        cam_size = max(auto_cam_size(poses_c2w), 1)

        # Add the point clouds and estimated camera poses to the visualization
        for view_idx in range(dataset.num_views):
            view = views[view_idx]
            colors = rgb(view["img"])
            viz.add_pointcloud(view["pts3d"], colors, view["valid_mask"])

            # Green-to-red ramp over the views. The previous formula
            # (view_idx * 255, (1 - view_idx) * 255, 0) left 0..255 after view 1.
            ramp = view_idx / max(dataset.num_views - 1, 1)
            viz.add_camera(
                pose_c2w=poses_c2w[view_idx],
                focal=view["camera_intrinsics"][0, 0],
                color=(int(255 * ramp), int(255 * (1 - ramp)), 0),
                image=colors,
                cam_size=cam_size,
            )

        # Show the visualization
        display(viz.show(point_size=100, viewer="notebook"))

        break  # Process one sample


# Run the process; N=10000 overrides the default of 5000 correspondences per view.
process_views(N=10000)


In [None]:
# Using DLT to estimate the intrinsic matrix

import numpy as np
from scipy.linalg import rq # For RQ decomposition
from fast3r.dust3r.datasets.co3d_multiview import Co3d_Multiview
from fast3r.dust3r.utils.image import rgb
from fast3r.dust3r.viz import SceneViz, auto_cam_size
from IPython.display import display

# Load dataset
# Rebinds the notebook-level `dataset` read by process_views() in this cell
# (Co3D train split, 10 views per sample, 224 resolution).
dataset = Co3d_Multiview(
    split="train", num_views=10, window_degree_range=360, num_samples_per_window=100, mask_bg='rand',
    ROOT="/path/to/dust3r_data/co3d_50_seqs_per_category_subset_processed", resolution=224, aug_crop=16,
)

# Function to estimate the projection matrix using Direct Linear Transformation (DLT)
def estimate_projection_matrix(world_points, image_points):
    """
    Estimate a 3x4 projection matrix P from 3D-2D correspondences with DLT.

    Each correspondence (X, Y, Z) <-> (u, v) contributes two rows to a
    homogeneous system A p = 0; p is taken as the right singular vector of A
    with the smallest singular value, so P is only defined up to scale and sign.

    Parameters:
    world_points (np.ndarray): (N, 3) array of 3D points.
    image_points (np.ndarray): (N, 2) array of matching (u, v) image points.

    Returns:
    np.ndarray: The (3, 4) projection matrix.

    Raises:
    ValueError: If the two arrays differ in length or hold fewer than 6
        correspondences (P has 11 degrees of freedom, 2 equations per point).
    """
    world_points = np.asarray(world_points, dtype=np.float64).reshape(-1, 3)
    image_points = np.asarray(image_points, dtype=np.float64).reshape(-1, 2)
    num_points = world_points.shape[0]
    if image_points.shape[0] != num_points:
        raise ValueError(
            f"Got {num_points} world points but {image_points.shape[0]} image points."
        )
    if num_points < 6:
        raise ValueError(f"DLT needs at least 6 correspondences, got {num_points}.")

    homogeneous = np.hstack([world_points, np.ones((num_points, 1))])
    zeros = np.zeros_like(homogeneous)
    u = image_points[:, :1]
    v = image_points[:, 1:]

    # Interleave the u- and v-equations so rows 2i and 2i+1 belong to point i.
    A = np.empty((2 * num_points, 12))
    A[0::2] = np.hstack([-homogeneous, zeros, u * homogeneous])
    A[1::2] = np.hstack([zeros, -homogeneous, v * homogeneous])

    # Only Vh is needed. full_matrices=False avoids materialising the
    # (2N x 2N) left singular basis, which is gigabytes for N in the thousands.
    _, _, Vh = np.linalg.svd(A, full_matrices=False)
    return Vh[-1, :].reshape(3, 4)

# Function to decompose the projection matrix into intrinsic and extrinsic matrices
def decompose_projection_matrix(P):
    """Decompose the projection matrix P into intrinsic matrix K and extrinsic parameters (R, t).

    P is only known up to a non-zero scale factor, including its sign. The sign
    is fixed first so that the left 3x3 block has a positive determinant, which
    guarantees that R is a proper rotation (det = +1) rather than a reflection.

    Parameters:
    P (np.ndarray): The 3x4 projection matrix.

    Returns:
    tuple: (K, R, t) where K is upper triangular with a non-negative diagonal
        (not normalised; divide by K[2, 2] for pixel units), R is a 3x3
        orthogonal matrix and t is a length-3 translation vector.
    """
    P = np.asarray(P, dtype=np.float64)
    if np.linalg.det(P[:, :3]) < 0:
        P = -P

    # Decompose the left 3x3 block into K @ R using RQ decomposition
    M = P[:, :3]
    K, R = rq(M)

    # Flip axes so the diagonal of K is positive. np.where (rather than np.sign)
    # keeps the flip matrix invertible when a diagonal entry is exactly zero.
    T = np.diag(np.where(np.diag(K) < 0, -1.0, 1.0))
    K = K @ T
    R = T @ R

    # Extract translation vector: K t = P[:, 3]
    t = np.linalg.solve(K, P[:, 3])

    return K, R, t

# Function to convert estimated rotation and translation (R, t) into a camera pose (4x4 matrix)
def Rt_to_pose(R, t):
    """Convert rotation matrix and translation vector to a 4x4 camera pose matrix.

    Parameters:
    R (np.ndarray): The 3x3 rotation matrix.
    t (np.ndarray): The translation, given either as a (3, 1) column or as a
        flat (3,) vector (the form decompose_projection_matrix returns).

    Returns:
    np.ndarray: 4x4 matrix [[R, t], [0, 0, 0, 1]].
    """
    pose = np.eye(4)
    pose[:3, :3] = R
    # reshape(3) accepts both (3, 1) and (3,) and rejects anything else
    pose[:3, 3] = np.asarray(t, dtype=np.float64).reshape(3)
    return pose

# Function to invert a 4x4 pose matrix (world-to-camera to camera-to-world)
def invert_pose(pose):
    """Return the inverse of a rigid 4x4 pose [[R, t], [0, 1]].

    Uses the closed form [[R.T, -R.T @ t], [0, 1]], which is only the true
    inverse when the upper-left 3x3 block is orthogonal.
    """
    rotation_t = pose[:3, :3].T
    inverse = np.eye(4)
    inverse[:3, 3] = -rotation_t @ pose[:3, 3]
    inverse[:3, :3] = rotation_t
    return inverse

# Processing a single batch of views
def process_views(N=5000):
    """
    Estimate intrinsics and pose of every view of one random sample with DLT
    and show the estimated cameras in SceneViz.

    Reads the notebook-level `dataset`.

    Parameters:
    N (int): Maximum number of 3D-2D correspondences per view; larger views are
        randomly subsampled without replacement.

    Raises:
    ValueError: If a view has fewer than 6 valid points.
    """
    for idx in np.random.permutation(len(dataset)):
        views = dataset[idx]
        assert len(views) == dataset.num_views
        print([view_name(view) for view in views])

        # Initialize SceneViz for visualization
        viz = SceneViz()

        poses_c2w = []         # one estimated camera-to-world pose per view
        estimated_focals = []  # one estimated focal length per view

        for view_idx in range(dataset.num_views):
            view = views[view_idx]
            pts3d = view["pts3d"]  # (H, W, 3)
            # Image size comes from the data rather than a hard-coded 224x224.
            height, width = pts3d.shape[:2]
            valid_mask_flat = view["valid_mask"].flatten()

            # Valid 3D points and their (x, y) pixel coordinates, same row-major order
            pts3d = pts3d.reshape(-1, 3)[valid_mask_flat]
            rows, cols = np.indices((height, width))
            image_points = np.stack((cols.ravel(), rows.ravel()), axis=1)[valid_mask_flat]

            # Sample N points to speed up estimation
            if len(pts3d) > N:
                sample_indices = np.random.choice(len(pts3d), N, replace=False)
                pts3d = pts3d[sample_indices]
                image_points = image_points[sample_indices]

            # Convert pts3d and image_points to float32
            pts3d = pts3d.astype(np.float32)
            image_points = image_points.astype(np.float32)

            # P has 11 degrees of freedom and each point gives 2 equations
            if len(pts3d) < 6:
                raise ValueError("Not enough points to run DLT. Need at least 6.")

            # Estimate the projection matrix and split it into K, R, t
            P = estimate_projection_matrix(pts3d, image_points)
            K, R, t = decompose_projection_matrix(P)

            # Print the estimated intrinsics and extrinsics
            print(f"View {view_idx} - Estimated Intrinsic matrix (K):\n{K}")
            print(f"View {view_idx} - Estimated Rotation matrix (R):\n{R}")
            print(f"View {view_idx} - Estimated Translation vector (t):\n{t}\n")

            # World-to-camera -> camera-to-world
            pose_w2c = Rt_to_pose(R, t.reshape(-1, 1))
            poses_c2w.append(invert_pose(pose_w2c))

            # Remember this view's focal length. Reading K in the drawing loop
            # below would reuse the K of the last view for every camera.
            estimated_focals.append(K[0, 0] / K[2, 2])

        # Use auto_cam_size to get the camera size for visualization
        cam_size = max(auto_cam_size(poses_c2w), 1)

        # Add the point clouds and estimated camera poses to the visualization
        for view_idx in range(dataset.num_views):
            view = views[view_idx]
            colors = rgb(view["img"])
            viz.add_pointcloud(view["pts3d"], colors, view["valid_mask"])

            # Green-to-red ramp over the views. The previous formula
            # (view_idx * 255, (1 - view_idx) * 255, 0) left 0..255 after view 1.
            ramp = view_idx / max(dataset.num_views - 1, 1)
            viz.add_camera(
                pose_c2w=poses_c2w[view_idx],
                focal=estimated_focals[view_idx],
                color=(int(255 * ramp), int(255 * (1 - ramp)), 0),
                image=colors,
                cam_size=cam_size,
            )

        # Show the visualization
        display(viz.show(point_size=100, viewer="notebook"))

        break  # Process one sample


# Run the process; N=10000 overrides the default of 5000 correspondences per view.
process_views(N=10000)


In [None]:
# Guess focal length and use cv2.solvePnPRansac to solve for extrinsics

import rootutils
rootutils.setup_root("/path/to/fast3r/fast3r", indicator=".project-root", pythonpath=True)


import numpy as np
import torch
import cv2
from fast3r.dust3r.datasets.co3d_multiview import Co3d_Multiview
from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name
from fast3r.dust3r.utils.image import rgb
from fast3r.dust3r.viz import SceneViz, auto_cam_size
from IPython.display import display
from fast3r.dust3r.cloud_opt.init_im_poses import fast_pnp # Import fast_pnp

# Load dataset
# Only two views per sample here (num_views=2), unlike the 10-view cells above.
dataset = Co3d_Multiview(
    split="train", num_views=2, window_degree_range=360, num_samples_per_window=100, mask_bg='rand',
    ROOT="/path/to/dust3r_data/co3d_50_seqs_per_category_subset_processed", resolution=224, aug_crop=16,
)

# Function to process views and estimate camera poses using fast_pnp
def process_views_with_fast_pnp(niter_PnP=10):
    """
    Estimate focal length and pose of every view of one random sample with
    fast_pnp and show the result in SceneViz.

    Reads the notebook-level `dataset` and `SceneViz`. Views for which fast_pnp
    fails still contribute their point cloud, but no camera is drawn for them.

    Parameters:
    niter_PnP (int): Forwarded to fast_pnp as `niter_PnP`.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for idx in np.random.permutation(len(dataset)):
        views = dataset[idx]
        assert len(views) == dataset.num_views
        print([view_name(view) for view in views])

        # Initialize SceneViz for visualization
        viz = SceneViz()

        # Both lists hold exactly one entry per view (None where PnP failed), so
        # they can be indexed by view_idx when drawing. Skipping failed views
        # without a placeholder would shift every later entry by one.
        estimated_focals = []
        poses_c2w = []

        for view_idx in range(dataset.num_views):
            view = views[view_idx]

            # fast_pnp receives the unflattened point map and mask
            result = fast_pnp(
                torch.tensor(view["pts3d"], device=device),
                None,  # Guess focal length
                torch.tensor(view["valid_mask"], device=device, dtype=torch.bool),
                device,
                pp=None,  # Use default principal point
                niter_PnP=niter_PnP,
            )

            # NOTE(review): fast_pnp is defined outside this notebook. A bare None
            # and a (focal, None) pair are both treated as failure here; confirm
            # which of the two it actually returns.
            if result is None or result[1] is None:
                print(f"Failed to estimate pose for view {view_idx}")
                poses_c2w.append(None)
                estimated_focals.append(None)
                continue

            focal_length, pose_c2w = result
            poses_c2w.append(pose_c2w.cpu().numpy())
            estimated_focals.append(focal_length)
            print(f"View {view_idx} - Estimated Focal Length: {focal_length}")

        # Camera size is derived from the successfully estimated poses only.
        # NOTE(review): assumes auto_cam_size needs at least two poses; with fewer
        # we fall back to 1, which is the lower bound applied anyway.
        solved_poses = [pose for pose in poses_c2w if pose is not None]
        cam_size = max(auto_cam_size(solved_poses), 1) if len(solved_poses) > 1 else 1

        # Add the point clouds and estimated camera poses to the visualization
        for view_idx in range(dataset.num_views):
            view = views[view_idx]
            colors = rgb(view["img"])
            viz.add_pointcloud(view["pts3d"], colors, view["valid_mask"])

            if poses_c2w[view_idx] is None:
                continue  # no pose was recovered for this view

            viz.add_camera(
                pose_c2w=poses_c2w[view_idx],
                focal=estimated_focals[view_idx],
                color=np.random.randint(0, 256, size=3),  # Generate a random RGB color
                image=colors,
                cam_size=cam_size,
            )

        # Show the visualization
        display(viz.show(point_size=100, viewer="notebook"))

        break  # Process one sample


# Run the process using fast_pnp (default niter_PnP=10).
process_views_with_fast_pnp()



In [None]:
%load_ext autoreload
%autoreload 2

import rootutils
rootutils.setup_root("/path/to/fast3r/fast3r", indicator=".project-root", pythonpath=True)

import importlib

import fast3r
import fast3r.dust3r.viz_plotly

# Reload the Plotly visualiser first and bind SceneViz afterwards. Importing the
# name before the reload would keep it pointing at the pre-reload class.
importlib.reload(fast3r.dust3r.viz_plotly)
from fast3r.dust3r.viz_plotly import SceneViz # Import Plotly version for visualization

# Load dataset
dataset = Co3d_Multiview(
    split="train", num_views=2, window_degree_range=360, num_samples_per_window=100, mask_bg='rand',
    ROOT="/path/to/dust3r_data/co3d_50_seqs_per_category_subset_processed", resolution=224, aug_crop=16,
)

# Run the process using fast_pnp.
# process_views_with_fast_pnp resolves `SceneViz` in the notebook namespace when
# it is called, so the rebinding above makes it draw with the Plotly version.
process_views_with_fast_pnp()


In [None]:
import numpy as np
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.utils import shuffle
from PIL import Image

def image2zvals(img, n_colors=4, n_training_pixels=1000, random_seed=42):
    """Perform color quantization on the image using K-means clustering.

    Parameters:
    img (np.ndarray): (rows, cols, d) color image with d >= 3; only the first
        three channels are used. Values above 1 are treated as 0-255 data and
        rescaled to [0, 1].
    n_colors (int): Number of clusters / palette entries.
    n_training_pixels (int): Number of shuffled pixels used to fit K-means.
    random_seed (int): Seed for both the shuffle and K-means.

    Returns:
    tuple: (z_vals, plotly_colorscale). z_vals is a (rows, cols) array holding
        each pixel's cluster index scaled to [0, 1]; plotly_colorscale is a
        list of [position, 'rgb(r, g, b)'] pairs.

    Raises:
    ValueError: If the image is not 3-dimensional or has fewer than 3 channels.
    """
    if img.ndim != 3:
        raise ValueError(f"Your image does not appear to be a color image. Its shape is {img.shape}")

    rows, cols, d = img.shape
    if d < 3:
        raise ValueError(f"A color image should have the shape (m, n, d), d=3 or 4. Your d={d}")

    if img.max() > 1:
        img = np.clip(img / 255.0, 0, 1)

    observations = img[:, :, :3].reshape(rows * cols, 3)
    training_pixels = shuffle(observations, random_state=random_seed)[:n_training_pixels]

    kmeans = KMeans(n_clusters=n_colors, random_state=random_seed).fit(training_pixels)
    codebook = kmeans.cluster_centers_
    indices = kmeans.predict(observations)

    # Normalize to [0, 1]; max() avoids dividing by zero for a one-color palette
    z_vals = indices.astype(float) / max(n_colors - 1, 1)
    z_vals = z_vals.reshape(rows, cols)

    # Generate the colorscale for Plotly. Channels are formatted as plain ints:
    # formatting a tuple of NumPy scalars embeds their repr, which under
    # NumPy >= 2 reads 'np.uint8(12)' and is not a valid color string.
    palette = (codebook * 255).astype(np.uint8)
    color_strings = [f'rgb({int(r)}, {int(g)}, {int(b)})' for r, g, b in palette]
    if n_colors == 1:
        # A colorscale needs a start and an end stop
        plotly_colorscale = [[0.0, color_strings[0]], [1.0, color_strings[0]]]
    else:
        scale = np.linspace(0, 1, n_colors)
        plotly_colorscale = [[float(s), c] for s, c in zip(scale, color_strings)]

    return z_vals, plotly_colorscale

def plot_quantized_heatmap(image):
    """Plot the quantized image as a 2D heatmap to debug the color quantization.

    Parameters:
    image (np.ndarray): Color image passed straight to image2zvals.
    """
    z_vals, pl_colorscale = image2zvals(image)

    fig = go.Figure(data=go.Heatmap(
        z=z_vals,
        colorscale=pl_colorscale,
        showscale=False
    ))

    fig.update_layout(
        title="Quantized Image Heatmap",
        xaxis=dict(visible=False),
        # A heatmap places row 0 at the bottom by default; reversing the y axis
        # puts row 0 at the top so the image is shown upright.
        yaxis=dict(visible=False, autorange="reversed"),
        width=600,
        height=600
    )

    fig.show()

# Example Usage
# Loads one image from disk as an array and shows its quantized heatmap
# (image2zvals defaults: 4 colors).
image_path = '/path/to/beef_jerky/IMG_0050.jpg'
image = np.array(Image.open(image_path))

plot_quantized_heatmap(image)


In [None]:
import numpy as np
from PIL import Image
from sklearn.cluster import KMeans
from sklearn.utils import shuffle
import plotly.graph_objects as go

# Function to create a surface (modified from the blog)
def surface(rows, cols):
    """Return (x, y, z) grids of shape (rows, cols) describing a gently curved test surface.

    x and y both span [-pi, pi]; z = 0.5 * cos(x / 2) + 0.2 * sin(y / 4).
    """
    grid_x, grid_y = np.meshgrid(
        np.linspace(-np.pi, np.pi, cols),
        np.linspace(-np.pi, np.pi, rows),
    )
    height = 0.5 * np.cos(grid_x / 2) + 0.2 * np.sin(grid_y / 4)
    return grid_x, grid_y, height

# Helper function to quantize the image using K-means
def image2zvals(img, n_colors=64, n_training_pixels=10000, rngs=123):
    """Quantize the image to n_colors using KMeans.

    Parameters:
    img (np.ndarray): (rows, cols, channels) image; only the first three
        channels are used. Values above 1 are treated as 0-255 data and
        rescaled to [0, 1].
    n_colors (int): Number of clusters / palette entries.
    n_training_pixels (int): Number of shuffled pixels used to fit K-means.
    rngs (int): Seed for both the shuffle and K-means.

    Returns:
    tuple: (z_vals, plotly_colorscale). z_vals is a (rows, cols) array holding
        each pixel's cluster index scaled to [0, 1]; plotly_colorscale is a
        list of [position, 'rgb(r, g, b)'] pairs.
    """
    rows, cols, _ = img.shape

    # Normalize the image if necessary
    if img.max() > 1:
        img = np.clip(img / 255.0, 0, 1)

    observations = img[:, :, :3].reshape(rows * cols, 3)
    training_pixels = shuffle(observations, random_state=rngs)[:n_training_pixels]

    kmeans = KMeans(n_clusters=n_colors, random_state=rngs).fit(training_pixels)
    codebook = kmeans.cluster_centers_
    indices = kmeans.predict(observations)

    # Normalize to [0, 1]; max() avoids dividing by zero for a one-color palette
    z_vals = indices.astype(float) / max(n_colors - 1, 1)
    z_vals = z_vals.reshape(rows, cols)

    # Generate the Plotly colorscale. Channels are formatted as plain ints:
    # formatting a tuple of NumPy scalars embeds their repr, which under
    # NumPy >= 2 reads 'np.uint8(12)' and is not a valid color string.
    palette = (codebook * 255).astype(np.uint8)
    color_strings = [f'rgb({int(r)}, {int(g)}, {int(b)})' for r, g, b in palette]
    if n_colors == 1:
        # A colorscale needs a start and an end stop
        plotly_colorscale = [[0.0, color_strings[0]], [1.0, color_strings[0]]]
    else:
        scale = np.linspace(0, 1, n_colors)
        plotly_colorscale = [[float(s), c] for s, c in zip(scale, color_strings)]

    return z_vals, plotly_colorscale

# Generate triangles for the mesh
def regular_triangles(rows, cols):
    """Split every cell of a rows x cols vertex grid into two triangles.

    Vertices are numbered row-major (index = col + row * cols). Each grid cell
    contributes [k, k + cols, k + cols + 1] followed by [k, k + cols + 1, k + 1],
    where k is the index of the cell's first vertex.
    """
    faces = []
    first_vertices = (c + r * cols for r in range(rows - 1) for c in range(cols - 1))
    for k in first_vertices:
        next_row = k + cols
        faces.append([k, next_row, next_row + 1])
        faces.append([k, next_row + 1, k + 1])
    return np.array(faces)

# Create mesh data for texture mapping
def mesh_data(img, n_colors=32, n_training_pixels=1000):
    """Build triangle indices and per-triangle color intensities for an image.

    Returns (I, J, K, tri_color_intensity, pl_colorscale): the three vertex-index
    arrays of the triangles, one quantized intensity per triangle, and the
    Plotly colorscale produced by image2zvals.
    """
    # Quantized per-pixel intensities and the matching colorscale
    quantized, pl_colorscale = image2zvals(img, n_colors=n_colors, n_training_pixels=n_training_pixels)

    # Two triangles per pixel cell
    n_rows, n_cols, _ = img.shape
    faces = regular_triangles(n_rows, n_cols)
    I, J, K = faces.T

    # Intensities at the three corners of each triangle. Even-numbered triangles
    # take the value at their second vertex, odd-numbered ones at their third.
    corner_vals = quantized.flatten()[faces]
    tri_color_intensity = [
        corners[2] if tri_idx % 2 else corners[1]
        for tri_idx, corners in enumerate(corner_vals)
    ]

    return I, J, K, tri_color_intensity, pl_colorscale

# Function to downsample the image and create the 3D Mesh3d object for plotting
def create_mesh3d(img, resolution=64, n_colors=256, view_idx=0):
    """Return a Mesh3d trace that textures the test surface with a downsampled image.

    The image is resized to resolution x resolution, quantized to n_colors, and
    mapped cell by cell onto the grid from surface(). view_idx is added to z so
    that several views stack on top of each other, and it also names the trace.
    """
    # Downsample before building the mesh so the vertex count stays small
    small = Image.fromarray(img).resize((resolution, resolution))
    img_downsampled = np.array(small)

    # Vertex grid at the downsampled resolution
    n_rows, n_cols, _ = img_downsampled.shape
    grid_x, grid_y, grid_z = surface(n_rows, n_cols)

    # Triangles, per-triangle intensities and colorscale
    I, J, K, tri_color_intensity, pl_colorscale = mesh_data(img_downsampled, n_colors=n_colors)

    # y is flipped vertically before flattening; z is offset per view
    vertex_x = grid_x.flatten()
    vertex_y = np.flipud(grid_y).flatten()
    vertex_z = grid_z.flatten() + view_idx

    return go.Mesh3d(
        x=vertex_x, y=vertex_y, z=vertex_z,
        i=I, j=J, k=K,
        intensity=tri_color_intensity,
        intensitymode="cell",
        colorscale=pl_colorscale,
        showscale=False,
        name=f"Image {view_idx}"
    )

# Load two images for testing (read from disk as arrays)
image1_path = '/path/to/beef_jerky/IMG_0050.jpg'
image2_path = '/path/to/beef_jerky/IMG_0051.jpg'

image1 = np.array(Image.open(image1_path))
image2 = np.array(Image.open(image2_path))

# Test with one surface using Mesh3d
fig1 = go.Figure()

# Add the first mesh with the first image (256x256 grid, 128 colors)
mesh1 = create_mesh3d(image1, resolution=256, n_colors=128, view_idx=0)
fig1.add_trace(mesh1)

fig1.update_layout(
    title="One Surface Mesh3d Test (Using Surface Mesh and Downsampled Image)",
    scene=dict(aspectmode='data')
)

fig1.show()

# Test with two surfaces using Mesh3d
fig2 = go.Figure()

# Add the first mesh with the first image.
# Note: `mesh1` is rebound here, this time at 128x128 with 256 colors.
mesh1 = create_mesh3d(image1, resolution=128, n_colors=256, view_idx=0)
fig2.add_trace(mesh1)

# Add the second mesh with the second image (64x64 grid); view_idx=1 raises it
# by one unit in z so it sits above the first.
mesh2 = create_mesh3d(image2, resolution=64, n_colors=256, view_idx=1)
fig2.add_trace(mesh2)

fig2.update_layout(
    title="Two Surface Mesh3d Test (Using Surface Mesh and Downsampled Image)",
    scene=dict(aspectmode='data')
)

fig2.show()


In [None]:
import numpy as np
import plotly.graph_objects as go
from PIL import Image

def create_camera_frustum_with_image(pose_c2w, focal, H, W, image=None, scale=0.05, color='blue', resolution=64):
    """Build the Plotly traces for one camera: a wireframe frustum and, optionally, its image.

    Parameters:
    pose_c2w (np.ndarray): 4x4 matrix applied to the camera-space frustum points.
    focal (float): Focal length; the frustum depth is focal * scale.
    H (int): Image height, used only for the W / H aspect ratio.
    W (int): Image width, used only for the W / H aspect ratio.
    image (PIL.Image.Image or None): Image drawn on the far face of the frustum,
        as a grayscale surface. Any PIL mode is accepted.
    scale (float): Multiplier applied to focal to get the frustum depth.
    color (str): Line color of the wireframe.
    resolution (int): The image is resized to resolution x resolution.

    Returns:
    list: [frustum_trace] or [frustum_trace, image_surface_trace].
    """
    # Create frustum points in camera space
    depth = focal * scale
    aspect = W / H  # width over height

    frustum_points = np.array([
        [0, 0, 0],                           # Camera origin
        [-aspect * depth, -depth, depth],    # corner (-x, -y)
        [aspect * depth, -depth, depth],     # corner (+x, -y)
        [aspect * depth, depth, depth],      # corner (+x, +y)
        [-aspect * depth, depth, depth],     # corner (-x, +y)
    ])

    # Transform frustum points to world coordinates
    frustum_points_homogeneous = np.hstack([frustum_points, np.ones((frustum_points.shape[0], 1))])
    frustum_points_world = (pose_c2w @ frustum_points_homogeneous.T).T[:, :3]

    # Frustum lines (edges of the pyramid)
    edges = [
        (0, 1), (0, 2), (0, 3), (0, 4),  # From camera origin to the four corners
        (1, 2), (2, 3), (3, 4), (4, 1)   # Outline of the far face
    ]

    # Combine all edges into one trace; None breaks the line between segments
    x_vals, y_vals, z_vals = [], [], []
    for start, end in edges:
        x_vals += [frustum_points_world[start, 0], frustum_points_world[end, 0], None]
        y_vals += [frustum_points_world[start, 1], frustum_points_world[end, 1], None]
        z_vals += [frustum_points_world[start, 2], frustum_points_world[end, 2], None]

    legend_group = f"frustum_{id(pose_c2w)}"
    frustum_trace = go.Scatter3d(
        x=x_vals,
        y=y_vals,
        z=z_vals,
        mode='lines',
        line=dict(color=color),
        name="Camera Frustum",
        legendgroup=legend_group,
        showlegend=True
    )

    traces = [frustum_trace]
    if image is not None:
        # Convert to RGB first so grayscale/palette images get three channels
        # and an alpha channel does not leak into the channel average below.
        img = np.array(image.convert("RGB").resize((resolution, resolution)))
        H_img, W_img = img.shape[:2]

        # Create mesh grid on the far face of the frustum
        u = np.linspace(0, 1, W_img)
        v = np.linspace(0, 1, H_img)
        uu, vv = np.meshgrid(u, v)

        # Far-face corners in the order (-x,-y), (+x,-y), (+x,+y), (-x,+y)
        img_vertices = frustum_points_world[1:5]
        img_x, img_y, img_z = img_vertices[:, 0], img_vertices[:, 1], img_vertices[:, 2]

        # Bilinear interpolation between the corners; image row 0 (vv = 0) lands
        # on the edge between the two -y corners.
        X = img_x[0] * (1 - uu) * (1 - vv) + img_x[1] * uu * (1 - vv) + img_x[3] * (1 - uu) * vv + img_x[2] * uu * vv
        Y = img_y[0] * (1 - uu) * (1 - vv) + img_y[1] * uu * (1 - vv) + img_y[3] * (1 - uu) * vv + img_y[2] * uu * vv
        Z = img_z[0] * (1 - uu) * (1 - vv) + img_z[1] * uu * (1 - vv) + img_z[3] * (1 - uu) * vv + img_z[2] * uu * vv

        # Compute grayscale intensity (average of RGB channels), normalized to [0, 1]
        grayscale_img = np.mean(img, axis=-1) / 255.0

        traces.append(go.Surface(
            x=X,
            y=Y,
            z=Z,
            surfacecolor=grayscale_img,
            colorscale='gray',
            showscale=False,
            name="Camera Frustum Image",
            legendgroup=legend_group,  # Link with the frustum lines
            showlegend=False           # The legend entry of the lines covers both
        ))

    return traces

def plot_cameras(camera_poses, focals, H, W, images=None, scale=0.05, resolution=64):
    """Show a Plotly figure with one frustum (and optional image) per camera.

    Parameters:
    camera_poses (list): 4x4 pose matrices, one per camera.
    focals (list): Focal lengths, paired with camera_poses by position.
    H (int): Image height forwarded to create_camera_frustum_with_image.
    W (int): Image width forwarded to create_camera_frustum_with_image.
    images (list or None): Optional per-camera images, indexed like camera_poses.
    scale (float): Frustum scale forwarded to create_camera_frustum_with_image.
    resolution (int): Image resolution forwarded to create_camera_frustum_with_image.
    """
    fig = go.Figure()

    # Add camera frustums to the plot
    for i, (pose_c2w, focal) in enumerate(zip(camera_poses, focals)):
        image = images[i] if images is not None else None
        # The red channel steps by 50 per camera and wraps at 256, so it stays a
        # valid 0-255 value however many cameras are drawn.
        red = (50 * i) % 256
        frustum_traces = create_camera_frustum_with_image(
            pose_c2w, focal, H, W, image, scale,
            color=f'rgb({red}, {100}, {150})', resolution=resolution,
        )
        for trace in frustum_traces:
            if trace is not None:
                fig.add_trace(trace)

    # Set 3D aspect ratio and layout
    fig.update_layout(scene=dict(aspectmode='data'),
                      scene_camera=dict(eye=dict(x=1.5, y=1.5, z=1.5)),
                      title="Camera Poses and Frustums with Images")

    fig.show()

# Example usage with camera poses (4x4 matrices) and focals
camera_poses = [
    np.eye(4), # Identity pose for the first camera
    np.array([[1, 0, 0, 0.5], [0, 1, 0, 0.5], [0, 0, 1, 0.5], [0, 0, 0, 1]]) # Second camera: identity rotation, translated by 0.5 on every axis
]
focals = [20, 500] # Focal lengths of the cameras
H, W = 1080, 1920 # Example image dimensions

# Load example images (replace with real images).
# Note: both entries open the same file, so the two cameras show the same picture.
image1 = Image.open('/path/to/beef_jerky/IMG_0050.jpg')
image2 = Image.open('/path/to/beef_jerky/IMG_0050.jpg')

plot_cameras(camera_poses, focals, H, W, images=[image1, image2], scale=0.1, resolution=128)


# ScanNet++

In [None]:
%load_ext autoreload
%autoreload 2
##### ScanNetpp_Multiview

import rootutils
rootutils.setup_root("/path/to/fast3r/fast3r", indicator=".project-root", pythonpath=True)

import itertools
import json
import os.path as osp
from collections import deque

import numpy as np

from fast3r.dust3r.datasets.scannetpp_multiview import ScanNetpp_Multiview

from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name
from fast3r.dust3r.utils.image import rgb
from fast3r.dust3r.viz import SceneViz, auto_cam_size
from IPython.display import display



dataset = ScanNetpp_Multiview(num_views=8, data_scaling=0.5, window_size=10, num_samples_per_window=1, split='train', ordered=True, ROOT="/path/to/dust3r_data/scannetpp_processed", resolution=512, aug_crop=16)

# Visualize one sample: point clouds of all views plus one camera per view.
for idx in np.random.permutation(len(dataset)):
    # views = dataset[idx]
    views = dataset[-1]  # pinned to the last sample; the random `idx` is currently unused
    assert len(views) == dataset.num_views
    print([view_name(view) for view in views])
    viz = SceneViz()
    poses = [view['camera_pose'] for view in views]
    cam_size = max(auto_cam_size(poses), 1)
    for view_idx, view in enumerate(views):
        pts3d = view['pts3d']
        valid_mask = view['valid_mask']
        colors = rgb(view['img'])
        viz.add_pointcloud(pts3d, colors, valid_mask)
        # Blend green (first view) -> red (last view). The previous
        # `view_idx * 255` formula left the 0-255 range for any view beyond the second.
        blend = view_idx / max(len(views) - 1, 1)
        viz.add_camera(pose_c2w=view['camera_pose'],
                       focal=view['camera_intrinsics'][0, 0],
                       color=(int(round(255 * blend)), int(round(255 * (1 - blend))), 0),
                       image=colors,
                       cam_size=cam_size)
    display(viz.show(point_size=100, viewer="notebook"))
    break  # only one sample is visualized



In [None]:
# Rebind `dataset` to the result of `80_000 @ dataset`, then select epoch 0.
# NOTE(review): presumably this resizes the dataset to 80,000 samples per epoch — confirm
# against the dataset's `__rmatmul__` implementation.
# Because the name is reused, re-running this cell wraps the already-wrapped dataset again.
dataset = 80_000 @ dataset
dataset.set_epoch(0)

In [None]:
%%timeit

# Time how long fetching a single sample (index 100) from `dataset` takes.
dataset[100]

In [None]:
# Visualize one fixed sample of the current `dataset`.
views = dataset[1005]
assert len(views) == dataset.num_views
print([view_name(view) for view in views])
viz = SceneViz()
poses = [view['camera_pose'] for view in views]
cam_size = max(auto_cam_size(poses), 1)
for view_idx, view in enumerate(views):
    pts3d = view['pts3d']
    valid_mask = view['valid_mask']
    colors = rgb(view['img'])
    viz.add_pointcloud(pts3d, colors, valid_mask)
    # The camera color is derived from the view index. The previous code used `idx`,
    # a variable left over from another cell's loop, which is unrelated to the view.
    # Blend green (first view) -> red (last view) so every channel stays within 0-255.
    blend = view_idx / max(len(views) - 1, 1)
    viz.add_camera(pose_c2w=view['camera_pose'],
                   focal=view['camera_intrinsics'][0, 0],
                   color=(int(round(255 * blend)), int(round(255 * (1 - blend))), 0),
                   image=colors,
                   cam_size=cam_size)
display(viz.show(point_size=100, viewer="notebook"))

In [None]:
##### MegaDepth

import itertools
import json
import os.path as osp
from collections import deque

import numpy as np

# from dust3r.datasets.megadepth_multiview import MegaDepth_Multiview
from dust3r.datasets.megadepth import MegaDepth

from dust3r.datasets.base.base_stereo_view_dataset import view_name
from dust3r.utils.image import rgb
from dust3r.viz import SceneViz, auto_cam_size
from IPython.display import display



# dataset = MegaDepth_Multiview(split='train', num_views=4, window_size=60, num_samples_per_window=100, ROOT="/path/to/dust3r_data/megadepth_processed", resolution=512, aug_crop=16)
# Build the MegaDepth dataset and render its first sample in a SceneViz.
dataset = MegaDepth(split='train', ROOT="/path/to/dust3r_data/megadepth_processed", resolution=512, aug_crop=16)

views = dataset[0]
assert len(views) == dataset.num_views
print([view_name(view) for view in views])

viz = SceneViz()
# Camera size comes from the poses of all views, with a floor of 1.
poses = [views[view_idx]['camera_pose'] for view_idx in range(dataset.num_views)]
cam_size = max(auto_cam_size(poses), 1)

for view_idx in range(dataset.num_views):
    view = views[view_idx]
    pts3d = view['pts3d']
    valid_mask = view['valid_mask']
    colors = rgb(view['img'])
    viz.add_pointcloud(pts3d, colors, valid_mask)
    camera_kwargs = dict(
        pose_c2w=view['camera_pose'],
        focal=view['camera_intrinsics'][0, 0],
        color=(view_idx*255, (1 - view_idx)*255, 0),
        image=colors,
        cam_size=cam_size,
    )
    viz.add_camera(**camera_kwargs)

scene_view = viz.show(point_size=100, viewer="notebook")
display(scene_view)

In [None]:
##### MegaDepth_Multiview
import rootutils
rootutils.setup_root("/path/to/fast3r/fast3r", indicator=".project-root", pythonpath=True)

import itertools
import json
import os.path as osp
from collections import deque

import numpy as np

from fast3r.dust3r.datasets.megadepth_multiview import MegaDepth_Multiview

from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name
from fast3r.dust3r.utils.image import rgb
from fast3r.dust3r.viz import SceneViz, auto_cam_size
from IPython.display import display



# dataset = MegaDepth_Multiview(split='train', num_views=20, window_size=40, num_samples_per_window=1, ROOT="/path/to/dust3r_data/megadepth_processed", resolution=512, aug_crop=16)
dataset = 100 @ MegaDepth_Multiview(split='val', num_views=12, window_size=24, num_samples_per_window=100, ROOT="/path/to/dust3r_data/megadepth_processed", resolution=(512, 336), seed=777)
dataset.set_epoch(0)
print(dataset)

# Visualize one randomly chosen sample.
for idx in np.random.permutation(len(dataset)):
    views = dataset[idx]
    # views = dataset[-1]
    # assert len(views) == dataset.num_views
    # print([view_name(view) for view in views])
    viz = SceneViz()
    poses = [view['camera_pose'] for view in views]
    cam_size = max(auto_cam_size(poses), 1)
    for view_idx, view in enumerate(views):
        pts3d = view['pts3d']
        valid_mask = view['valid_mask']
        colors = rgb(view['img'])
        viz.add_pointcloud(pts3d, colors, valid_mask)
        # Blend green (first view) -> red (last view). The previous
        # `view_idx * 255` formula left the 0-255 range for any view beyond the second.
        blend = view_idx / max(len(views) - 1, 1)
        viz.add_camera(pose_c2w=view['camera_pose'],
                       # focal=view['camera_intrinsics'][0, 0],
                       color=(int(round(255 * blend)), int(round(255 * (1 - blend))), 0),
                       image=colors,
                       cam_size=cam_size)
    display(viz.show(point_size=100, viewer="notebook"))
    break  # only one sample is visualized

In [None]:
# visualize the rgb
import matplotlib.pyplot as plt
import numpy as np

# Load the images from the views
views = dataset[89]
images = [rgb(view['img']) for view in views]

# Plot the images, one column per view.
# squeeze=False keeps `axes` a 2-D array even when there is a single view,
# so `axes.flat` works regardless of the number of views.
fig, axes = plt.subplots(1, len(images), figsize=(40, 8), squeeze=False)
for ax, image in zip(axes.flat, images):
    ax.imshow(image)
    ax.axis('off')
plt.show()

# Show the valid mask of every view in a second figure with the same layout.
fig, axes = plt.subplots(1, len(views), figsize=(40, 8), squeeze=False)
for ax, view in zip(axes.flat, views):
    ax.imshow(view['valid_mask'])
    ax.axis('off')
plt.show()

In [None]:
# List the keys available on the first view of the most recently loaded sample.
views[0].keys()

In [None]:
# Walk every sample in random order and build (but do not display) a scene for each one.
for idx in np.random.permutation(len(dataset)):
    views = dataset[idx]
    # views = dataset[-1]
    assert len(views) == dataset.num_views
    print([view_name(view) for view in views])
    viz = SceneViz()
    poses = [view['camera_pose'] for view in views]
    cam_size = max(auto_cam_size(poses), 1)
    for view_idx, view in enumerate(views):
        pts3d = view['pts3d']
        valid_mask = view['valid_mask']
        colors = rgb(view['img'])
        viz.add_pointcloud(pts3d, colors, valid_mask)
        # Blend green (first view) -> red (last view). The previous
        # `view_idx * 255` formula left the 0-255 range for any view beyond the second.
        blend = view_idx / max(len(views) - 1, 1)
        viz.add_camera(pose_c2w=view['camera_pose'],
                       focal=view['camera_intrinsics'][0, 0],
                       color=(int(round(255 * blend)), int(round(255 * (1 - blend))), 0),
                       image=colors,
                       cam_size=cam_size)
    # display(viz.show(point_size=100, viewer="notebook"))

# ARKitScenes

In [None]:
# ARKitScenes_Multiview
%load_ext autoreload
%autoreload 2

import rootutils
rootutils.setup_root("/path/to/fast3r/fast3r", indicator=".project-root", pythonpath=True)

import numpy as np

from fast3r.dust3r.datasets.arkitscenes_multiview import ARKitScenes_Multiview

from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name
from fast3r.dust3r.utils.image import rgb
from fast3r.dust3r.viz import SceneViz, auto_cam_size
from IPython.display import display

dataset = ARKitScenes_Multiview(
    split='train', data_scaling=0.5, num_views=20, window_size=30, num_samples_per_window=2, ROOT="/path/to/dust3r_data/arkitscenes_processed", resolution=(512,100), aug_crop=256
)

# Visualize one randomly chosen sample.
for idx in np.random.permutation(len(dataset)):
    views = dataset[idx]
    assert len(views) == dataset.num_views
    print(dataset.num_views)
    print([view_name(view) for view in views])
    viz = SceneViz()
    poses = [view['camera_pose'] for view in views]
    cam_size = max(auto_cam_size(poses), 0.2)
    for view_idx, view in enumerate(views):
        pts3d = view['pts3d']
        valid_mask = view['valid_mask']
        colors = rgb(view['img'])
        viz.add_pointcloud(pts3d, colors, valid_mask)
        # Blend green (first view) -> red (last view). The previous
        # `view_idx * 255` formula left the 0-255 range for any view beyond the second.
        blend = view_idx / max(len(views) - 1, 1)
        viz.add_camera(pose_c2w=view['camera_pose'],
                       # focal=view['camera_intrinsics'][0, 0],
                       color=(int(round(255 * blend)), int(round(255 * (1 - blend))), 0),
                       image=colors,
                       cam_size=cam_size)
    display(viz.show())
    break  # only one sample is visualized

In [None]:
# Visualize one fixed sample of the current `dataset`.
views = dataset[100]
assert len(views) == dataset.num_views
print(dataset.num_views)
print([view_name(view) for view in views])
viz = SceneViz()
poses = [view['camera_pose'] for view in views]
cam_size = max(auto_cam_size(poses), 0.001)
for view_idx, view in enumerate(views):
    pts3d = view['pts3d']
    valid_mask = view['valid_mask']
    colors = rgb(view['img'])
    viz.add_pointcloud(pts3d, colors, valid_mask)
    # Blend green (first view) -> red (last view). The previous
    # `view_idx * 255` formula left the 0-255 range for any view beyond the second.
    blend = view_idx / max(len(views) - 1, 1)
    viz.add_camera(pose_c2w=view['camera_pose'],
                   focal=view['camera_intrinsics'][0, 0],
                   color=(int(round(255 * blend)), int(round(255 * (1 - blend))), 0),
                   image=colors,
                   cam_size=cam_size)
# The trailing top-level `break` was removed: outside a loop it is a SyntaxError
# that prevented the whole cell from running.
display(viz.show())

In [None]:
# visualize the images from views
import matplotlib.pyplot as plt
import numpy as np

# Load the images from the views
images = [rgb(view['img']) for view in views]

# Lay the views out on a 4-column grid with as many rows as needed. A fixed 2x4
# grid raised IndexError for fewer than 8 views and silently dropped any view
# beyond the 8th. With exactly 8 views this is the same 2x4, 16x8 figure as before.
n_cols = 4
n_rows = max(1, -(-len(images) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows), squeeze=False)
for i, ax in enumerate(axes.flat):
    ax.axis('off')
    if i < len(images):  # trailing cells of the last row stay empty
        ax.imshow(images[i])
        ax.set_title(f"View {i}")
plt.show()


# Habitat

In [None]:
# Habitat
%load_ext autoreload
%autoreload 2

import rootutils
rootutils.setup_root("/path/to/fast3r/fast3r", indicator=".project-root", pythonpath=True)

import numpy as np

from fast3r.dust3r.datasets.habitat import Habitat

from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name
from fast3r.dust3r.utils.image import rgb
from fast3r.dust3r.viz import SceneViz, auto_cam_size
from IPython.display import display

dataset = Habitat(1_000, split='train', ROOT="/path/to/dust3r_data/habitat_processed",
                  resolution=224, aug_crop=16)

# Visualize one randomly chosen two-view sample.
for idx in np.random.permutation(len(dataset)):
    views = dataset[idx]
    assert len(views) == 2
    print(view_name(views[0]), view_name(views[1]))
    viz = SceneViz()
    poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1]]
    cam_size = max(auto_cam_size(poses), 0.001)
    for view_idx in [0, 1]:
        pts3d = views[view_idx]['pts3d']
        valid_mask = views[view_idx]['valid_mask']
        colors = rgb(views[view_idx]['img'])
        viz.add_pointcloud(pts3d, colors, valid_mask)
        # Color by view index (view 0 green, view 1 red). The previous code used the
        # dataset index `idx`, giving both cameras the same, out-of-range color.
        viz.add_camera(pose_c2w=views[view_idx]['camera_pose'],
                       focal=views[view_idx]['camera_intrinsics'][0, 0],
                       color=(view_idx * 255, (1 - view_idx) * 255, 0),
                       image=colors,
                       cam_size=cam_size)
    display(viz.show())
    break  # only one sample is visualized

In [None]:
# Habitat_Multiview
%load_ext autoreload
%autoreload 2

import rootutils
rootutils.setup_root("/path/to/fast3r/fast3r", indicator=".project-root", pythonpath=True)

import numpy as np

from fast3r.dust3r.datasets.habitat_multiview import Habitat_Multiview

from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name
from fast3r.dust3r.utils.image import rgb
from fast3r.dust3r.viz import SceneViz, auto_cam_size
from IPython.display import display

dataset = Habitat_Multiview(1_000, data_scaling=0.5, split='train', num_views=12, ROOT="/path/to/dust3r_data/habitat_processed", aug_crop=16, resolution=512)
# dataset = Habitat_Multiview(1_000_000, split='train', num_views=4, ROOT='/path/to/dust3r_data/habitat_processed', aug_crop=16, resolution=(512,384))
# dataset = 100 @ Habitat_Multiview(100000, split='val', num_views=12, ROOT="/path/to/dust3r_data/habitat_processed", resolution=(512,384), seed=777)
dataset.set_epoch(0)
print(len(dataset))

# Visualize one randomly chosen sample.
for idx in np.random.permutation(len(dataset)):
    views = dataset[idx]
    assert len(views) == dataset.num_views
    print(len(views))
    print([view_name(view) for view in views])
    viz = SceneViz()
    # Use the poses of all views for the camera size, as the other multi-view cells do
    # (previously only views 0 and 1 were considered).
    poses = [view['camera_pose'] for view in views]
    cam_size = max(auto_cam_size(poses), 0.2)
    for view_idx, view in enumerate(views):
        pts3d = view['pts3d']
        valid_mask = view['valid_mask']
        colors = rgb(view['img'])
        viz.add_pointcloud(pts3d, colors, valid_mask)
        # Color by view index, blending green (first view) -> red (last view). The previous
        # code used the dataset index `idx`, so every camera got the same out-of-range color.
        blend = view_idx / max(len(views) - 1, 1)
        viz.add_camera(pose_c2w=view['camera_pose'],
                       focal=view['camera_intrinsics'][0, 0],
                       color=(int(round(255 * blend)), int(round(255 * (1 - blend))), 0),
                       image=colors,
                       cam_size=cam_size)
    display(viz.show())
    break  # only one sample is visualized

In [None]:
# Number of samples the current `dataset` reports.
len(dataset)

# BlendedMVS

In [None]:
# BlendedMVS from Spann3r
%load_ext autoreload
%autoreload 2

import rootutils
rootutils.setup_root("/path/to/fast3r/fast3r", indicator=".project-root", pythonpath=True)

import numpy as np

from fast3r.dust3r.datasets.blendedmvs_multiview import BlendedMVS_Multiview
from fast3r.data.components.spann3r_datasets.blendedmvs import BlendMVS

from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name
from fast3r.dust3r.utils.image import rgb
from fast3r.dust3r.viz import SceneViz, auto_cam_size
from IPython.display import display

# dataset = BlendedMVS_Multiview(split='train', ROOT="/path/to/dust3r_data/blendedmvs_processed", resolution=512, num_views=4, window_size=6, num_samples_per_window=10, ordered=True, aug_crop=16)
dataset = BlendMVS(split='train', num_frames=20, num_seq=200, ROOT='/path/to/dust3r_data/datasets_raw/BlendedMVS', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)])

dataset.set_epoch(0)
print(len(dataset))

# Visualize one randomly chosen sample.
for idx in np.random.permutation(len(dataset)):
    # The dataset is indexed with a tuple here; the second element is fixed to 0.
    # NOTE(review): presumably it selects the first entry of the `resolution` list — confirm.
    views = dataset[(idx,0)]
    # assert len(views) == dataset.num_views
    print(len(views))
    print([view_name(view) for view in views])
    viz = SceneViz()
    # Use the poses of all views for the camera size (previously only views 0 and 1).
    poses = [view['camera_pose'] for view in views]
    cam_size = max(auto_cam_size(poses), 0.5)
    for view_idx, view in enumerate(views):
        pts3d = view['pts3d']
        valid_mask = view['valid_mask']
        colors = rgb(view['img'])
        viz.add_pointcloud(pts3d, colors, valid_mask)
        # Color by view index, blending green (first view) -> red (last view). The previous
        # code used the dataset index `idx`, so every camera got the same out-of-range color.
        blend = view_idx / max(len(views) - 1, 1)
        viz.add_camera(pose_c2w=view['camera_pose'],
                       focal=view['camera_intrinsics'][0, 0],
                       color=(int(round(255 * blend)), int(round(255 * (1 - blend))), 0),
                       image=colors,
                       cam_size=cam_size)
    display(viz.show())
    break  # only one sample is visualized

# DTU

In [None]:
# DTU from Spann3r
%load_ext autoreload
%autoreload 2

import rootutils
rootutils.setup_root("/path/to/fast3r/fast3r", indicator=".project-root", pythonpath=True)

import numpy as np

from fast3r.data.components.spann3r_datasets.dtu import DTU

from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name
from fast3r.dust3r.utils.image import rgb
from fast3r.dust3r.viz import SceneViz, auto_cam_size
from IPython.display import display

dataset = DTU(split='test', ROOT='/path/to/dust3r_data/dtu_test_mvsnet_release', resolution=512, num_seq=1, full_video=True, kf_every=5)

dataset.set_epoch(0)
print(len(dataset))

# Visualize one randomly chosen sample.
for idx in np.random.permutation(len(dataset)):
    # The dataset is indexed with a tuple here; the second element is fixed to 0.
    views = dataset[(idx,0)]
    # assert len(views) == dataset.num_views
    print(len(views))
    print([view_name(view) for view in views])
    viz = SceneViz()
    # Use the poses of all views for the camera size (previously only views 0 and 1).
    poses = [view['camera_pose'] for view in views]
    cam_size = max(auto_cam_size(poses), 0.5)
    for view_idx, view in enumerate(views):
        pts3d = view['pts3d']
        valid_mask = view['valid_mask']
        colors = rgb(view['img'])
        viz.add_pointcloud(pts3d, colors, valid_mask)
        # Color by view index, blending green (first view) -> red (last view). The previous
        # code used the dataset index `idx`, so every camera got the same out-of-range color.
        blend = view_idx / max(len(views) - 1, 1)
        viz.add_camera(pose_c2w=view['camera_pose'],
                       focal=view['camera_intrinsics'][0, 0],
                       color=(int(round(255 * blend)), int(round(255 * (1 - blend))), 0),
                       image=colors,
                       cam_size=cam_size)
    display(viz.show())
    break  # only one sample is visualized

In [None]:
# 7-Scenes from Spann3r
%load_ext autoreload
%autoreload 2

import rootutils
rootutils.setup_root("/path/to/fast3r/fast3r", indicator=".project-root", pythonpath=True)

import numpy as np

from fast3r.data.components.spann3r_datasets.seven_scenes import SevenScenes

from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name
from fast3r.dust3r.utils.image import rgb
from fast3r.dust3r.viz import SceneViz, auto_cam_size
from IPython.display import display

dataset = SevenScenes(split='test', ROOT='/path/to/dust3r_data/7_scenes_processed', resolution=512, num_seq=1, full_video=False, tuple_path="/path/to/dust3r_data/7_scenes_processed/")

dataset.set_epoch(0)
print(len(dataset))

# Visualize one randomly chosen sample.
for idx in np.random.permutation(len(dataset)):
    # The dataset is indexed with a tuple here; the second element is fixed to 0.
    views = dataset[(idx,0)]
    # assert len(views) == dataset.num_views
    print(len(views))
    print([view_name(view) for view in views])
    viz = SceneViz()
    # Use the poses of all views for the camera size (previously only views 0 and 1).
    poses = [view['camera_pose'] for view in views]
    cam_size = max(auto_cam_size(poses), 0.5)
    for view_idx, view in enumerate(views):
        pts3d = view['pts3d']
        valid_mask = view['valid_mask']
        colors = rgb(view['img'])
        viz.add_pointcloud(pts3d, colors, valid_mask)
        # Color by view index, blending green (first view) -> red (last view). The previous
        # code used the dataset index `idx`, so every camera got the same out-of-range color.
        blend = view_idx / max(len(views) - 1, 1)
        viz.add_camera(pose_c2w=view['camera_pose'],
                       focal=view['camera_intrinsics'][0, 0],
                       color=(int(round(255 * blend)), int(round(255 * (1 - blend))), 0),
                       image=colors,
                       cam_size=cam_size)
    display(viz.show())
    break  # only one sample is visualized

# ASE

In [None]:
# ASE_Multiview
%load_ext autoreload
%autoreload 2

import rootutils
rootutils.setup_root("../fast3r", indicator=".project-root", pythonpath=True)

import numpy as np

from fast3r.dust3r.datasets.ase_multiview import ASE_Multiview, ASE_Multiview_Simple

from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name
from fast3r.dust3r.utils.image import rgb
from fast3r.dust3r.viz import SceneViz, auto_cam_size
from IPython.display import display

# dataset = ASE_Multiview(
# split='train', data_scaling=0.5, num_views=30, window_size=30, num_samples_per_window=1, ROOT="/home/jianingy/research/fast3r/data/aria", resolution=512, aug_crop=256
# )
dataset = ASE_Multiview_Simple(
    split='train', data_scaling=0.5, num_views=30, ROOT="/home/jianingy/research/fast3r/data/aria", resolution=512, aug_crop=256
)

# Visualize one randomly chosen sample.
for idx in np.random.permutation(len(dataset)):
    views = dataset[idx]
    assert len(views) == dataset.num_views
    print(dataset.num_views)
    print([view_name(view) for view in views])
    viz = SceneViz()
    poses = [view['camera_pose'] for view in views]
    cam_size = max(auto_cam_size(poses), 0.5)
    for view_idx, view in enumerate(views):
        height, width = view["true_shape"]
        pts3d = view['pts3d']
        valid_mask = view['valid_mask']
        colors = rgb(view['img'])
        viz.add_pointcloud(pts3d, colors, valid_mask)
        # Blend green (first view) -> red (last view). The previous
        # `view_idx * 255` formula left the 0-255 range for any view beyond the second.
        blend = view_idx / max(len(views) - 1, 1)
        viz.add_camera(pose_c2w=view['camera_pose'],
                       # focal=view['camera_intrinsics'][0, 0],
                       color=(int(round(255 * blend)), int(round(255 * (1 - blend))), 0),
                       # When width < height the image axes 1 and 2 are swapped before it is
                       # attached to the camera, and the camera is drawn 3x larger.
                       # `* 127.5 + 127.5` maps the image values to 0-255 before the uint8 cast
                       # (assumes `img` is normalized to [-1, 1] — TODO confirm).
                       image=np.uint8((view['img'].swapaxes(1, 2) if width < height else view['img']).permute(1, 2, 0) * 127.5 + 127.5),
                       cam_size=cam_size * 3 if width < height else cam_size)
    display(viz.show())
    break  # only one sample is visualized

In [None]:
# visualize the rgb
import matplotlib.pyplot as plt
import numpy as np

# Load the images from the views
images = [rgb(view['img']) for view in views]

# Plot the images, one column per view.
# squeeze=False keeps `axes` a 2-D array even when there is a single view,
# so `axes.flat` works regardless of the number of views.
fig, axes = plt.subplots(1, len(images), figsize=(40, 8), squeeze=False)
for ax, image in zip(axes.flat, images):
    ax.imshow(image)
    ax.axis('off')
    # ax.set_title(f"View {i}")

plt.show()