Spaces:

DDoggo
/

fast3r

Runtime error

File size: 75,562 Bytes
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "##### Co3D_Multiview\n",
    "import rootutils\n",
    "rootutils.setup_root(\"/path/to/fast3r/fast3r\", indicator=\".project-root\", pythonpath=True)\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "from fast3r.dust3r.datasets.co3d_multiview import Co3d_Multiview\n",
    "from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name\n",
    "from fast3r.dust3r.utils.image import rgb\n",
    "from fast3r.dust3r.viz import SceneViz, auto_cam_size\n",
    "from IPython.display import display\n",
    "\n",
    "\n",
    "dataset = Co3d_Multiview(\n",
    "    split=\"train\", num_views=10, window_degree_range=360, num_samples_per_window=100, data_scaling=0.9, mask_bg='rand', ROOT=\"/path/to/dust3r_data/co3d_50_seqs_per_category_subset_processed\", resolution=224, aug_crop=16,\n",
    ")\n",
    "\n",
    "# dataset = Co3d_Multiview(\n",
    "#     split=\"train\", num_views=40, window_degree_range=360, num_samples_per_window=1, mask_bg='rand', ROOT=\"/path/to/dust3r_data/co3d_all_seqs_per_category_subset_processed\", resolution=512, aug_crop=16,\n",
    "# )\n",
    "\n",
    "for idx in np.random.permutation(len(dataset)):\n",
    "    views = dataset[idx]\n",
    "    assert len(views) == dataset.num_views\n",
    "    print([view_name(view) for view in views])\n",
    "    viz = SceneViz()\n",
    "    poses = [views[view_idx][\"camera_pose\"] for view_idx in range(dataset.num_views)]\n",
    "    cam_size = max(auto_cam_size(poses), 1)\n",
    "    for view_idx in range(dataset.num_views):\n",
    "        pts3d = views[view_idx][\"pts3d\"]\n",
    "        valid_mask = views[view_idx][\"valid_mask\"]\n",
    "        colors = rgb(views[view_idx][\"img\"])\n",
    "        viz.add_pointcloud(pts3d, colors, valid_mask)\n",
    "        viz.add_camera(\n",
    "            pose_c2w=views[view_idx][\"camera_pose\"],\n",
    "            focal=views[view_idx][\"camera_intrinsics\"][0, 0],\n",
    "            color=(view_idx * 255, (1 - view_idx) * 255, 0),\n",
    "            image=colors,\n",
    "            cam_size=cam_size,\n",
    "        )\n",
    "    display(viz.show(point_size=100, viewer=\"notebook\"))\n",
    "    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "views[0]['camera_pose']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "views[view_idx][\"img\"].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import plotly.graph_objects as go\n",
    "from scipy.linalg import rq\n",
    "from fast3r.dust3r.datasets.co3d_multiview import Co3d_Multiview\n",
    "from fast3r.dust3r.utils.image import rgb\n",
    "from IPython.display import display\n",
    "\n",
    "# Load dataset\n",
    "dataset = Co3d_Multiview(\n",
    "    split=\"train\", num_views=10, window_degree_range=360, num_samples_per_window=100, mask_bg='rand', \n",
    "    ROOT=\"/path/to/dust3r_data/co3d_50_seqs_per_category_subset_processed\", resolution=224, aug_crop=16,\n",
    ")\n",
    "\n",
    "# Function to estimate the projection matrix using Direct Linear Transformation (DLT)\n",
    "def estimate_projection_matrix(world_points, image_points):\n",
    "    num_points = world_points.shape[0]\n",
    "    A = []\n",
    "\n",
    "    for i in range(num_points):\n",
    "        X, Y, Z = world_points[i]\n",
    "        u, v = image_points[i]\n",
    "        \n",
    "        A.append([-X, -Y, -Z, -1, 0, 0, 0, 0, u*X, u*Y, u*Z, u])\n",
    "        A.append([0, 0, 0, 0, -X, -Y, -Z, -1, v*X, v*Y, v*Z, v])\n",
    "    \n",
    "    A = np.array(A)\n",
    "    \n",
    "    # Solve using SVD (least squares solution)\n",
    "    U, S, Vh = np.linalg.svd(A)\n",
    "    P = Vh[-1, :].reshape(3, 4)\n",
    "    \n",
    "    return P\n",
    "\n",
    "# Function to decompose the projection matrix into intrinsic and extrinsic matrices\n",
    "def decompose_projection_matrix(P):\n",
    "    # Decompose P into K[R|t] using RQ decomposition\n",
    "    M = P[:, :3]\n",
    "    K, R = rq(M)\n",
    "    \n",
    "    # Normalize K to make sure the diagonal elements are positive\n",
    "    T = np.diag(np.sign(np.diag(K)))\n",
    "    K = K @ T\n",
    "    R = T @ R\n",
    "    \n",
    "    # Extract translation vector\n",
    "    t = np.linalg.inv(K) @ P[:, 3]\n",
    "    \n",
    "    return K, R, t\n",
    "\n",
    "# Function to plot the cameras as cones in 3D space based on the intrinsic matrix K\n",
    "def plot_camera_cones(fig, R, t, K, color='blue', scale=0.1):\n",
    "    \"\"\"\n",
    "    Plot the camera as a cone in 3D space based on the intrinsic matrix K for focal length.\n",
    "    \n",
    "    Parameters:\n",
    "    fig (plotly.graph_objects.Figure): The existing Plotly figure.\n",
    "    R (np.ndarray): The 3x3 rotation matrix.\n",
    "    t (np.ndarray): The 3x1 translation vector.\n",
    "    K (np.ndarray): The 3x3 intrinsic matrix.\n",
    "    color (str): Color of the camera cone.\n",
    "    scale (float): Scale factor for the size of the cone base.\n",
    "    \"\"\"\n",
    "    # The focal length is the element K[0, 0] (assuming fx and fy are equal)\n",
    "    focal_length = K[0, 0] / K[2, 2]\n",
    "\n",
    "    # The camera center (apex of the cone)\n",
    "    camera_center = -R.T @ t\n",
    "\n",
    "    # Define the orientation of the cone based on the inverse of the rotation matrix\n",
    "    direction = R.T @ np.array([0, 0, -1])  # Camera looks along the -Z axis in world space\n",
    "\n",
    "    # Scale the direction by the focal length\n",
    "    direction = direction * focal_length\n",
    "\n",
    "    # Plot the camera cone\n",
    "    fig.add_trace(go.Cone(\n",
    "        x=[camera_center[0]],\n",
    "        y=[camera_center[1]],\n",
    "        z=[camera_center[2]],\n",
    "        u=[direction[0]],\n",
    "        v=[direction[1]],\n",
    "        w=[direction[2]],\n",
    "        colorscale=[[0, color], [1, color]],  # Single color for the cone\n",
    "        showscale=False,\n",
    "        sizemode=\"absolute\",\n",
    "        sizeref=scale,  # The size of the cone base\n",
    "        anchor=\"tip\",  # The tip of the cone is the camera center\n",
    "        name=\"Camera Cone\"\n",
    "    ))\n",
    "\n",
    "# Function to visualize 3D points with RGB colors and estimated camera poses as cones using Plotly\n",
    "def plot_3d_scene_with_estimated_poses(points_list, colors_list, estimated_poses):\n",
    "    fig = go.Figure()\n",
    "\n",
    "    # Plot 3D points with RGB colors\n",
    "    for pts3d, colors in zip(points_list, colors_list):\n",
    "        x, y, z = pts3d[:, 0], pts3d[:, 1], pts3d[:, 2]\n",
    "        colors_rgb = colors.reshape(-1, 3)\n",
    "        fig.add_trace(go.Scatter3d(\n",
    "            x=x, y=y, z=z, mode='markers',\n",
    "            marker=dict(size=2, color=colors_rgb, colorscale=None, opacity=0.8),\n",
    "            name='3D Points'\n",
    "        ))\n",
    "\n",
    "    # Plot estimated camera cones\n",
    "    for idx, (R, t, K) in enumerate(estimated_poses):\n",
    "        plot_camera_cones(fig, R, t, K, color='blue', scale=5)\n",
    "\n",
    "    # Update layout for better visualization\n",
    "    fig.update_layout(\n",
    "        scene=dict(\n",
    "            xaxis_title='X',\n",
    "            yaxis_title='Y',\n",
    "            zaxis_title='Z',\n",
    "            aspectmode='data'\n",
    "        ),\n",
    "        margin=dict(r=0, l=0, b=0, t=0)\n",
    "    )\n",
    "    \n",
    "    fig.show()\n",
    "\n",
    "# Processing a single batch of views\n",
    "def process_views(N=5000):\n",
    "    for idx in np.random.permutation(len(dataset)):\n",
    "        views = dataset[idx]\n",
    "\n",
    "        # Collect all 3D points, RGB colors, and estimated poses for visualization\n",
    "        points_list = []\n",
    "        colors_list = []\n",
    "        estimated_poses = []\n",
    "\n",
    "        for view_idx in range(dataset.num_views):\n",
    "            pts3d = views[view_idx][\"pts3d\"]  # (224, 224, 3)\n",
    "            valid_mask = views[view_idx][\"valid_mask\"]  # Only keep valid points\n",
    "\n",
    "            # Flatten the valid 3D points\n",
    "            pts3d = pts3d.reshape(-1, 3)\n",
    "            valid_mask_flat = valid_mask.flatten()\n",
    "            pts3d = pts3d[valid_mask_flat]\n",
    "\n",
    "            # Flatten the RGB image and apply the valid mask\n",
    "            img_rgb = rgb(views[view_idx][\"img\"]).reshape(-1, 3)\n",
    "            img_rgb = img_rgb[valid_mask_flat]\n",
    "\n",
    "            # Generate x and y coordinates for the image\n",
    "            x_coords = np.tile(np.arange(224), 224)\n",
    "            y_coords = np.repeat(np.arange(224), 224)\n",
    "            pixel_coords = np.stack((x_coords, y_coords), axis=1)\n",
    "            valid_pixel_coords = pixel_coords[valid_mask_flat]\n",
    "\n",
    "            # Sample N points to speed up estimation\n",
    "            if len(pts3d) > N:\n",
    "                sample_indices = np.random.choice(len(pts3d), N, replace=False)\n",
    "                pts3d = pts3d[sample_indices]\n",
    "                img_rgb = img_rgb[sample_indices]\n",
    "                valid_pixel_coords = valid_pixel_coords[sample_indices]\n",
    "\n",
    "            points_list.append(pts3d)\n",
    "            colors_list.append(img_rgb)\n",
    "\n",
    "            image_points = valid_pixel_coords  # Now image_points correspond to pts3d\n",
    "\n",
    "            # Estimate projection matrix for this view\n",
    "            P = estimate_projection_matrix(pts3d, image_points)\n",
    "            \n",
    "            # Decompose the projection matrix into intrinsic and extrinsic matrices\n",
    "            K, R, t = decompose_projection_matrix(P)\n",
    "\n",
    "            # Print the estimated K, R, and t\n",
    "            print(f\"View {view_idx} - Intrinsic matrix (K):\\n{K}\")\n",
    "            print(f\"View {view_idx} - Rotation matrix (R):\\n{R}\")\n",
    "            print(f\"View {view_idx} - Translation vector (t):\\n{t}\\n\")\n",
    "            \n",
    "            # Store the estimated rotation (R), translation (t), and intrinsic matrix (K)\n",
    "            estimated_poses.append((R, t, K))\n",
    "\n",
    "        # Plot the 3D scene with estimated camera cones\n",
    "        plot_3d_scene_with_estimated_poses(points_list, colors_list, estimated_poses)\n",
    "        \n",
    "        break  # Process one sample\n",
    "\n",
    "\n",
    "# Run the process with N point sampling\n",
    "process_views(N=10000)  # You can change N for faster/slower performance\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Using Ground Truth Intrinsic Matrix + cv2.solvePnP\n",
    "\n",
    "import numpy as np\n",
    "import cv2  # OpenCV for solvePnP\n",
    "from fast3r.dust3r.datasets.co3d_multiview import Co3d_Multiview\n",
    "from fast3r.dust3r.utils.image import rgb\n",
    "from fast3r.dust3r.viz import SceneViz, auto_cam_size\n",
    "from IPython.display import display\n",
    "\n",
    "# Load dataset\n",
    "dataset = Co3d_Multiview(\n",
    "    split=\"train\", num_views=10, window_degree_range=360, num_samples_per_window=100, mask_bg='rand', \n",
    "    ROOT=\"/path/to/dust3r_data/co3d_50_seqs_per_category_subset_processed\", resolution=224, aug_crop=16,\n",
    ")\n",
    "\n",
    "# Function to convert estimated rotation and translation (R, t) into a camera pose (4x4 matrix)\n",
    "def Rt_to_pose(R, t):\n",
    "    \"\"\"Convert rotation matrix and translation vector to a 4x4 camera pose matrix.\"\"\"\n",
    "    pose = np.eye(4)\n",
    "    pose[:3, :3] = R\n",
    "    pose[:3, 3] = t[:, 0]  # Convert t from (3, 1) to (3,) shape\n",
    "    return pose\n",
    "\n",
    "# Function to invert a 4x4 pose matrix (world-to-camera to camera-to-world)\n",
    "def invert_pose(pose):\n",
    "    \"\"\"Invert a 4x4 pose matrix.\"\"\"\n",
    "    R_inv = pose[:3, :3].T  # Transpose the rotation part\n",
    "    t_inv = -R_inv @ pose[:3, 3]  # Invert the translation\n",
    "    pose_inv = np.eye(4)\n",
    "    pose_inv[:3, :3] = R_inv\n",
    "    pose_inv[:3, 3] = t_inv\n",
    "    return pose_inv\n",
    "\n",
    "# Processing a single batch of views\n",
    "def process_views(N=5000):\n",
    "    for idx in np.random.permutation(len(dataset)):\n",
    "        views = dataset[idx]\n",
    "        assert len(views) == dataset.num_views\n",
    "        print([view_name(view) for view in views])\n",
    "\n",
    "        # Initialize SceneViz for visualization\n",
    "        viz = SceneViz()\n",
    "        \n",
    "        # Estimate camera poses and set up visualization\n",
    "        points_list = []\n",
    "        colors_list = []\n",
    "        estimated_poses = []\n",
    "        poses_c2w = []  # List for the camera-to-world poses to visualize\n",
    "\n",
    "        for view_idx in range(dataset.num_views):\n",
    "            pts3d = views[view_idx][\"pts3d\"]  # (224, 224, 3)\n",
    "            valid_mask = views[view_idx][\"valid_mask\"]  # Only keep valid points\n",
    "            img_rgb = rgb(views[view_idx][\"img\"])\n",
    "\n",
    "            # Flatten the valid 3D points\n",
    "            pts3d = pts3d.reshape(-1, 3)\n",
    "            valid_mask_flat = valid_mask.flatten()\n",
    "            pts3d = pts3d[valid_mask_flat]\n",
    "\n",
    "            # Flatten the RGB image and apply the valid mask\n",
    "            img_rgb = img_rgb.reshape(-1, 3)\n",
    "            img_rgb = img_rgb[valid_mask_flat]\n",
    "\n",
    "            # Generate x and y coordinates for the image\n",
    "            x_coords = np.tile(np.arange(224), 224)\n",
    "            y_coords = np.repeat(np.arange(224), 224)\n",
    "            pixel_coords = np.stack((x_coords, y_coords), axis=1)\n",
    "            valid_pixel_coords = pixel_coords[valid_mask_flat]\n",
    "\n",
    "            # Sample N points to speed up estimation\n",
    "            if len(pts3d) > N:\n",
    "                sample_indices = np.random.choice(len(pts3d), N, replace=False)\n",
    "                pts3d = pts3d[sample_indices]\n",
    "                img_rgb = img_rgb[sample_indices]\n",
    "                valid_pixel_coords = valid_pixel_coords[sample_indices]\n",
    "\n",
    "            points_list.append(pts3d)\n",
    "            colors_list.append(img_rgb)\n",
    "\n",
    "            image_points = valid_pixel_coords  # Now image_points correspond to pts3d\n",
    "\n",
    "            # Convert pts3d and image_points to float32\n",
    "            pts3d = pts3d.astype(np.float32)\n",
    "            image_points = image_points.astype(np.float32)\n",
    "\n",
    "            # Get intrinsic matrix from the dataset and ensure it's float32\n",
    "            K = np.array(views[view_idx][\"camera_intrinsics\"], dtype=np.float32)\n",
    "\n",
    "            # Check if we have at least 4 points\n",
    "            if len(pts3d) < 4 or len(image_points) < 4:\n",
    "                raise ValueError(\"Not enough points to run solvePnP. Need at least 4.\")\n",
    "\n",
    "            # Solve for the camera pose (R, t) using OpenCV's solvePnP\n",
    "            success, rvec, tvec = cv2.solvePnP(pts3d, image_points, K, None)\n",
    "            R, _ = cv2.Rodrigues(rvec)  # Convert rotation vector to matrix\n",
    "\n",
    "            # Convert (R, t) to world-to-camera pose matrix (4x4)\n",
    "            pose_w2c = Rt_to_pose(R, tvec)\n",
    "\n",
    "            # Invert the pose to get camera-to-world pose\n",
    "            pose_c2w = invert_pose(pose_w2c)\n",
    "            poses_c2w.append(pose_c2w)\n",
    "\n",
    "        # Use auto_cam_size to get the camera size for visualization\n",
    "        cam_size = max(auto_cam_size(poses_c2w), 1)\n",
    "\n",
    "        # Add the point clouds and estimated camera poses to the visualization\n",
    "        for view_idx in range(dataset.num_views):\n",
    "            pts3d = views[view_idx][\"pts3d\"]\n",
    "            valid_mask = views[view_idx][\"valid_mask\"]\n",
    "            colors = rgb(views[view_idx][\"img\"])\n",
    "\n",
    "            # Add the pointcloud to the visualization\n",
    "            viz.add_pointcloud(pts3d, colors, valid_mask)\n",
    "\n",
    "            # Add the estimated camera pose (camera-to-world matrix)\n",
    "            viz.add_camera(\n",
    "                pose_c2w=poses_c2w[view_idx],  # Use the inverted camera-to-world pose\n",
    "                focal=views[view_idx][\"camera_intrinsics\"][0, 0],\n",
    "                color=(view_idx * 255, (1 - view_idx) * 255, 0),\n",
    "                image=colors,\n",
    "                cam_size=cam_size,\n",
    "            )\n",
    "\n",
    "        # Show the visualization\n",
    "        display(viz.show(point_size=100, viewer=\"notebook\"))\n",
    "\n",
    "        break  # Process one sample\n",
    "\n",
    "\n",
    "# Run the process\n",
    "process_views(N=10000)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Using DLT to extimate Intrinsic Matrix\n",
    "\n",
    "import numpy as np\n",
    "from scipy.linalg import rq  # For RQ decomposition\n",
    "from fast3r.dust3r.datasets.co3d_multiview import Co3d_Multiview\n",
    "from fast3r.dust3r.utils.image import rgb\n",
    "from fast3r.dust3r.viz import SceneViz, auto_cam_size\n",
    "from IPython.display import display\n",
    "\n",
    "# Load dataset\n",
    "dataset = Co3d_Multiview(\n",
    "    split=\"train\", num_views=10, window_degree_range=360, num_samples_per_window=100, mask_bg='rand', \n",
    "    ROOT=\"/path/to/dust3r_data/co3d_50_seqs_per_category_subset_processed\", resolution=224, aug_crop=16,\n",
    ")\n",
    "\n",
    "# Function to estimate the projection matrix using Direct Linear Transformation (DLT)\n",
    "def estimate_projection_matrix(world_points, image_points):\n",
    "    num_points = world_points.shape[0]\n",
    "    A = []\n",
    "\n",
    "    for i in range(num_points):\n",
    "        X, Y, Z = world_points[i]\n",
    "        u, v = image_points[i]\n",
    "        \n",
    "        A.append([-X, -Y, -Z, -1, 0, 0, 0, 0, u*X, u*Y, u*Z, u])\n",
    "        A.append([0, 0, 0, 0, -X, -Y, -Z, -1, v*X, v*Y, v*Z, v])\n",
    "    \n",
    "    A = np.array(A)\n",
    "    \n",
    "    # Solve using SVD (least squares solution)\n",
    "    U, S, Vh = np.linalg.svd(A)\n",
    "    P = Vh[-1, :].reshape(3, 4)\n",
    "    \n",
    "    return P\n",
    "\n",
    "# Function to decompose the projection matrix into intrinsic and extrinsic matrices\n",
    "def decompose_projection_matrix(P):\n",
    "    \"\"\"Decompose the projection matrix P into intrinsic matrix K and extrinsic parameters (R, t).\"\"\"\n",
    "    # Decompose P into K[R|t] using RQ decomposition\n",
    "    M = P[:, :3]\n",
    "    K, R = rq(M)\n",
    "    \n",
    "    # Normalize K to make sure the diagonal elements are positive\n",
    "    T = np.diag(np.sign(np.diag(K)))\n",
    "    K = K @ T\n",
    "    R = T @ R\n",
    "    \n",
    "    # Extract translation vector\n",
    "    t = np.linalg.inv(K) @ P[:, 3]\n",
    "    \n",
    "    return K, R, t\n",
    "\n",
    "# Function to convert estimated rotation and translation (R, t) into a camera pose (4x4 matrix)\n",
    "def Rt_to_pose(R, t):\n",
    "    \"\"\"Convert rotation matrix and translation vector to a 4x4 camera pose matrix.\"\"\"\n",
    "    pose = np.eye(4)\n",
    "    pose[:3, :3] = R\n",
    "    pose[:3, 3] = t[:, 0]  # Convert t from (3, 1) to (3,) shape\n",
    "    return pose\n",
    "\n",
    "# Function to invert a 4x4 pose matrix (world-to-camera to camera-to-world)\n",
    "def invert_pose(pose):\n",
    "    \"\"\"Invert a 4x4 pose matrix.\"\"\"\n",
    "    R_inv = pose[:3, :3].T  # Transpose the rotation part\n",
    "    t_inv = -R_inv @ pose[:3, 3]  # Invert the translation\n",
    "    pose_inv = np.eye(4)\n",
    "    pose_inv[:3, :3] = R_inv\n",
    "    pose_inv[:3, 3] = t_inv\n",
    "    return pose_inv\n",
    "\n",
    "# Processing a single batch of views\n",
    "def process_views(N=5000):\n",
    "    for idx in np.random.permutation(len(dataset)):\n",
    "        views = dataset[idx]\n",
    "        assert len(views) == dataset.num_views\n",
    "        print([view_name(view) for view in views])\n",
    "\n",
    "        # Initialize SceneViz for visualization\n",
    "        viz = SceneViz()\n",
    "        \n",
    "        # Estimate camera poses and intrinsics, and set up visualization\n",
    "        points_list = []\n",
    "        colors_list = []\n",
    "        estimated_poses = []\n",
    "        poses_c2w = []  # List for the camera-to-world poses to visualize\n",
    "\n",
    "        for view_idx in range(dataset.num_views):\n",
    "            pts3d = views[view_idx][\"pts3d\"]  # (224, 224, 3)\n",
    "            valid_mask = views[view_idx][\"valid_mask\"]  # Only keep valid points\n",
    "            img_rgb = rgb(views[view_idx][\"img\"])\n",
    "\n",
    "            # Flatten the valid 3D points\n",
    "            pts3d = pts3d.reshape(-1, 3)\n",
    "            valid_mask_flat = valid_mask.flatten()\n",
    "            pts3d = pts3d[valid_mask_flat]\n",
    "\n",
    "            # Flatten the RGB image and apply the valid mask\n",
    "            img_rgb = img_rgb.reshape(-1, 3)\n",
    "            img_rgb = img_rgb[valid_mask_flat]\n",
    "\n",
    "            # Generate x and y coordinates for the image\n",
    "            x_coords = np.tile(np.arange(224), 224)\n",
    "            y_coords = np.repeat(np.arange(224), 224)\n",
    "            pixel_coords = np.stack((x_coords, y_coords), axis=1)\n",
    "            valid_pixel_coords = pixel_coords[valid_mask_flat]\n",
    "\n",
    "            # Sample N points to speed up estimation\n",
    "            if len(pts3d) > N:\n",
    "                sample_indices = np.random.choice(len(pts3d), N, replace=False)\n",
    "                pts3d = pts3d[sample_indices]\n",
    "                img_rgb = img_rgb[sample_indices]\n",
    "                valid_pixel_coords = valid_pixel_coords[sample_indices]\n",
    "\n",
    "            points_list.append(pts3d)\n",
    "            colors_list.append(img_rgb)\n",
    "\n",
    "            image_points = valid_pixel_coords  # Now image_points correspond to pts3d\n",
    "\n",
    "            # Convert pts3d and image_points to float32\n",
    "            pts3d = pts3d.astype(np.float32)\n",
    "            image_points = image_points.astype(np.float32)\n",
    "\n",
    "            # Estimate the projection matrix using DLT\n",
    "            P = estimate_projection_matrix(pts3d, image_points)\n",
    "\n",
    "            # Decompose the projection matrix into intrinsics and extrinsics\n",
    "            K, R, t = decompose_projection_matrix(P)\n",
    "\n",
    "            # Print the estimated intrinsics and extrinsics\n",
    "            print(f\"View {view_idx} - Estimated Intrinsic matrix (K):\\n{K}\")\n",
    "            print(f\"View {view_idx} - Estimated Rotation matrix (R):\\n{R}\")\n",
    "            print(f\"View {view_idx} - Estimated Translation vector (t):\\n{t}\\n\")\n",
    "\n",
    "            # Convert (R, t) to world-to-camera pose matrix (4x4)\n",
    "            pose_w2c = Rt_to_pose(R, t.reshape(-1, 1))\n",
    "\n",
    "            # Invert the pose to get camera-to-world pose\n",
    "            pose_c2w = invert_pose(pose_w2c)\n",
    "            poses_c2w.append(pose_c2w)\n",
    "\n",
    "        # Use auto_cam_size to get the camera size for visualization\n",
    "        cam_size = max(auto_cam_size(poses_c2w), 1)\n",
    "\n",
    "        # Add the point clouds and estimated camera poses to the visualization\n",
    "        for view_idx in range(dataset.num_views):\n",
    "            pts3d = views[view_idx][\"pts3d\"]\n",
    "            valid_mask = views[view_idx][\"valid_mask\"]\n",
    "            colors = rgb(views[view_idx][\"img\"])\n",
    "\n",
    "            # Add the pointcloud to the visualization\n",
    "            viz.add_pointcloud(pts3d, colors, valid_mask)\n",
    "\n",
    "            # Add the estimated camera pose (camera-to-world matrix)\n",
    "            viz.add_camera(\n",
    "                pose_c2w=poses_c2w[view_idx],  # Use the inverted camera-to-world pose\n",
    "                focal=K[0, 0] / K[2, 2],  # Use the estimated focal length from K\n",
    "                # focal=None,\n",
    "                color=(view_idx * 255, (1 - view_idx) * 255, 0),\n",
    "                image=colors,\n",
    "                cam_size=cam_size,\n",
    "            )\n",
    "\n",
    "        # Show the visualization\n",
    "        display(viz.show(point_size=100, viewer=\"notebook\"))\n",
    "\n",
    "        break  # Process one sample\n",
    "\n",
    "\n",
    "# Run the process\n",
    "process_views(N=10000)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Guess focal length and use cv2.solvePnPRansac to solve for extrinsics\n",
    "\n",
    "import rootutils\n",
    "rootutils.setup_root(\"/path/to/fast3r/fast3r\", indicator=\".project-root\", pythonpath=True)\n",
    "\n",
    "\n",
    "import numpy as np\n",
    "import torch\n",
    "import cv2\n",
    "from fast3r.dust3r.datasets.co3d_multiview import Co3d_Multiview\n",
    "from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name\n",
    "from fast3r.dust3r.utils.image import rgb\n",
    "from fast3r.dust3r.viz import SceneViz, auto_cam_size\n",
    "from IPython.display import display\n",
    "from fast3r.dust3r.cloud_opt.init_im_poses import fast_pnp  # Import fast_pnp\n",
    "\n",
    "# Load dataset\n",
    "dataset = Co3d_Multiview(\n",
    "    split=\"train\", num_views=2, window_degree_range=360, num_samples_per_window=100, mask_bg='rand', \n",
    "    ROOT=\"/path/to/dust3r_data/co3d_50_seqs_per_category_subset_processed\", resolution=224, aug_crop=16,\n",
    ")\n",
    "\n",
    "# Function to process views and estimate camera poses using fast_pnp\n",
    "def process_views_with_fast_pnp(niter_PnP=10):\n",
    "    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
    "    \n",
    "    for idx in np.random.permutation(len(dataset)):\n",
    "        views = dataset[idx]\n",
    "        assert len(views) == dataset.num_views\n",
    "        print([view_name(view) for view in views])\n",
    "\n",
    "        # Initialize SceneViz for visualization\n",
    "        viz = SceneViz()\n",
    "        \n",
    "        # Estimate camera poses and focal lengths, and set up visualization\n",
    "        points_list = []\n",
    "        colors_list = []\n",
    "        estimated_poses = []\n",
    "        estimated_focals = []  # List for the guessed focal lengths\n",
    "        poses_c2w = []  # List for the camera-to-world poses to visualize\n",
    "\n",
    "        for view_idx in range(dataset.num_views):\n",
    "            pts3d = views[view_idx][\"pts3d\"]  # (224, 224, 3) shape\n",
    "            valid_mask = views[view_idx][\"valid_mask\"]  # (224, 224) mask\n",
    "            img_rgb = rgb(views[view_idx][\"img\"])\n",
    "\n",
    "            # Do not flatten pts3d or valid_mask here for fast_pnp\n",
    "            points_list.append(pts3d)\n",
    "            colors_list.append(img_rgb)\n",
    "\n",
    "            # Call fast_pnp with unflattened pts3d and mask\n",
    "            focal_length, pose_c2w = fast_pnp(\n",
    "                torch.tensor(pts3d, device=device),  # Pass original unmasked pts3d\n",
    "                None,  # Guess focal length\n",
    "                torch.tensor(valid_mask, device=device, dtype=torch.bool),  # Valid mask (unflattened)\n",
    "                device,\n",
    "                pp=None,  # Use default principal point (center of image)\n",
    "                niter_PnP=niter_PnP\n",
    "            )\n",
    "\n",
    "            if pose_c2w is None:\n",
    "                print(f\"Failed to estimate pose for view {view_idx}\")\n",
    "                continue\n",
    "\n",
    "            # Store the estimated camera-to-world pose and focal length\n",
    "            poses_c2w.append(pose_c2w.cpu().numpy())\n",
    "            estimated_focals.append(focal_length)\n",
    "            print(f\"View {view_idx} - Estimated Focal Length: {focal_length}\")\n",
    "\n",
    "        # Use auto_cam_size to get the camera size for visualization\n",
    "        cam_size = max(auto_cam_size(poses_c2w), 1)\n",
    "\n",
    "        # Add the point clouds and estimated camera poses to the visualization\n",
    "        for view_idx in range(dataset.num_views):\n",
    "            pts3d = views[view_idx][\"pts3d\"]\n",
    "            valid_mask = views[view_idx][\"valid_mask\"]\n",
    "            colors = rgb(views[view_idx][\"img\"])\n",
    "\n",
    "            # Add the pointcloud to the visualization\n",
    "            viz.add_pointcloud(pts3d, colors, valid_mask)\n",
    "\n",
    "            # Add the estimated camera pose (camera-to-world matrix) and focal length\n",
    "            viz.add_camera(\n",
    "                pose_c2w=poses_c2w[view_idx],  # Use the estimated camera-to-world pose\n",
    "                focal=estimated_focals[view_idx],  # Use the estimated focal length for each view\n",
    "                color=np.random.randint(0, 256, size=3),  # Generate a random RGB color\n",
    "                image=colors,\n",
    "                cam_size=cam_size,\n",
    "            )\n",
    "\n",
    "        # Show the visualization\n",
    "        display(viz.show(point_size=100, viewer=\"notebook\"))\n",
    "\n",
    "        break  # Process one sample\n",
    "\n",
    "\n",
    "# Run the process using fast_pnp\n",
    "process_views_with_fast_pnp()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import rootutils\n",
    "rootutils.setup_root(\"/path/to/fast3r/fast3r\", indicator=\".project-root\", pythonpath=True)\n",
    "\n",
    "import fast3r\n",
    "from fast3r.dust3r.viz_plotly import SceneViz  # Import Plotly version for visualization\n",
    "\n",
    "import importlib\n",
    "importlib.reload(fast3r.dust3r.viz_plotly)\n",
    "\n",
    "# Load dataset\n",
    "dataset = Co3d_Multiview(\n",
    "    split=\"train\", num_views=2, window_degree_range=360, num_samples_per_window=100, mask_bg='rand', \n",
    "    ROOT=\"/path/to/dust3r_data/co3d_50_seqs_per_category_subset_processed\", resolution=224, aug_crop=16,\n",
    ")\n",
    "\n",
    "# Run the process using fast_pnp\n",
    "process_views_with_fast_pnp()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import plotly.graph_objects as go\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.utils import shuffle\n",
    "from PIL import Image\n",
    "\n",
    "def image2zvals(img, n_colors=4, n_training_pixels=1000, random_seed=42):\n",
    "    \"\"\"Perform color quantization on the image using K-means clustering.\"\"\"\n",
    "    if img.ndim != 3:\n",
    "        raise ValueError(f\"Your image does not appear to be a color image. Its shape is {img.shape}\")\n",
    "    \n",
    "    rows, cols, d = img.shape\n",
    "    if d < 3:\n",
    "        raise ValueError(f\"A color image should have the shape (m, n, d), d=3 or 4. Your d={d}\")\n",
    "    \n",
    "    if img.max() > 1:\n",
    "        img = np.clip(img / 255.0, 0, 1)\n",
    "\n",
    "    observations = img[:, :, :3].reshape(rows * cols, 3)\n",
    "    training_pixels = shuffle(observations, random_state=random_seed)[:n_training_pixels]\n",
    "    \n",
    "    kmeans = KMeans(n_clusters=n_colors, random_state=random_seed).fit(training_pixels)\n",
    "    codebook = kmeans.cluster_centers_\n",
    "    indices = kmeans.predict(observations)\n",
    "    \n",
    "    z_vals = indices.astype(float) / (n_colors - 1)  # Normalize to [0, 1]\n",
    "    z_vals = z_vals.reshape(rows, cols)\n",
    "\n",
    "    # Generate the colorscale for Plotly\n",
    "    scale = np.linspace(0, 1, n_colors)\n",
    "    colors = (codebook * 255).astype(np.uint8)\n",
    "    plotly_colorscale = [[s, f'rgb{tuple(c)}'] for s, c in zip(scale, colors)]\n",
    "    \n",
    "    return z_vals, plotly_colorscale\n",
    "\n",
    "def plot_quantized_heatmap(image):\n",
    "    \"\"\"Plot the quantized image as a 2D heatmap to debug the color quantization.\"\"\"\n",
    "    z_vals, pl_colorscale = image2zvals(image)\n",
    "\n",
    "    fig = go.Figure(data=go.Heatmap(\n",
    "        z=z_vals, \n",
    "        colorscale=pl_colorscale,\n",
    "        showscale=False\n",
    "    ))\n",
    "\n",
    "    fig.update_layout(\n",
    "        title=\"Quantized Image Heatmap\",\n",
    "        xaxis=dict(visible=False),\n",
    "        yaxis=dict(visible=False),\n",
    "        width=600,\n",
    "        height=600\n",
    "    )\n",
    "    \n",
    "    fig.show()\n",
    "\n",
    "# Example Usage\n",
    "image_path = '/path/to/beef_jerky/IMG_0050.jpg'\n",
    "image = np.array(Image.open(image_path))\n",
    "\n",
    "plot_quantized_heatmap(image)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from PIL import Image\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.utils import shuffle\n",
    "import plotly.graph_objects as go\n",
    "\n",
    "# Function to create a surface (modified from the blog)\n",
    "def surface(rows, cols):\n",
    "    \"\"\"Generate a surface with a sine and cosine wave for testing.\"\"\"\n",
    "    x = np.linspace(-np.pi, np.pi, cols)\n",
    "    y = np.linspace(-np.pi, np.pi, rows)\n",
    "    x, y = np.meshgrid(x, y)\n",
    "    z = 0.5 * np.cos(x / 2) + 0.2 * np.sin(y / 4)\n",
    "    return x, y, z\n",
    "\n",
    "# Helper function to quantize the image using K-means\n",
    "def image2zvals(img, n_colors=64, n_training_pixels=10000, rngs=123):\n",
    "    \"\"\"Quantize the image to n_colors using KMeans.\"\"\"\n",
    "    rows, cols, _ = img.shape\n",
    "\n",
    "    # Normalize the image if necessary\n",
    "    if img.max() > 1:\n",
    "        img = np.clip(img / 255.0, 0, 1)\n",
    "\n",
    "    observations = img[:, :, :3].reshape(rows * cols, 3)\n",
    "    training_pixels = shuffle(observations, random_state=rngs)[:n_training_pixels]\n",
    "\n",
    "    kmeans = KMeans(n_clusters=n_colors, random_state=rngs).fit(training_pixels)\n",
    "    codebook = kmeans.cluster_centers_\n",
    "    indices = kmeans.predict(observations)\n",
    "\n",
    "    z_vals = indices.astype(float) / (n_colors - 1)  # Normalize to [0, 1]\n",
    "    z_vals = z_vals.reshape(rows, cols)\n",
    "\n",
    "    # Generate the Plotly colorscale\n",
    "    scale = np.linspace(0, 1, n_colors)\n",
    "    colors = (codebook * 255).astype(np.uint8)\n",
    "    plotly_colorscale = [[s, f'rgb{tuple(c)}'] for s, c in zip(scale, colors)]\n",
    "\n",
    "    return z_vals, plotly_colorscale\n",
    "\n",
    "# Generate triangles for the mesh\n",
    "def regular_triangles(rows, cols):\n",
    "    \"\"\"Generate regular triangles for a mesh.\"\"\"\n",
    "    triangles = []\n",
    "    for i in range(rows - 1):\n",
    "        for j in range(cols - 1):\n",
    "            k = j + i * cols\n",
    "            triangles.extend([[k, k + cols, k + 1 + cols], [k, k + 1 + cols, k + 1]])\n",
    "    return np.array(triangles)\n",
    "\n",
    "# Create mesh data for texture mapping\n",
    "def mesh_data(img, n_colors=32, n_training_pixels=1000):\n",
    "    \"\"\"Generate mesh data with quantized color intensities for the image.\"\"\"\n",
    "    # Quantize the downsampled image\n",
    "    z_vals, pl_colorscale = image2zvals(img, n_colors=n_colors, n_training_pixels=n_training_pixels)\n",
    "\n",
    "    # Generate triangles\n",
    "    rows, cols, _ = img.shape\n",
    "    triangles = regular_triangles(rows, cols)\n",
    "    I, J, K = triangles.T\n",
    "\n",
    "    # Assign intensity to each triangle\n",
    "    zc = z_vals.flatten()[triangles]\n",
    "    tri_color_intensity = [zc[k][2] if k % 2 else zc[k][1] for k in range(len(zc))]\n",
    "\n",
    "    return I, J, K, tri_color_intensity, pl_colorscale\n",
    "\n",
    "# Function to downsample the image and create the 3D Mesh3d object for plotting\n",
    "def create_mesh3d(img, resolution=64, n_colors=256, view_idx=0):\n",
    "    \"\"\"Creates a Mesh3d object for the image texture mapping with downsampled image.\"\"\"\n",
    "    # Downsample the image first\n",
    "    img_downsampled = np.array(Image.fromarray(img).resize((resolution, resolution)))\n",
    "\n",
    "    # Generate the surface mesh based on downsampled resolution\n",
    "    rows, cols, _ = img_downsampled.shape\n",
    "    x, y, z = surface(rows, cols)\n",
    "\n",
    "    # Get the mesh data\n",
    "    I, J, K, tri_color_intensity, pl_colorscale = mesh_data(img_downsampled, n_colors=n_colors)\n",
    "\n",
    "    # Create the Mesh3d trace\n",
    "    mesh3d_trace = go.Mesh3d(\n",
    "        x=x.flatten(), y=np.flipud(y).flatten(), z=z.flatten() + view_idx,  # Offset z for different views\n",
    "        i=I, j=J, k=K,\n",
    "        intensity=tri_color_intensity,\n",
    "        intensitymode=\"cell\",\n",
    "        colorscale=pl_colorscale,\n",
    "        showscale=False,\n",
    "        name=f\"Image {view_idx}\"\n",
    "    )\n",
    "\n",
    "    return mesh3d_trace\n",
    "\n",
    "# Load two images for testing\n",
    "image1_path = '/path/to/beef_jerky/IMG_0050.jpg'\n",
    "image2_path = '/path/to/beef_jerky/IMG_0051.jpg'\n",
    "\n",
    "image1 = np.array(Image.open(image1_path))\n",
    "image2 = np.array(Image.open(image2_path))\n",
    "\n",
    "# Test with one surface using Mesh3d\n",
    "fig1 = go.Figure()\n",
    "\n",
    "# Add the first mesh with the first image\n",
    "mesh1 = create_mesh3d(image1, resolution=256, n_colors=128, view_idx=0)\n",
    "fig1.add_trace(mesh1)\n",
    "\n",
    "fig1.update_layout(\n",
    "    title=\"One Surface Mesh3d Test (Using Surface Mesh and Downsampled Image)\",\n",
    "    scene=dict(aspectmode='data')\n",
    ")\n",
    "\n",
    "fig1.show()\n",
    "\n",
    "# Test with two surfaces using Mesh3d\n",
    "fig2 = go.Figure()\n",
    "\n",
    "# Add the first mesh with the first image\n",
    "mesh1 = create_mesh3d(image1, resolution=128, n_colors=256, view_idx=0)\n",
    "fig2.add_trace(mesh1)\n",
    "\n",
    "# Add the second mesh with the second image\n",
    "mesh2 = create_mesh3d(image2, resolution=64, n_colors=256, view_idx=1)\n",
    "fig2.add_trace(mesh2)\n",
    "\n",
    "fig2.update_layout(\n",
    "    title=\"Two Surface Mesh3d Test (Using Surface Mesh and Downsampled Image)\",\n",
    "    scene=dict(aspectmode='data')\n",
    ")\n",
    "\n",
    "fig2.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import plotly.graph_objects as go\n",
    "from PIL import Image\n",
    "\n",
    "def create_camera_frustum_with_image(pose_c2w, focal, H, W, image=None, scale=0.05, color='blue', resolution=64):\n",
    "    # Create frustum points in camera space\n",
    "    depth = focal * scale\n",
    "    hw_ratio = W / H\n",
    "    \n",
    "    frustum_points = np.array([\n",
    "        [0, 0, 0],  # Camera origin\n",
    "        [-hw_ratio * depth, -depth, depth],  # Bottom left corner of the frustum\n",
    "        [hw_ratio * depth, -depth, depth],  # Bottom right corner\n",
    "        [hw_ratio * depth, depth, depth],  # Top right corner\n",
    "        [-hw_ratio * depth, depth, depth],  # Top left corner\n",
    "    ])\n",
    "    \n",
    "    # Transform frustum points to world coordinates\n",
    "    frustum_points_homogeneous = np.hstack([frustum_points, np.ones((frustum_points.shape[0], 1))])  # Homogeneous coordinates\n",
    "    frustum_points_world = (pose_c2w @ frustum_points_homogeneous.T).T[:, :3]  # Apply pose transformation\n",
    "\n",
    "    # Frustum lines (edges of the pyramid)\n",
    "    edges = [\n",
    "        (0, 1), (0, 2), (0, 3), (0, 4),  # From camera to corners of the image plane\n",
    "        (1, 2), (2, 3), (3, 4), (4, 1)   # Edges of the image plane\n",
    "    ]\n",
    "    \n",
    "    # Combine all edges into one trace\n",
    "    x_vals, y_vals, z_vals = [], [], []\n",
    "    for edge in edges:\n",
    "        x_vals += [frustum_points_world[edge[0], 0], frustum_points_world[edge[1], 0], None]  # Add None to break the line\n",
    "        y_vals += [frustum_points_world[edge[0], 1], frustum_points_world[edge[1], 1], None]\n",
    "        z_vals += [frustum_points_world[edge[0], 2], frustum_points_world[edge[1], 2], None]\n",
    "    \n",
    "    frustum_trace = go.Scatter3d(\n",
    "        x=x_vals,\n",
    "        y=y_vals,\n",
    "        z=z_vals,\n",
    "        mode='lines',\n",
    "        line=dict(color=color),\n",
    "        name=\"Camera Frustum\",\n",
    "        legendgroup=f\"frustum_{id(pose_c2w)}\",\n",
    "        showlegend=True\n",
    "    )\n",
    "\n",
    "    # Add image to the base of the frustum if available\n",
    "    image_surface_trace = None\n",
    "    if image is not None:\n",
    "        # Downsample the image to finer resolution for better color mapping\n",
    "        img = np.array(image.resize((resolution, resolution)))  # Resize for faster processing\n",
    "        H_img, W_img, _ = img.shape\n",
    "\n",
    "        # Create mesh grid on the base of the frustum\n",
    "        u = np.linspace(0, 1, W_img)\n",
    "        v = np.linspace(0, 1, H_img)\n",
    "        uu, vv = np.meshgrid(u, v)\n",
    "\n",
    "        # Bottom rectangle vertices of the frustum (for image mapping)\n",
    "        img_vertices = frustum_points_world[1:5]  # Bottom rectangle of the frustum\n",
    "        img_x, img_y, img_z = img_vertices[:, 0], img_vertices[:, 1], img_vertices[:, 2]\n",
    "\n",
    "        # Bilinearly interpolate to create a fine grid for the image mapping\n",
    "        X = img_x[0] * (1 - uu) * (1 - vv) + img_x[1] * uu * (1 - vv) + img_x[3] * (1 - uu) * vv + img_x[2] * uu * vv\n",
    "        Y = img_y[0] * (1 - uu) * (1 - vv) + img_y[1] * uu * (1 - vv) + img_y[3] * (1 - uu) * vv + img_y[2] * uu * vv\n",
    "        Z = img_z[0] * (1 - uu) * (1 - vv) + img_z[1] * uu * (1 - vv) + img_z[3] * (1 - uu) * vv + img_z[2] * uu * vv\n",
    "\n",
    "        # Compute grayscale intensity (average of RGB channels)\n",
    "        grayscale_img = np.mean(img, axis=-1) / 255.0  # Normalize to [0, 1]\n",
    "\n",
    "        # Create surface trace for the grayscale image\n",
    "        image_surface_trace = go.Surface(\n",
    "            x=X,\n",
    "            y=Y,\n",
    "            z=Z,\n",
    "            surfacecolor=grayscale_img,  # Use the grayscale image\n",
    "            colorscale='gray',  # Grayscale color scale\n",
    "            showscale=False,\n",
    "            name=\"Camera Frustum Image\",\n",
    "            legendgroup=f\"frustum_{id(pose_c2w)}\",  # Link with the frustum lines\n",
    "            showlegend=False  # Hide separate legend, it's linked to the frustum lines\n",
    "        )\n",
    "\n",
    "    return [frustum_trace, image_surface_trace] if image_surface_trace else [frustum_trace]\n",
    "\n",
    "def plot_cameras(camera_poses, focals, H, W, images=None, scale=0.05, resolution=64):\n",
    "    fig = go.Figure()\n",
    "\n",
    "    # Add camera frustums to the plot\n",
    "    for i, (pose_c2w, focal) in enumerate(zip(camera_poses, focals)):\n",
    "        image = images[i] if images is not None else None\n",
    "        frustum_traces = create_camera_frustum_with_image(pose_c2w, focal, H, W, image, scale, color=f'rgb({50*i}, {100}, {150})', resolution=resolution)\n",
    "        for trace in frustum_traces:\n",
    "            if trace is not None:\n",
    "                fig.add_trace(trace)\n",
    "\n",
    "    # Set 3D aspect ratio and layout\n",
    "    fig.update_layout(scene=dict(aspectmode='data'),\n",
    "                      scene_camera=dict(eye=dict(x=1.5, y=1.5, z=1.5)),\n",
    "                      title=\"Camera Poses and Frustums with Images\")\n",
    "    \n",
    "    fig.show()\n",
    "\n",
    "# Example usage with camera poses (4x4 matrices) and focals\n",
    "camera_poses = [\n",
    "    np.eye(4),  # Identity pose for the first camera\n",
    "    np.array([[1, 0, 0, 0.5], [0, 1, 0, 0.5], [0, 0, 1, 0.5], [0, 0, 0, 1]])  # Example pose for the second camera\n",
    "]\n",
    "focals = [20, 500]  # Focal lengths of the cameras\n",
    "H, W = 1080, 1920  # Example image dimensions\n",
    "\n",
    "# Load example images (replace with real images)\n",
    "image1 = Image.open('/path/to/beef_jerky/IMG_0050.jpg')\n",
    "image2 = Image.open('/path/to/beef_jerky/IMG_0050.jpg')\n",
    "\n",
    "plot_cameras(camera_poses, focals, H, W, images=[image1, image2], scale=0.1, resolution=128)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ScanNet++"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "##### ScanNetpp_Multiview\n",
    "\n",
    "import rootutils\n",
    "rootutils.setup_root(\"/path/to/fast3r/fast3r\", indicator=\".project-root\", pythonpath=True)\n",
    "\n",
    "import itertools\n",
    "import json\n",
    "import os.path as osp\n",
    "from collections import deque\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "from fast3r.dust3r.datasets.scannetpp_multiview import ScanNetpp_Multiview\n",
    "\n",
    "from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name\n",
    "from fast3r.dust3r.utils.image import rgb\n",
    "from fast3r.dust3r.viz import SceneViz, auto_cam_size\n",
    "from IPython.display import display\n",
    "\n",
    "\n",
    "\n",
    "dataset = ScanNetpp_Multiview(num_views=8, data_scaling=0.5, window_size=10, num_samples_per_window=1, split='train', ordered=True, ROOT=\"/path/to/dust3r_data/scannetpp_processed\", resolution=512, aug_crop=16)\n",
    "\n",
    "for idx in np.random.permutation(len(dataset)):\n",
    "    # views = dataset[idx]\n",
    "    views = dataset[-1]\n",
    "    assert len(views) == dataset.num_views\n",
    "    print([view_name(view) for view in views])\n",
    "    viz = SceneViz()\n",
    "    poses = [views[view_idx]['camera_pose'] for view_idx in range(dataset.num_views)]\n",
    "    cam_size = max(auto_cam_size(poses), 1)\n",
    "    for view_idx in range(dataset.num_views):\n",
    "        pts3d = views[view_idx]['pts3d']\n",
    "        valid_mask = views[view_idx]['valid_mask']\n",
    "        colors = rgb(views[view_idx]['img'])\n",
    "        viz.add_pointcloud(pts3d, colors, valid_mask)\n",
    "        viz.add_camera(pose_c2w=views[view_idx]['camera_pose'],\n",
    "                        focal=views[view_idx]['camera_intrinsics'][0, 0],\n",
    "                        color=(view_idx*255, (1 - view_idx)*255, 0),\n",
    "                        image=colors,\n",
    "                        cam_size=cam_size)\n",
    "    display(viz.show(point_size=100, viewer=\"notebook\"))\n",
    "    break\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = 80_000 @ dataset\n",
    "dataset.set_epoch(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%timeit\n",
    "\n",
    "dataset[100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "views = dataset[1005]\n",
    "assert len(views) == dataset.num_views\n",
    "print([view_name(view) for view in views])\n",
    "viz = SceneViz()\n",
    "poses = [views[view_idx]['camera_pose'] for view_idx in range(dataset.num_views)]\n",
    "cam_size = max(auto_cam_size(poses), 1)\n",
    "for view_idx in range(dataset.num_views):\n",
    "    pts3d = views[view_idx]['pts3d']\n",
    "    valid_mask = views[view_idx]['valid_mask']\n",
    "    colors = rgb(views[view_idx]['img'])\n",
    "    viz.add_pointcloud(pts3d, colors, valid_mask)\n",
    "    viz.add_camera(pose_c2w=views[view_idx]['camera_pose'],\n",
    "                    focal=views[view_idx]['camera_intrinsics'][0, 0],\n",
    "                    color=(idx*255, (1 - idx)*255, 0),\n",
    "                    image=colors,\n",
    "                    cam_size=cam_size)\n",
    "display(viz.show(point_size=100, viewer=\"notebook\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "##### MegaDepth\n",
    "\n",
    "import itertools\n",
    "import json\n",
    "import os.path as osp\n",
    "from collections import deque\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "# from dust3r.datasets.megadepth_multiview import MegaDepth_Multiview\n",
    "from dust3r.datasets.megadepth import MegaDepth\n",
    "\n",
    "from dust3r.datasets.base.base_stereo_view_dataset import view_name\n",
    "from dust3r.utils.image import rgb\n",
    "from dust3r.viz import SceneViz, auto_cam_size\n",
    "from IPython.display import display\n",
    "\n",
    "\n",
    "\n",
    "# dataset = MegaDepth_Multiview(split='train', num_views=4, window_size=60, num_samples_per_window=100, ROOT=\"/path/to/dust3r_data/megadepth_processed\", resolution=512, aug_crop=16)\n",
    "dataset = MegaDepth(split='train', ROOT=\"/path/to/dust3r_data/megadepth_processed\", resolution=512, aug_crop=16)\n",
    "\n",
    "views = dataset[0]\n",
    "assert len(views) == dataset.num_views\n",
    "print([view_name(view) for view in views])\n",
    "viz = SceneViz()\n",
    "poses = [views[view_idx]['camera_pose'] for view_idx in range(dataset.num_views)]\n",
    "cam_size = max(auto_cam_size(poses), 1)\n",
    "for view_idx in range(dataset.num_views):\n",
    "    pts3d = views[view_idx]['pts3d']\n",
    "    valid_mask = views[view_idx]['valid_mask']\n",
    "    colors = rgb(views[view_idx]['img'])\n",
    "    viz.add_pointcloud(pts3d, colors, valid_mask)\n",
    "    viz.add_camera(pose_c2w=views[view_idx]['camera_pose'],\n",
    "                    focal=views[view_idx]['camera_intrinsics'][0, 0],\n",
    "                    color=(view_idx*255, (1 - view_idx)*255, 0),\n",
    "                    image=colors,\n",
    "                    cam_size=cam_size)\n",
    "display(viz.show(point_size=100, viewer=\"notebook\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "##### MegaDepth_Multiview\n",
    "import rootutils\n",
    "rootutils.setup_root(\"/path/to/fast3r/fast3r\", indicator=\".project-root\", pythonpath=True)\n",
    "\n",
    "import itertools\n",
    "import json\n",
    "import os.path as osp\n",
    "from collections import deque\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "from fast3r.dust3r.datasets.megadepth_multiview import MegaDepth_Multiview\n",
    "\n",
    "from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name\n",
    "from fast3r.dust3r.utils.image import rgb\n",
    "from fast3r.dust3r.viz import SceneViz, auto_cam_size\n",
    "from IPython.display import display\n",
    "\n",
    "\n",
    "\n",
    "# dataset = MegaDepth_Multiview(split='train', num_views=20, window_size=40, num_samples_per_window=1, ROOT=\"/path/to/dust3r_data/megadepth_processed\", resolution=512, aug_crop=16)\n",
    "dataset = 100 @ MegaDepth_Multiview(split='val', num_views=12, window_size=24, num_samples_per_window=100, ROOT=\"/path/to/dust3r_data/megadepth_processed\", resolution=(512, 336), seed=777)\n",
    "dataset.set_epoch(0)\n",
    "print(dataset)\n",
    "\n",
    "for idx in np.random.permutation(len(dataset)):\n",
    "    views = dataset[idx]\n",
    "    # views = dataset[-1]\n",
    "    # assert len(views) == dataset.num_views\n",
    "    # print([view_name(view) for view in views])\n",
    "    viz = SceneViz()\n",
    "    poses = [views[view_idx]['camera_pose'] for view_idx in range(len(views))]\n",
    "    cam_size = max(auto_cam_size(poses), 1)\n",
    "    for view_idx in range(len(views)):\n",
    "        pts3d = views[view_idx]['pts3d']\n",
    "        valid_mask = views[view_idx]['valid_mask']\n",
    "        colors = rgb(views[view_idx]['img'])\n",
    "        viz.add_pointcloud(pts3d, colors, valid_mask)\n",
    "        viz.add_camera(pose_c2w=views[view_idx]['camera_pose'],\n",
    "                        # focal=views[view_idx]['camera_intrinsics'][0, 0],\n",
    "                        color=(view_idx*255, (1 - view_idx)*255, 0),\n",
    "                        image=colors,\n",
    "                        cam_size=cam_size)\n",
    "    display(viz.show(point_size=100, viewer=\"notebook\"))\n",
    "    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# visualize the rgb\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "# Load the images from the views\n",
    "views = dataset[89]\n",
    "images = [rgb(view['img']) for view in views]\n",
    "\n",
    "# Plot the images\n",
    "fig, axes = plt.subplots(1, len(images), figsize=(40, 8))\n",
    "for i, ax in enumerate(axes.flat):\n",
    "    ax.imshow(images[i])\n",
    "    ax.axis('off')\n",
    "    \n",
    "\n",
    "plt.show()\n",
    "\n",
    "# show the valid mask\n",
    "# Load the images from the views\n",
    "fig, axes = plt.subplots(1, len(images), figsize=(40, 8))\n",
    "for i, ax in enumerate(axes.flat):\n",
    "    ax.imshow(views[i]['valid_mask'])\n",
    "    ax.axis('off')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "views[0].keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for idx in np.random.permutation(len(dataset)):\n",
    "    views = dataset[idx]\n",
    "    # views = dataset[-1]\n",
    "    assert len(views) == dataset.num_views\n",
    "    print([view_name(view) for view in views])\n",
    "    viz = SceneViz()\n",
    "    poses = [views[view_idx]['camera_pose'] for view_idx in range(dataset.num_views)]\n",
    "    cam_size = max(auto_cam_size(poses), 1)\n",
    "    for view_idx in range(dataset.num_views):\n",
    "        pts3d = views[view_idx]['pts3d']\n",
    "        valid_mask = views[view_idx]['valid_mask']\n",
    "        colors = rgb(views[view_idx]['img'])\n",
    "        viz.add_pointcloud(pts3d, colors, valid_mask)\n",
    "        viz.add_camera(pose_c2w=views[view_idx]['camera_pose'],\n",
    "                        focal=views[view_idx]['camera_intrinsics'][0, 0],\n",
    "                        color=(view_idx*255, (1 - view_idx)*255, 0),\n",
    "                        image=colors,\n",
    "                        cam_size=cam_size)\n",
    "    # display(viz.show(point_size=100, viewer=\"notebook\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ArkitScenes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ArkitScenes_Multiview\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import rootutils\n",
    "rootutils.setup_root(\"/path/to/fast3r/fast3r\", indicator=\".project-root\", pythonpath=True)\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "from fast3r.dust3r.datasets.arkitscenes_multiview import ARKitScenes_Multiview\n",
    "\n",
    "from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name\n",
    "from fast3r.dust3r.utils.image import rgb\n",
    "from fast3r.dust3r.viz import SceneViz, auto_cam_size\n",
    "from IPython.display import display\n",
    "\n",
    "dataset = ARKitScenes_Multiview(\n",
    "    split='train', data_scaling=0.5, num_views=20, window_size=30, num_samples_per_window=2, ROOT=\"/path/to/dust3r_data/arkitscenes_processed\", resolution=(512,100), aug_crop=256\n",
    ")\n",
    "\n",
    "for idx in np.random.permutation(len(dataset)):\n",
    "    views = dataset[idx]\n",
    "    assert len(views) == dataset.num_views\n",
    "    print(dataset.num_views)\n",
    "    print([view_name(view) for view in views])\n",
    "    viz = SceneViz()\n",
    "    poses = [views[view_idx]['camera_pose'] for view_idx in range(dataset.num_views)]\n",
    "    cam_size = max(auto_cam_size(poses), 0.2)\n",
    "    for view_idx in range(dataset.num_views):\n",
    "        pts3d = views[view_idx]['pts3d']\n",
    "        valid_mask = views[view_idx]['valid_mask']\n",
    "        colors = rgb(views[view_idx]['img'])\n",
    "        viz.add_pointcloud(pts3d, colors, valid_mask)\n",
    "        viz.add_camera(pose_c2w=views[view_idx]['camera_pose'],\n",
    "                        # focal=views[view_idx]['camera_intrinsics'][0, 0],\n",
    "                        color=(view_idx * 255, (1 - view_idx) * 255, 0),\n",
    "                        image=colors,\n",
    "                        cam_size=cam_size)\n",
    "    display(viz.show())\n",
    "    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "views = dataset[100]\n",
    "assert len(views) == dataset.num_views\n",
    "print(dataset.num_views)\n",
    "print([view_name(view) for view in views])\n",
    "viz = SceneViz()\n",
    "poses = [views[view_idx]['camera_pose'] for view_idx in range(dataset.num_views)]\n",
    "cam_size = max(auto_cam_size(poses), 0.001)\n",
    "for view_idx in range(dataset.num_views):\n",
    "    pts3d = views[view_idx]['pts3d']\n",
    "    valid_mask = views[view_idx]['valid_mask']\n",
    "    colors = rgb(views[view_idx]['img'])\n",
    "    viz.add_pointcloud(pts3d, colors, valid_mask)\n",
    "    viz.add_camera(pose_c2w=views[view_idx]['camera_pose'],\n",
    "                    focal=views[view_idx]['camera_intrinsics'][0, 0],\n",
    "                    color=(view_idx * 255, (1 - view_idx) * 255, 0),\n",
    "                    image=colors,\n",
    "                    cam_size=cam_size)\n",
    "display(viz.show())\n",
    "break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# visualize the images from views\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "# Load the images from the views\n",
    "images = [rgb(view['img']) for view in views]\n",
    "\n",
    "# Plot the images\n",
    "fig, axes = plt.subplots(2, 4, figsize=(16, 8))\n",
    "for i, ax in enumerate(axes.flat):\n",
    "    ax.imshow(images[i])\n",
    "    ax.axis('off')\n",
    "    ax.set_title(f\"View {i}\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Habitat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Habitat\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import rootutils\n",
    "rootutils.setup_root(\"/path/to/fast3r/fast3r\", indicator=\".project-root\", pythonpath=True)\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "from fast3r.dust3r.datasets.habitat import Habitat\n",
    "\n",
    "from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name\n",
    "from fast3r.dust3r.utils.image import rgb\n",
    "from fast3r.dust3r.viz import SceneViz, auto_cam_size\n",
    "from IPython.display import display\n",
    "\n",
    "dataset = Habitat(1_000, split='train', ROOT=\"/path/to/dust3r_data/habitat_processed\",\n",
    "                    resolution=224, aug_crop=16)\n",
    "\n",
    "for idx in np.random.permutation(len(dataset)):\n",
    "    views = dataset[idx]\n",
    "    assert len(views) == 2\n",
    "    print(view_name(views[0]), view_name(views[1]))\n",
    "    viz = SceneViz()\n",
    "    poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1]]\n",
    "    cam_size = max(auto_cam_size(poses), 0.001)\n",
    "    for view_idx in [0, 1]:\n",
    "        pts3d = views[view_idx]['pts3d']\n",
    "        valid_mask = views[view_idx]['valid_mask']\n",
    "        colors = rgb(views[view_idx]['img'])\n",
    "        viz.add_pointcloud(pts3d, colors, valid_mask)\n",
    "        viz.add_camera(pose_c2w=views[view_idx]['camera_pose'],\n",
    "                        focal=views[view_idx]['camera_intrinsics'][0, 0],\n",
    "                        color=(idx * 255, (1 - idx) * 255, 0),\n",
    "                        image=colors,\n",
    "                        cam_size=cam_size)\n",
    "    display(viz.show())\n",
    "    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Habitat_Multiview\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import rootutils\n",
    "rootutils.setup_root(\"/path/to/fast3r/fast3r\", indicator=\".project-root\", pythonpath=True)\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "from fast3r.dust3r.datasets.habitat_multiview import Habitat_Multiview\n",
    "\n",
    "from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name\n",
    "from fast3r.dust3r.utils.image import rgb\n",
    "from fast3r.dust3r.viz import SceneViz, auto_cam_size\n",
    "from IPython.display import display\n",
    "\n",
    "dataset = Habitat_Multiview(1_000, data_scaling=0.5, split='train', num_views=12, ROOT=\"/path/to/dust3r_data/habitat_processed\", aug_crop=16, resolution=512)\n",
    "# dataset = Habitat_Multiview(1_000_000, split='train', num_views=4, ROOT='/path/to/dust3r_data/habitat_processed', aug_crop=16, resolution=(512,384))\n",
    "# dataset = 100 @ Habitat_Multiview(100000, split='val', num_views=12, ROOT=\"/path/to/dust3r_data/habitat_processed\", resolution=(512,384), seed=777)\n",
    "dataset.set_epoch(0)\n",
    "print(len(dataset))\n",
    "\n",
    "# Visualize one randomly chosen sample: a point cloud and a camera per view.\n",
    "for idx in np.random.permutation(len(dataset)):\n",
    "    views = dataset[idx]\n",
    "    num_views = len(views)\n",
    "    assert num_views == dataset.num_views\n",
    "    print(num_views)\n",
    "    print([view_name(view) for view in views])\n",
    "    viz = SceneViz()\n",
    "    # Derive the camera size from the poses of every view that gets drawn.\n",
    "    poses = [view['camera_pose'] for view in views]\n",
    "    cam_size = max(auto_cam_size(poses), 0.2)\n",
    "    for view_idx, view in enumerate(views):\n",
    "        colors = rgb(view['img'])\n",
    "        viz.add_pointcloud(view['pts3d'], colors, view['valid_mask'])\n",
    "        # Colour cameras by view order, blending from green (first view) to red\n",
    "        # (last view); assumes add_camera expects 0-255 RGB components.\n",
    "        blend = view_idx / max(num_views - 1, 1)\n",
    "        viz.add_camera(pose_c2w=view['camera_pose'],\n",
    "                        focal=view['camera_intrinsics'][0, 0],\n",
    "                        color=(int(255 * blend), int(255 * (1 - blend)), 0),\n",
    "                        image=colors,\n",
    "                        cam_size=cam_size)\n",
    "    display(viz.show())\n",
    "    break  # only the first sample of the permutation is shown"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(dataset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# BlendedMVS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# BlendedMVS from Spann3r\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import rootutils\n",
    "rootutils.setup_root(\"/path/to/fast3r/fast3r\", indicator=\".project-root\", pythonpath=True)\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "from fast3r.dust3r.datasets.blendedmvs_multiview import BlendedMVS_Multiview\n",
    "from fast3r.data.components.spann3r_datasets.blendedmvs import BlendMVS\n",
    "\n",
    "from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name\n",
    "from fast3r.dust3r.utils.image import rgb\n",
    "from fast3r.dust3r.viz import SceneViz, auto_cam_size\n",
    "from IPython.display import display\n",
    "\n",
    "# dataset = BlendedMVS_Multiview(split='train', ROOT=\"/path/to/dust3r_data/blendedmvs_processed\", resolution=512, num_views=4, window_size=6, num_samples_per_window=10, ordered=True, aug_crop=16)\n",
    "dataset = BlendMVS(split='train', num_frames=20, num_seq=200, ROOT='/path/to/dust3r_data/datasets_raw/BlendedMVS', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)])\n",
    "\n",
    "dataset.set_epoch(0)\n",
    "print(len(dataset))\n",
    "\n",
    "# Visualize one randomly chosen sample: a point cloud and a camera per view.\n",
    "for idx in np.random.permutation(len(dataset)):\n",
    "    # This dataset is indexed with an (idx, 0) tuple rather than a plain integer.\n",
    "    views = dataset[(idx,0)]\n",
    "    num_views = len(views)\n",
    "    # assert len(views) == dataset.num_views\n",
    "    print(num_views)\n",
    "    print([view_name(view) for view in views])\n",
    "    viz = SceneViz()\n",
    "    # Derive the camera size from the poses of every view that gets drawn.\n",
    "    poses = [view['camera_pose'] for view in views]\n",
    "    cam_size = max(auto_cam_size(poses), 0.5)\n",
    "    for view_idx, view in enumerate(views):\n",
    "        colors = rgb(view['img'])\n",
    "        viz.add_pointcloud(view['pts3d'], colors, view['valid_mask'])\n",
    "        # Colour cameras by view order, blending from green (first view) to red\n",
    "        # (last view); assumes add_camera expects 0-255 RGB components.\n",
    "        blend = view_idx / max(num_views - 1, 1)\n",
    "        viz.add_camera(pose_c2w=view['camera_pose'],\n",
    "                        focal=view['camera_intrinsics'][0, 0],\n",
    "                        color=(int(255 * blend), int(255 * (1 - blend)), 0),\n",
    "                        image=colors,\n",
    "                        cam_size=cam_size)\n",
    "    display(viz.show())\n",
    "    break  # only the first sample of the permutation is shown"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DTU"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# DTU from Spann3r\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import rootutils\n",
    "rootutils.setup_root(\"/path/to/fast3r/fast3r\", indicator=\".project-root\", pythonpath=True)\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "from fast3r.data.components.spann3r_datasets.dtu import DTU\n",
    "\n",
    "from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name\n",
    "from fast3r.dust3r.utils.image import rgb\n",
    "from fast3r.dust3r.viz import SceneViz, auto_cam_size\n",
    "from IPython.display import display\n",
    "\n",
    "dataset = DTU(split='test', ROOT='/path/to/dust3r_data/dtu_test_mvsnet_release', resolution=512, num_seq=1, full_video=True, kf_every=5)\n",
    "\n",
    "dataset.set_epoch(0)\n",
    "print(len(dataset))\n",
    "\n",
    "# Visualize one randomly chosen sample: a point cloud and a camera per view.\n",
    "for idx in np.random.permutation(len(dataset)):\n",
    "    # This dataset is indexed with an (idx, 0) tuple rather than a plain integer.\n",
    "    views = dataset[(idx,0)]\n",
    "    num_views = len(views)\n",
    "    # assert len(views) == dataset.num_views\n",
    "    print(num_views)\n",
    "    print([view_name(view) for view in views])\n",
    "    viz = SceneViz()\n",
    "    # Derive the camera size from the poses of every view that gets drawn.\n",
    "    poses = [view['camera_pose'] for view in views]\n",
    "    cam_size = max(auto_cam_size(poses), 0.5)\n",
    "    for view_idx, view in enumerate(views):\n",
    "        colors = rgb(view['img'])\n",
    "        viz.add_pointcloud(view['pts3d'], colors, view['valid_mask'])\n",
    "        # Colour cameras by view order, blending from green (first view) to red\n",
    "        # (last view); assumes add_camera expects 0-255 RGB components.\n",
    "        blend = view_idx / max(num_views - 1, 1)\n",
    "        viz.add_camera(pose_c2w=view['camera_pose'],\n",
    "                        focal=view['camera_intrinsics'][0, 0],\n",
    "                        color=(int(255 * blend), int(255 * (1 - blend)), 0),\n",
    "                        image=colors,\n",
    "                        cam_size=cam_size)\n",
    "    display(viz.show())\n",
    "    break  # only the first sample of the permutation is shown"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 7-Scenes from Spann3r\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import rootutils\n",
    "rootutils.setup_root(\"/path/to/fast3r/fast3r\", indicator=\".project-root\", pythonpath=True)\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "from fast3r.data.components.spann3r_datasets.seven_scenes import SevenScenes\n",
    "\n",
    "from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name\n",
    "from fast3r.dust3r.utils.image import rgb\n",
    "from fast3r.dust3r.viz import SceneViz, auto_cam_size\n",
    "from IPython.display import display\n",
    "\n",
    "dataset = SevenScenes(split='test', ROOT='/path/to/dust3r_data/7_scenes_processed', resolution=512, num_seq=1, full_video=False, tuple_path=\"/path/to/dust3r_data/7_scenes_processed/\")\n",
    "\n",
    "dataset.set_epoch(0)\n",
    "print(len(dataset))\n",
    "\n",
    "# Visualize one randomly chosen sample: a point cloud and a camera per view.\n",
    "for idx in np.random.permutation(len(dataset)):\n",
    "    # This dataset is indexed with an (idx, 0) tuple rather than a plain integer.\n",
    "    views = dataset[(idx,0)]\n",
    "    num_views = len(views)\n",
    "    # assert len(views) == dataset.num_views\n",
    "    print(num_views)\n",
    "    print([view_name(view) for view in views])\n",
    "    viz = SceneViz()\n",
    "    # Derive the camera size from the poses of every view that gets drawn.\n",
    "    poses = [view['camera_pose'] for view in views]\n",
    "    cam_size = max(auto_cam_size(poses), 0.5)\n",
    "    for view_idx, view in enumerate(views):\n",
    "        colors = rgb(view['img'])\n",
    "        viz.add_pointcloud(view['pts3d'], colors, view['valid_mask'])\n",
    "        # Colour cameras by view order, blending from green (first view) to red\n",
    "        # (last view); assumes add_camera expects 0-255 RGB components.\n",
    "        blend = view_idx / max(num_views - 1, 1)\n",
    "        viz.add_camera(pose_c2w=view['camera_pose'],\n",
    "                        focal=view['camera_intrinsics'][0, 0],\n",
    "                        color=(int(255 * blend), int(255 * (1 - blend)), 0),\n",
    "                        image=colors,\n",
    "                        cam_size=cam_size)\n",
    "    display(viz.show())\n",
    "    break  # only the first sample of the permutation is shown"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ASE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ASE_Multiview\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import rootutils\n",
    "rootutils.setup_root(\"../fast3r\", indicator=\".project-root\", pythonpath=True)\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "from fast3r.dust3r.datasets.ase_multiview import ASE_Multiview, ASE_Multiview_Simple\n",
    "\n",
    "from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name\n",
    "from fast3r.dust3r.utils.image import rgb\n",
    "from fast3r.dust3r.viz import SceneViz, auto_cam_size\n",
    "from IPython.display import display\n",
    "\n",
    "# dataset = ASE_Multiview(\n",
    "#     split='train', data_scaling=0.5, num_views=30, window_size=30, num_samples_per_window=1, ROOT=\"/home/jianingy/research/fast3r/data/aria\", resolution=512, aug_crop=256\n",
    "# )\n",
    "dataset = ASE_Multiview_Simple(\n",
    "    split='train', data_scaling=0.5, num_views=30, ROOT=\"/home/jianingy/research/fast3r/data/aria\", resolution=512, aug_crop=256\n",
    ")\n",
    "\n",
    "# Visualize one randomly chosen sample: a point cloud and a camera per view.\n",
    "for idx in np.random.permutation(len(dataset)):\n",
    "    views = dataset[idx]\n",
    "    num_views = len(views)\n",
    "    assert num_views == dataset.num_views\n",
    "    print(dataset.num_views)\n",
    "    print([view_name(view) for view in views])\n",
    "    viz = SceneViz()\n",
    "    poses = [view['camera_pose'] for view in views]\n",
    "    cam_size = max(auto_cam_size(poses), 0.5)\n",
    "    for view_idx, view in enumerate(views):\n",
    "        height, width = view[\"true_shape\"]\n",
    "        # Portrait views get their image axes swapped and a 3x larger camera below.\n",
    "        is_portrait = width < height\n",
    "        img = view['img']\n",
    "        colors = rgb(img)\n",
    "        viz.add_pointcloud(view['pts3d'], colors, view['valid_mask'])\n",
    "        # Colour cameras by view order, blending from green (first view) to red\n",
    "        # (last view); assumes add_camera expects 0-255 RGB components.\n",
    "        blend = view_idx / max(num_views - 1, 1)\n",
    "        # NOTE(review): the image expression presumably maps a channel-first tensor\n",
    "        # in [-1, 1] to HxWx3 uint8 - confirm against the dataset's normalization.\n",
    "        viz.add_camera(pose_c2w=view['camera_pose'],\n",
    "                        # focal=view['camera_intrinsics'][0, 0],\n",
    "                        color=(int(255 * blend), int(255 * (1 - blend)), 0),\n",
    "                        image=np.uint8((img.swapaxes(1, 2) if is_portrait else img).permute(1, 2, 0) * 127.5 + 127.5),\n",
    "                        cam_size=cam_size * 3 if is_portrait else cam_size)\n",
    "    display(viz.show())\n",
    "    break  # only the first sample of the permutation is shown"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the RGB image of every view of the current sample side by side.\n",
    "# Relies on `views` and `rgb` already being defined in the kernel by a dataset cell.\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "# Convert each view's image with rgb() for plotting\n",
    "images = [rgb(view['img']) for view in views]\n",
    "\n",
    "# squeeze=False always yields a 2-D array of Axes, so `.flat` also works\n",
    "# when there is only a single image (otherwise a bare Axes is returned).\n",
    "fig, axes = plt.subplots(1, len(images), figsize=(40, 8), squeeze=False)\n",
    "for i, ax in enumerate(axes.flat):\n",
    "    ax.imshow(images[i])\n",
    "    ax.axis('off')\n",
    "    # ax.set_title(f\"View {i}\")\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "fast3r",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}