{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from torchvision import transforms\n",
"from PIL import Image\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.decomposition import PCA\n",
"import numpy as np\n",
"import plotly.express as px\n",
"from plotly.subplots import make_subplots\n",
"import plotly.graph_objects as go\n",
"\n",
"# Load DINOv2 model from Torch Hub\n",
"model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')\n",
"model.eval()\n",
"\n",
"# Function to resize and center crop image to multiples of 14\n",
"def resize_and_crop_to_multiple_of_14(image, max_size=512, patch_size=14):\n",
" # Resize the image to max_size while preserving aspect ratio\n",
" width, height = image.size\n",
" if width > height:\n",
" new_width = max_size\n",
" new_height = int((max_size / width) * height)\n",
" else:\n",
" new_height = max_size\n",
" new_width = int((max_size / height) * width)\n",
" \n",
" image = image.resize((new_width, new_height))\n",
"\n",
" # Calculate the target dimensions that are multiples of patch_size\n",
" new_width = (new_width // patch_size) * patch_size\n",
" new_height = (new_height // patch_size) * patch_size\n",
"\n",
" # Center-crop the image to these dimensions\n",
" left = (image.width - new_width) // 2\n",
" top = (image.height - new_height) // 2\n",
" right = (image.width + new_width) // 2\n",
" bottom = (image.height + new_height) // 2\n",
"\n",
" return image.crop((left, top, right, bottom)), new_width, new_height\n",
"\n",
"# Image preprocessing (preserving aspect ratio with max size 512 and cropping to patch size multiple)\n",
"preprocess = transforms.Compose([\n",
" transforms.ToTensor(),\n",
" # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n",
" transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n",
"])\n",
"\n",
"# Load an example image\n",
"image_path = \"/path/to/unseen_book/IMG_9837.jpg\" # Change this to your image path\n",
"original_img = Image.open(image_path)\n",
"\n",
"# Resize and crop image to ensure dimensions are multiples of 14\n",
"processed_img, new_width, new_height = resize_and_crop_to_multiple_of_14(original_img)\n",
"\n",
"# Print the new dimensions after cropping/rescaling\n",
"print(f\"Image size after cropping/rescaling: {new_width}x{new_height}\")\n",
"\n",
"# Preprocess the image for DINOv2\n",
"img = preprocess(processed_img).unsqueeze(0) # Add batch dimension\n",
"\n",
"# Forward pass through the model to get patch tokens\n",
"with torch.no_grad():\n",
" features = model.forward_features(img)['x_norm_patchtokens'] # Extract patch tokens\n",
" features_np = features.squeeze().cpu().numpy() # Remove batch dimension (now num_patches x 1024)\n",
"\n",
"# Apply PCA to reduce each patch's 1024 features to 3D (for RGB visualization)\n",
"pca = PCA(n_components=3)\n",
"pca_result = pca.fit_transform(features_np) # Shape: (num_patches * num_patches, 3)\n",
"\n",
"# Normalize the PCA components to range [0, 1] for RGB\n",
"pca_result_normalized = (pca_result - pca_result.min()) / (pca_result.max() - pca_result.min())\n",
"\n",
"# Compute the number of patches (height and width divided by patch size 14)\n",
"num_patches_w = new_width // 14\n",
"num_patches_h = new_height // 14\n",
"\n",
"# Reshape PCA result into a grid for visualization\n",
"pca_grid = pca_result_normalized.reshape(num_patches_h, num_patches_w, 3)\n",
"\n",
"# Convert the PCA grid to an image format suitable for Plotly\n",
"pca_grid_img = (pca_grid * 255).astype(np.uint8)\n",
"\n",
"# Create a figure with two subplots for the original image and the PCA visualization\n",
"fig = make_subplots(rows=1, cols=2, subplot_titles=(\"Original Image\", \"PCA of Patch Features\"))\n",
"\n",
"# Add the original image in the first subplot\n",
"fig.add_trace(go.Image(z=np.array(processed_img)), row=1, col=1)\n",
"\n",
"# Add the PCA visualization in the second subplot\n",
"fig.add_trace(go.Image(z=pca_grid_img), row=1, col=2)\n",
"\n",
"# Update layout\n",
"fig.update_layout(\n",
" title=\"Original Image and PCA of DINOv2 Patch Features\",\n",
" margin=dict(l=20, r=20, t=40, b=20),\n",
" height=600,\n",
" width=1000\n",
")\n",
"\n",
"fig.show()\n"
]
},
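{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check (a minimal sketch, reusing `pca` from the cell above):\n",
"# how much of the 1024-D patch-feature variance do the 3 PCA components capture?\n",
"print(f\"Explained variance per component: {pca.explained_variance_ratio_}\")\n",
"print(f\"Total explained variance: {pca.explained_variance_ratio_.sum():.3f}\")"
]
},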
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"img.min()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import rootutils\n",
"rootutils.setup_root(\"/path/to/fast3r/fast3r\", indicator=\".project-root\", pythonpath=True)\n",
"\n",
"\n",
"import torch\n",
"import numpy as np\n",
"import plotly.graph_objects as go\n",
"from plotly.subplots import make_subplots\n",
"from sklearn.decomposition import PCA\n",
"from fast3r.dust3r.datasets.co3d_multiview import Co3d_Multiview\n",
"from fast3r.dust3r.datasets.habitat_multiview import Habitat_Multiview\n",
"from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name\n",
"\n",
"# Function to unnormalize the image for visualization\n",
"def unnormalize_image(tensor_img):\n",
" # Unnormalize using the ImageNet statistics\n",
" mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)\n",
" std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)\n",
" return tensor_img * std + mean\n",
"\n",
"# Load DINOv2 model from Torch Hub\n",
"model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')\n",
"model.eval()\n",
"\n",
"# Initialize the Co3d_Multiview dataset\n",
"# dataset = Co3d_Multiview(\n",
"# split=\"train\", num_views=10, window_degree_range=360, num_samples_per_window=100, mask_bg='rand',\n",
"# ROOT=\"/path/to/dust3r_data/co3d_50_seqs_per_category_subset_processed\", resolution=[(910, 910)], aug_crop=16,\n",
"# )\n",
"\n",
"dataset = Habitat_Multiview(1_000_000, split='train', num_views=4, ROOT='/path/to/dust3r_data/habitat_processed', aug_crop=16, resolution=[(448, 336)])\n",
"\n",
"# Iterate through dataset (using one sample for this example)\n",
"for idx in np.random.permutation(len(dataset)):\n",
" views = dataset[idx]\n",
" assert len(views) == dataset.num_views\n",
" print([view_name(view) for view in views])\n",
"\n",
" # Extract the image for a specific view index (already a torch tensor)\n",
" view_idx = 0 # Choose a view to test\n",
" img_tensor = views[view_idx][\"img\"]\n",
"\n",
" # Forward pass through the model to get patch tokens (no preprocessing needed)\n",
" with torch.no_grad():\n",
" features = model.forward_features(img_tensor.unsqueeze(0))['x_norm_patchtokens'] # Add batch dimension\n",
" features_np = features.squeeze().cpu().numpy() # Remove batch dimension (now num_patches_h * num_patches_w x 1024)\n",
"\n",
" # Apply PCA to reduce each patch's 1024 features to 3D (for RGB visualization)\n",
" pca = PCA(n_components=3)\n",
" pca_result = pca.fit_transform(features_np) # Shape: (num_patches_h * num_patches_w, 3)\n",
"\n",
" # Normalize the PCA components to range [0, 1] for RGB\n",
" pca_result_normalized = (pca_result - pca_result.min()) / (pca_result.max() - pca_result.min())\n",
"\n",
" # Compute the number of patches for both height and width\n",
" patch_size = 14 # DINOv2 uses 14x14 patches\n",
" num_patches_h = img_tensor.shape[1] // patch_size\n",
" num_patches_w = img_tensor.shape[2] // patch_size\n",
"\n",
" # Reshape PCA result into a grid for visualization\n",
" pca_grid = pca_result_normalized.reshape(num_patches_h, num_patches_w, 3)\n",
"\n",
" # Convert the PCA grid to an image format suitable for Plotly\n",
" pca_grid_img = (pca_grid * 255).astype(np.uint8)\n",
"\n",
" # Unnormalize the original image for visualization\n",
" img_unnormalized = unnormalize_image(img_tensor).cpu().numpy()\n",
" img_unnormalized = np.transpose(img_unnormalized, (1, 2, 0)) # Convert to HxWxC for display\n",
"\n",
" # Create a figure with two subplots for the original image and the PCA visualization\n",
" fig = make_subplots(rows=1, cols=2, subplot_titles=(\"Original Image\", \"PCA of Patch Features\"))\n",
"\n",
" # Add the original image in the first subplot\n",
" fig.add_trace(go.Image(z=(img_unnormalized * 255).astype(np.uint8)), row=1, col=1)\n",
"\n",
" # Add the PCA visualization in the second subplot\n",
" fig.add_trace(go.Image(z=pca_grid_img), row=1, col=2)\n",
"\n",
" # Update layout\n",
" fig.update_layout(\n",
" title=\"Original Image and PCA of DINOv2 Patch Features\",\n",
" margin=dict(l=20, r=20, t=40, b=20),\n",
" height=600,\n",
" width=1000\n",
" )\n",
"\n",
" # Show the figure\n",
" fig.show()\n",
" break # Break after one iteration to test the output\n"
]
},
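{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A hedged sketch, not part of the pipeline above: fit ONE PCA over the patch\n",
"# tokens of all views so the RGB coloring is comparable across views (fitting\n",
"# per view, as above, gives each image its own color basis). Reuses `model`,\n",
"# `views`, and `patch_size` from the previous cells; assumes all views share\n",
"# the same resolution, which holds for the Habitat_Multiview config above.\n",
"imgs = torch.stack([v[\"img\"] for v in views])  # (num_views, 3, H, W)\n",
"with torch.no_grad():\n",
"    feats = model.forward_features(imgs)['x_norm_patchtokens']  # (num_views, N, 1024)\n",
"\n",
"num_views, n_tokens, dim = feats.shape\n",
"pca_all = PCA(n_components=3)\n",
"flat = pca_all.fit_transform(feats.reshape(-1, dim).cpu().numpy())\n",
"flat = (flat - flat.min()) / (flat.max() - flat.min())\n",
"\n",
"h, w = imgs.shape[2] // patch_size, imgs.shape[3] // patch_size\n",
"grids = flat.reshape(num_views, h, w, 3)\n",
"\n",
"fig = make_subplots(rows=1, cols=num_views, subplot_titles=[f\"View {i}\" for i in range(num_views)])\n",
"for i in range(num_views):\n",
"    fig.add_trace(go.Image(z=(grids[i] * 255).astype(np.uint8)), row=1, col=i + 1)\n",
"fig.update_layout(title=\"Shared-PCA DINOv2 patch features across views\", height=350, width=250 * num_views)\n",
"fig.show()"
]
},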
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"features.shape"
]
},
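{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check (a sketch): the token count should match the patch grid,\n",
"# i.e. (H // 14) * (W // 14) for the (3, H, W) input tensor from above.\n",
"expected = (img_tensor.shape[1] // patch_size) * (img_tensor.shape[2] // patch_size)\n",
"print(f\"expected {expected} tokens, got {features.shape[1]}\")\n",
"assert features.shape[1] == expected"
]
},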
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"views[view_idx][\"img\"].min()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"img.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.forward_features(img)['x_norm_patchtokens'].shape"
]
},
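{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional (a sketch): nearest-neighbor upsample the patch-level PCA grid back\n",
"# to pixel resolution so it can be compared against the image at full size.\n",
"# Reuses `pca_grid_img` and `patch_size` from the dataset cell above.\n",
"from PIL import Image\n",
"\n",
"Image.fromarray(pca_grid_img).resize(\n",
"    (pca_grid_img.shape[1] * patch_size, pca_grid_img.shape[0] * patch_size),\n",
"    resample=Image.NEAREST,\n",
")"
]
},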
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "dust3r",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}