{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from torchvision import transforms\n",
"from PIL import Image\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.decomposition import PCA\n",
"import numpy as np\n",
"import plotly.express as px\n",
"from plotly.subplots import make_subplots\n",
"import plotly.graph_objects as go\n",
"\n",
"# Load DINOv2 model from Torch Hub\n",
"model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')\n",
"model.eval()\n",
"\n",
"# Function to resize and center crop image to multiples of 14\n",
"def resize_and_crop_to_multiple_of_14(image, max_size=512, patch_size=14):\n",
" # Resize the image to max_size while preserving aspect ratio\n",
" width, height = image.size\n",
" if width > height:\n",
" new_width = max_size\n",
" new_height = int((max_size / width) * height)\n",
" else:\n",
" new_height = max_size\n",
" new_width = int((max_size / height) * width)\n",
" \n",
" image = image.resize((new_width, new_height))\n",
"\n",
" # Calculate the target dimensions that are multiples of patch_size\n",
" new_width = (new_width // patch_size) * patch_size\n",
" new_height = (new_height // patch_size) * patch_size\n",
"\n",
" # Center-crop the image to these dimensions\n",
" left = (image.width - new_width) // 2\n",
" top = (image.height - new_height) // 2\n",
" right = (image.width + new_width) // 2\n",
" bottom = (image.height + new_height) // 2\n",
"\n",
" return image.crop((left, top, right, bottom)), new_width, new_height\n",
"\n",
"# Image preprocessing (preserving aspect ratio with max size 512 and cropping to patch size multiple)\n",
"preprocess = transforms.Compose([\n",
" transforms.ToTensor(),\n",
" # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n",
" transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n",
"])\n",
"\n",
"# Load an example image\n",
"image_path = \"/path/to/unseen_book/IMG_9837.jpg\" # Change this to your image path\n",
"original_img = Image.open(image_path)\n",
"\n",
"# Resize and crop image to ensure dimensions are multiples of 14\n",
"processed_img, new_width, new_height = resize_and_crop_to_multiple_of_14(original_img)\n",
"\n",
"# Print the new dimensions after cropping/rescaling\n",
"print(f\"Image size after cropping/rescaling: {new_width}x{new_height}\")\n",
"\n",
"# Preprocess the image for DINOv2\n",
"img = preprocess(processed_img).unsqueeze(0) # Add batch dimension\n",
"\n",
"# Forward pass through the model to get patch tokens\n",
"with torch.no_grad():\n",
" features = model.forward_features(img)['x_norm_patchtokens'] # Extract patch tokens\n",
" features_np = features.squeeze().cpu().numpy() # Remove batch dimension (now num_patches x 1024)\n",
"\n",
"# Apply PCA to reduce each patch's 1024 features to 3D (for RGB visualization)\n",
"pca = PCA(n_components=3)\n",
"pca_result = pca.fit_transform(features_np) # Shape: (num_patches * num_patches, 3)\n",
"\n",
"# Normalize the PCA components to range [0, 1] for RGB\n",
"pca_result_normalized = (pca_result - pca_result.min()) / (pca_result.max() - pca_result.min())\n",
"\n",
"# Compute the number of patches (height and width divided by patch size 14)\n",
"num_patches_w = new_width // 14\n",
"num_patches_h = new_height // 14\n",
"\n",
"# Reshape PCA result into a grid for visualization\n",
"pca_grid = pca_result_normalized.reshape(num_patches_h, num_patches_w, 3)\n",
"\n",
"# Convert the PCA grid to an image format suitable for Plotly\n",
"pca_grid_img = (pca_grid * 255).astype(np.uint8)\n",
"\n",
"# Create a figure with two subplots for the original image and the PCA visualization\n",
"fig = make_subplots(rows=1, cols=2, subplot_titles=(\"Original Image\", \"PCA of Patch Features\"))\n",
"\n",
"# Add the original image in the first subplot\n",
"fig.add_trace(go.Image(z=np.array(processed_img)), row=1, col=1)\n",
"\n",
"# Add the PCA visualization in the second subplot\n",
"fig.add_trace(go.Image(z=pca_grid_img), row=1, col=2)\n",
"\n",
"# Update layout\n",
"fig.update_layout(\n",
" title=\"Original Image and PCA of DINOv2 Patch Features\",\n",
" margin=dict(l=20, r=20, t=40, b=20),\n",
" height=600,\n",
" width=1000\n",
")\n",
"\n",
"fig.show()\n"
]
},
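{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check (a minimal sketch, reusing `pca` from the cell above):\n",
"# how much of the 1024-D patch-feature variance do the 3 PCA components capture?\n",
"print(f\"Explained variance per component: {pca.explained_variance_ratio_}\")\n",
"print(f\"Total explained variance: {pca.explained_variance_ratio_.sum():.3f}\")"
]
},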
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"img.min()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import rootutils\n",
"rootutils.setup_root(\"/path/to/fast3r/fast3r\", indicator=\".project-root\", pythonpath=True)\n",
"\n",
"\n",
"import torch\n",
"import numpy as np\n",
"import plotly.graph_objects as go\n",
"from plotly.subplots import make_subplots\n",
"from sklearn.decomposition import PCA\n",
"from fast3r.dust3r.datasets.co3d_multiview import Co3d_Multiview\n",
"from fast3r.dust3r.datasets.habitat_multiview import Habitat_Multiview\n",
"from fast3r.dust3r.datasets.base.base_stereo_view_dataset import view_name\n",
"\n",
"# Function to unnormalize the image for visualization\n",
"def unnormalize_image(tensor_img):\n",
" # Unnormalize using the ImageNet statistics\n",
" mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)\n",
" std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)\n",
" return tensor_img * std + mean\n",
"\n",
"# Load DINOv2 model from Torch Hub\n",
"model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')\n",
"model.eval()\n",
"\n",
"# Initialize the Co3d_Multiview dataset\n",
"# dataset = Co3d_Multiview(\n",
"# split=\"train\", num_views=10, window_degree_range=360, num_samples_per_window=100, mask_bg='rand',\n",
"# ROOT=\"/path/to/dust3r_data/co3d_50_seqs_per_category_subset_processed\", resolution=[(910, 910)], aug_crop=16,\n",
"# )\n",
"\n",
"dataset = Habitat_Multiview(1_000_000, split='train', num_views=4, ROOT='/path/to/dust3r_data/habitat_processed', aug_crop=16, resolution=[(448, 336)])\n",
"\n",
"# Iterate through dataset (using one sample for this example)\n",
"for idx in np.random.permutation(len(dataset)):\n",
" views = dataset[idx]\n",
" assert len(views) == dataset.num_views\n",
" print([view_name(view) for view in views])\n",
"\n",
" # Extract the image for a specific view index (already a torch tensor)\n",
" view_idx = 0 # Choose a view to test\n",
" img_tensor = views[view_idx][\"img\"]\n",
"\n",
" # Forward pass through the model to get patch tokens (no preprocessing needed)\n",
" with torch.no_grad():\n",
" features = model.forward_features(img_tensor.unsqueeze(0))['x_norm_patchtokens'] # Add batch dimension\n",
" features_np = features.squeeze().cpu().numpy() # Remove batch dimension (now num_patches_h * num_patches_w x 1024)\n",
"\n",
" # Apply PCA to reduce each patch's 1024 features to 3D (for RGB visualization)\n",
" pca = PCA(n_components=3)\n",
" pca_result = pca.fit_transform(features_np) # Shape: (num_patches_h * num_patches_w, 3)\n",
"\n",
" # Normalize the PCA components to range [0, 1] for RGB\n",
" pca_result_normalized = (pca_result - pca_result.min()) / (pca_result.max() - pca_result.min())\n",
"\n",
" # Compute the number of patches for both height and width\n",
" patch_size = 14 # DINOv2 uses 14x14 patches\n",
" num_patches_h = img_tensor.shape[1] // patch_size\n",
" num_patches_w = img_tensor.shape[2] // patch_size\n",
"\n",
" # Reshape PCA result into a grid for visualization\n",
" pca_grid = pca_result_normalized.reshape(num_patches_h, num_patches_w, 3)\n",
"\n",
" # Convert the PCA grid to an image format suitable for Plotly\n",
" pca_grid_img = (pca_grid * 255).astype(np.uint8)\n",
"\n",
" # Unnormalize the original image for visualization\n",
" img_unnormalized = unnormalize_image(img_tensor).cpu().numpy()\n",
" img_unnormalized = np.transpose(img_unnormalized, (1, 2, 0)) # Convert to HxWxC for display\n",
"\n",
" # Create a figure with two subplots for the original image and the PCA visualization\n",
" fig = make_subplots(rows=1, cols=2, subplot_titles=(\"Original Image\", \"PCA of Patch Features\"))\n",
"\n",
" # Add the original image in the first subplot\n",
" fig.add_trace(go.Image(z=(img_unnormalized * 255).astype(np.uint8)), row=1, col=1)\n",
"\n",
" # Add the PCA visualization in the second subplot\n",
" fig.add_trace(go.Image(z=pca_grid_img), row=1, col=2)\n",
"\n",
" # Update layout\n",
" fig.update_layout(\n",
" title=\"Original Image and PCA of DINOv2 Patch Features\",\n",
" margin=dict(l=20, r=20, t=40, b=20),\n",
" height=600,\n",
" width=1000\n",
" )\n",
"\n",
" # Show the figure\n",
" fig.show()\n",
" break # Break after one iteration to test the output\n"
]
},
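{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A hedged sketch, not part of the pipeline above: fit ONE PCA over the patch\n",
"# tokens of all views so the RGB coloring is comparable across views (fitting\n",
"# per view, as above, gives each image its own color basis). Reuses `model`,\n",
"# `views`, and `patch_size` from the previous cells; assumes all views share\n",
"# the same resolution, which holds for the Habitat_Multiview config above.\n",
"imgs = torch.stack([v[\"img\"] for v in views])  # (num_views, 3, H, W)\n",
"with torch.no_grad():\n",
"    feats = model.forward_features(imgs)['x_norm_patchtokens']  # (num_views, N, 1024)\n",
"\n",
"num_views, n_tokens, dim = feats.shape\n",
"pca_all = PCA(n_components=3)\n",
"flat = pca_all.fit_transform(feats.reshape(-1, dim).cpu().numpy())\n",
"flat = (flat - flat.min()) / (flat.max() - flat.min())\n",
"\n",
"h, w = imgs.shape[2] // patch_size, imgs.shape[3] // patch_size\n",
"grids = flat.reshape(num_views, h, w, 3)\n",
"\n",
"fig = make_subplots(rows=1, cols=num_views, subplot_titles=[f\"View {i}\" for i in range(num_views)])\n",
"for i in range(num_views):\n",
"    fig.add_trace(go.Image(z=(grids[i] * 255).astype(np.uint8)), row=1, col=i + 1)\n",
"fig.update_layout(title=\"Shared-PCA DINOv2 patch features across views\", height=350, width=250 * num_views)\n",
"fig.show()"
]
},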
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"features.shape"
]
},
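{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check (a sketch): the token count should match the patch grid,\n",
"# i.e. (H // 14) * (W // 14) for the (3, H, W) input tensor from above.\n",
"expected = (img_tensor.shape[1] // patch_size) * (img_tensor.shape[2] // patch_size)\n",
"print(f\"expected {expected} tokens, got {features.shape[1]}\")\n",
"assert features.shape[1] == expected"
]
},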
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"views[view_idx][\"img\"].min()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"img.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.forward_features(img)['x_norm_patchtokens'].shape"
]
},
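{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional (a sketch): nearest-neighbor upsample the patch-level PCA grid back\n",
"# to pixel resolution so it can be compared against the image at full size.\n",
"# Reuses `pca_grid_img` and `patch_size` from the dataset cell above.\n",
"from PIL import Image\n",
"\n",
"Image.fromarray(pca_grid_img).resize(\n",
"    (pca_grid_img.shape[1] * patch_size, pca_grid_img.shape[0] * patch_size),\n",
"    resample=Image.NEAREST,\n",
")"
]
},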
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "dust3r",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}