ZayarnyukNick's picture
Upload folder using huggingface_hub
864ebc9 verified
raw
history blame contribute delete
5.19 kB
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
# --------------------------------------------------------
# CroCo model for downstream tasks
# --------------------------------------------------------
import torch
from .croco import CroCoNet
def croco_args_from_ckpt(ckpt):
    """Recover the CroCoNet constructor keyword arguments stored in a checkpoint.

    Three checkpoint layouts are recognised, tried in this order:

    * ``ckpt["croco_kwargs"]`` exists (CroCo v2 released models): that value
      is returned unchanged.
    * ``ckpt["args"]`` has a ``model`` attribute (pretrained using the
      official code release): the attribute is a string such as
      ``"CroCoNet(enc_embed_dim=1024, enc_num_heads=16, enc_depth=24)"``.
      It is rewritten into a ``dict(...)`` call and evaluated.
    * anything else (CroCo v1 released models): an empty dict is returned.

    Args:
        ckpt: Checkpoint container; only the ``in`` operator and item access
            are used on it.

    Returns:
        The keyword arguments to build the network with.

    Raises:
        ValueError: If the ``model`` attribute is not a string starting with
            ``"CroCoNet("``.
    """
    if "croco_kwargs" in ckpt:  # CroCo v2 released models
        return ckpt["croco_kwargs"]

    if "args" in ckpt and hasattr(ckpt["args"], "model"):
        # pretrained using the official code release
        model_spec = ckpt["args"].model
        class_name = "CroCoNet"
        # Explicit raise instead of `assert`: asserts are stripped when Python
        # runs with -O, which would let any string reach eval() unchecked.
        if not (
            isinstance(model_spec, str)
            and model_spec.startswith(class_name + "(")
        ):
            raise ValueError(
                "expected a model description starting with 'CroCoNet(', "
                f"got {model_spec!r}"
            )
        # SECURITY: eval() executes whatever expression the checkpoint carries
        # after the "CroCoNet" prefix. Only load checkpoints from trusted
        # sources; the prefix check above is a format check, not a sandbox.
        # Swap "CroCoNet(...)" for "dict(...)" so the call yields the kwargs.
        return eval("dict" + model_spec[len(class_name):])

    # CroCo v1 released models
    return dict()
class CroCoDownstreamMonocularEncoder(CroCoNet):
    """Encoder-only CroCo network driving a user-supplied head on one image."""

    def __init__(self, head, **kwargs):
        """Build the network for a monocular downstream task (encoder only).

        ``head`` is an extra argument: it is later called with the encoder
        features and an ``img_info`` dictionary holding the 'height' and
        'width' keys. Its ``setup`` method is invoked with this network
        before it is stored as ``self.head``.

        NOTE: this works by calling the parent constructor while the
        ``_set_*`` hooks below are redefined to skip the unused parts.
        """
        super().__init__(**kwargs)
        head.setup(self)
        self.head = head

    def _set_mask_generator(self, *args, **kwargs):
        """Intentionally empty: no mask generator."""

    def _set_mask_token(self, *args, **kwargs):
        """No mask token; the attribute is still defined, as None."""
        self.mask_token = None

    def _set_decoder(self, *args, **kwargs):
        """Intentionally empty: no decoder."""

    def _set_prediction_head(self, *args, **kwargs):
        """Intentionally empty: downstream tasks have no 'prediction head'."""

    def forward(self, img):
        """Encode ``img`` without masking and hand the result to the head.

        img is of size batch_size x 3 x h x w
        """
        _, _, height, width = img.size()
        # A head may opt in to receiving every block's output.
        want_all_blocks = getattr(self.head, "return_all_blocks", False)
        features, _, _ = self._encode_image(
            img, do_mask=False, return_all_blocks=want_all_blocks
        )
        return self.head(features, {"height": height, "width": width})
class CroCoDownstreamBinocular(CroCoNet):
    """CroCo encoder/decoder driving a user-supplied head on an image pair."""

    def __init__(self, head, **kwargs):
        """Build the network for a binocular downstream task.

        ``head`` is an extra argument: it is later called with the decoder
        output and an ``img_info`` dictionary holding the 'height' and
        'width' keys. Its ``setup`` method is invoked with this network
        before it is stored as ``self.head``.
        """
        super().__init__(**kwargs)
        head.setup(self)
        self.head = head

    def _set_mask_generator(self, *args, **kwargs):
        """Intentionally empty: no mask generator."""

    def _set_mask_token(self, *args, **kwargs):
        """No mask token; the attribute is still defined, as None."""
        self.mask_token = None

    def _set_prediction_head(self, *args, **kwargs):
        """Intentionally empty: downstream tasks define their own head."""

    def encode_image_pairs(self, img1, img2, return_all_blocks=False):
        """Run the encoder on a pair of images.

        Both images go through the encoder in a single call, concatenated
        along the batch dimension; this is actually ~5% faster than encoding
        them separately. The result is then split back into its two halves.

        Returns ``(out, out2, pos, pos2)``. With ``return_all_blocks`` set,
        ``out`` is a list with one entry per encoder output for the first
        image, while ``out2`` only keeps the last entry for the second image.
        """
        stacked = torch.cat((img1, img2), dim=0)
        encoded, positions, _ = self._encode_image(
            stacked,
            do_mask=False,
            return_all_blocks=return_all_blocks,
        )

        if return_all_blocks:
            # Split every block output in two, then regroup per image.
            first_image, second_image = zip(
                *(block.chunk(2, dim=0) for block in encoded)
            )
            out = list(first_image)
            out2 = second_image[-1]
        else:
            out, out2 = encoded.chunk(2, dim=0)

        pos, pos2 = positions.chunk(2, dim=0)
        return out, out2, pos, pos2

    def forward(self, img1, img2):
        """Encode both images, decode the first against the second, run the head."""
        _, _, height, width = img1.size()
        img_info = {"height": height, "width": width}
        # A head may opt in to receiving every block's output.
        return_all_blocks = getattr(self.head, "return_all_blocks", False)

        out, out2, pos, pos2 = self.encode_image_pairs(
            img1, img2, return_all_blocks=return_all_blocks
        )

        # With all blocks requested `out` is a list; the decoder consumes
        # only its last entry.
        decoder_input = out[-1] if return_all_blocks else out
        decout = self._decoder(
            decoder_input,
            pos,
            None,
            out2,
            pos2,
            return_all_blocks=return_all_blocks,
        )
        if return_all_blocks:
            # Hand the head the encoder outputs followed by the decoder ones.
            decout = out + decout

        return self.head(decout, img_info)