allencbzhang committed on
Commit
f55c49d
·
verified ·
1 Parent(s): c782801

Create multi_scale_deform_attn.py

Browse files
Files changed (1) hide show
  1. multi_scale_deform_attn.py +418 -0
multi_scale_deform_attn.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 The IDEA Authors. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ------------------------------------------------------------------------------------------------
16
+ # Deformable DETR
17
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
18
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
19
+ # ------------------------------------------------------------------------------------------------
20
+ # Modified from:
21
+ # https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/functions/ms_deform_attn_func.py
22
+ # https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
23
+ # https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/multi_scale_deform_attn.py
24
+ # ------------------------------------------------------------------------------------------------
25
+
26
+ import math
27
+ import warnings
28
+ from typing import Optional
29
+ import torch
30
+ import torch.nn as nn
31
+ import torch.nn.functional as F
32
+ from torch.autograd import Function
33
+ from torch.autograd.function import once_differentiable
34
+ from torch.nn.init import constant_, xavier_uniform_
35
+
36
+
37
+ # helpers
38
+ def _is_power_of_2(n):
39
+ if (not isinstance(n, int)) or (n < 0):
40
+ raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
41
+ return (n & (n - 1) == 0) and n != 0
42
+
43
+
44
class MultiScaleDeformableAttnFunction(Function):
    """Autograd wrapper around the ``_C.ms_deform_attn_*`` kernels.

    NOTE(review): ``_C`` is looked up as a module-level name, but no import or
    definition of it is visible in this file -- presumably it is meant to be a
    compiled extension module; confirm where it is supposed to come from.
    """

    @staticmethod
    def forward(
        ctx,
        value,
        value_spatial_shapes,
        value_level_start_index,
        sampling_locations,
        attention_weights,
        im2col_step,
    ):
        """Run ``_C.ms_deform_attn_forward`` and stash its inputs for backward.

        The five tensor arguments are forwarded to the kernel unchanged and
        then saved on ``ctx``; ``im2col_step`` is kept as a plain attribute on
        ``ctx`` and passed as the kernel's last argument.

        Returns:
            Whatever ``_C.ms_deform_attn_forward`` returns.
        """
        ctx.im2col_step = im2col_step
        kernel_inputs = (
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
        )
        output = _C.ms_deform_attn_forward(*kernel_inputs, ctx.im2col_step)
        ctx.save_for_backward(*kernel_inputs)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Run ``_C.ms_deform_attn_backward`` on the saved forward inputs.

        Returns:
            A 6-tuple aligned with ``forward``'s arguments: gradients for
            ``value``, ``sampling_locations`` and ``attention_weights``, and
            ``None`` for the spatial shapes, the level start index and
            ``im2col_step``.
        """
        # saved_tensors holds the five forward inputs in their original order.
        grads = _C.ms_deform_attn_backward(*ctx.saved_tensors, grad_output, ctx.im2col_step)
        grad_value, grad_sampling_loc, grad_attn_weight = grads
        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
94
+
95
+
96
def multi_scale_deformable_attn_pytorch(
    value: torch.Tensor,
    value_spatial_shapes: torch.Tensor,
    sampling_locations: torch.Tensor,
    attention_weights: torch.Tensor,
) -> torch.Tensor:
    """Multi-scale deformable attention built on ``F.grid_sample``.

    Args:
        value: tensor unpacked as ``(bs, sum(H*W), num_heads, embed_dims)``;
            dimension 1 is the concatenation of all flattened feature levels.
        value_spatial_shapes: iterable of ``(H, W)`` pairs, one per level.
        sampling_locations: tensor unpacked as
            ``(bs, num_queries, num_heads, num_levels, num_points, 2)``.
            Values are mapped with ``2 * x - 1`` before sampling, so ``[0, 1]``
            covers the feature map.
        attention_weights: tensor reshaped from
            ``(bs, num_queries, num_heads, num_levels, num_points)``.

    Returns:
        Contiguous tensor of shape ``(bs, num_queries, num_heads * embed_dims)``.
    """
    batch, _, heads, head_dim = value.shape
    _, n_query, heads, n_levels, n_points, _ = sampling_locations.shape

    # Cut the flattened value sequence back into one chunk per feature level.
    level_sizes = [H_ * W_ for H_, W_ in value_spatial_shapes]
    per_level_value = value.split(level_sizes, dim=1)

    # grid_sample expects coordinates in [-1, 1].
    grids = 2 * sampling_locations - 1

    sampled_per_level = []
    for lvl, (H_, W_) in enumerate(value_spatial_shapes):
        # (bs, H*W, heads, dim) -> (bs, H*W, heads*dim) -> (bs, heads*dim, H*W)
        # -> (bs*heads, dim, H, W)
        feature_map = per_level_value[lvl].flatten(2).transpose(1, 2)
        feature_map = feature_map.reshape(batch * heads, head_dim, H_, W_)

        # (bs, queries, heads, points, 2) -> (bs, heads, queries, points, 2)
        # -> (bs*heads, queries, points, 2)
        level_grid = grids[:, :, :, lvl].transpose(1, 2).flatten(0, 1)

        # (bs*heads, dim, queries, points)
        sampled_per_level.append(
            F.grid_sample(
                feature_map,
                level_grid,
                mode="bilinear",
                padding_mode="zeros",
                align_corners=False,
            )
        )

    # (bs, queries, heads, levels, points) -> (bs, heads, queries, levels, points)
    # -> (bs*heads, 1, queries, levels*points)
    weights = attention_weights.transpose(1, 2).reshape(
        batch * heads, 1, n_query, n_levels * n_points
    )

    # (bs*heads, dim, queries, levels, points) -> (bs*heads, dim, queries, levels*points)
    stacked = torch.stack(sampled_per_level, dim=-2).flatten(-2)
    # Weighted sum over every sampled point, then fold heads back into channels.
    merged = (stacked * weights).sum(-1).view(batch, heads * head_dim, n_query)
    return merged.transpose(1, 2).contiguous()
137
+
138
+
139
class MultiScaleDeformableAttention(nn.Module):
    """Multi-Scale Deformable Attention Module used in Deformable-DETR

    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.
    <https://arxiv.org/pdf/2010.04159.pdf>`_.

    Args:
        embed_dim (int): The embedding dimension of Attention. Default: 256.
        num_heads (int): The number of attention heads. Default: 8.
        num_levels (int): The number of feature map used in Attention. Default: 4.
        num_points (int): The number of sampling points for each query
            in each head. Default: 4.
        img2col_step (int): The step used in image_to_column. Default: 64.
        dropout (float): Dropout layer used in output. Default: 0.1.
        batch_first (bool): if ``True``, then the input and output tensor will be
            provided as `(bs, n, embed_dim)`. Default: False. `(n, bs, embed_dim)`

    Raises:
        ValueError: if ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(
        self,
        embed_dim: int = 256,
        num_heads: int = 8,
        num_levels: int = 4,
        num_points: int = 4,
        img2col_step: int = 64,
        dropout: float = 0.1,
        batch_first: bool = False,
    ):
        super().__init__()
        if embed_dim % num_heads != 0:
            raise ValueError(
                "embed_dim must be divisible by num_heads, but got {} and {}".format(
                    embed_dim, num_heads
                )
            )
        head_dim = embed_dim // num_heads

        self.dropout = nn.Dropout(dropout)
        self.batch_first = batch_first

        # Only a warning: a non-power-of-2 head size is still accepted.
        if not _is_power_of_2(head_dim):
            warnings.warn(
                """
                You'd better set d_model in MSDeformAttn to make sure that
                each dim of the attention head a power of 2, which is more efficient.
                """
            )

        self.im2col_step = img2col_step
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_levels = num_levels
        self.num_points = num_points
        # One 2-d offset / one weight per (head, level, point) for every query.
        self.sampling_offsets = nn.Linear(embed_dim, num_heads * num_levels * num_points * 2)
        self.attention_weights = nn.Linear(embed_dim, num_heads * num_levels * num_points)
        self.value_proj = nn.Linear(embed_dim, embed_dim)
        self.output_proj = nn.Linear(embed_dim, embed_dim)

        self.init_weights()

    @staticmethod
    def _compiled_kernel_available() -> bool:
        """Return True when a module-level ``_C`` object exists.

        ``MultiScaleDeformableAttnFunction`` calls ``_C.ms_deform_attn_forward``
        and ``_C.ms_deform_attn_backward``. This file does not define or import
        ``_C`` itself, so the lookup is done defensively instead of letting the
        CUDA branch of ``forward`` die with a ``NameError``.
        """
        return globals().get("_C") is not None

    def init_weights(self):
        """
        Default initialization for Parameters of Module.
        """
        # Offsets start input-independent: zero weight, structured bias.
        constant_(self.sampling_offsets.weight.data, 0.0)
        # One evenly spaced angle per head around the full circle.
        thetas = torch.arange(self.num_heads, dtype=torch.float32) * (
            2.0 * math.pi / self.num_heads
        )
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        # Scale each direction so its largest component has magnitude 1, then
        # replicate it for every level and sampling point.
        grid_init = (
            (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
            .view(self.num_heads, 1, 1, 2)
            .repeat(1, self.num_levels, self.num_points, 1)
        )
        # The i-th point is pushed (i + 1) steps along its head's direction.
        for i in range(self.num_points):
            grid_init[:, :, i, :] *= i + 1
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
        # Zero weight and bias => uniform attention after softmax.
        constant_(self.attention_weights.weight.data, 0.0)
        constant_(self.attention_weights.bias.data, 0.0)
        xavier_uniform_(self.value_proj.weight.data)
        constant_(self.value_proj.bias.data, 0.0)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.0)

    def forward(
        self,
        query: torch.Tensor,
        key: Optional[torch.Tensor] = None,
        value: Optional[torch.Tensor] = None,
        identity: Optional[torch.Tensor] = None,
        query_pos: Optional[torch.Tensor] = None,
        key_padding_mask: Optional[torch.Tensor] = None,
        reference_points: Optional[torch.Tensor] = None,
        spatial_shapes: Optional[torch.Tensor] = None,
        level_start_index: Optional[torch.Tensor] = None,
        **kwargs
    ) -> torch.Tensor:
        """Forward Function of MultiScaleDeformableAttention

        Args:
            query (torch.Tensor): Query embeddings with shape
                `(num_query, bs, embed_dim)`
            key (torch.Tensor): Key embeddings with shape
                `(num_key, bs, embed_dim)`. Accepted for interface
                compatibility; it is not read by this method.
            value (torch.Tensor): Value embeddings with shape
                `(num_key, bs, embed_dim)`. Default: None. If None, `query`
                will be used.
            identity (torch.Tensor): The tensor used for addition, with the
                same shape as `query`. Default: None. If None, `query` (before
                `query_pos` is added) will be used.
            query_pos (torch.Tensor): The position embedding for `query`. Default: None.
            key_padding_mask (torch.Tensor): Mask with shape `(bs, num_key)`,
                indicating which elements within `key` to be ignored in attention.
                Masked positions of the projected value are filled with 0.
            reference_points (torch.Tensor): The normalized reference points
                with shape `(bs, num_query, num_levels, 2)`,
                all elements is range in [0, 1], top-left (0, 0),
                bottom-right (1, 1), including padding area.
                or `(N, Length_{query}, num_levels, 4)`, add additional
                two dimensions `(h, w)` to form reference boxes.
            spatial_shapes (torch.Tensor): Spatial shape of features in different levels.
                With shape `(num_levels, 2)`, last dimension represents `(h, w)`.
            level_start_index (torch.Tensor): The start index of each level. A tensor with
                shape `(num_levels, )` which can be represented as
                `[0, h_0 * w_0, h_0 * w_0 + h_1 * w_1, ...]`. Only passed to
                the compiled kernel path.

        Returns:
            torch.Tensor: forward results with shape `(num_query, bs, embed_dim)`
            (or `(bs, num_query, embed_dim)` when ``batch_first`` is True).

        Raises:
            ValueError: if the last dimension of ``reference_points`` is
                neither 2 nor 4.
        """

        if value is None:
            value = query

        if identity is None:
            identity = query
        if query_pos is not None:
            query = query + query_pos

        if not self.batch_first:
            # change to (bs, num_query, embed_dims)
            query = query.permute(1, 0, 2)
            value = value.permute(1, 0, 2)

        bs, num_query, _ = query.shape
        bs, num_value, _ = value.shape

        # The flattened value sequence must cover every level exactly.
        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value

        # value projection
        value = self.value_proj(value)
        # fill "0" for the padding part
        if key_padding_mask is not None:
            value = value.masked_fill(key_padding_mask[..., None], float(0))
        # (bs, num_value, embed_dim) -> (bs, num_value, num_heads, head_dim)
        value = value.view(bs, num_value, self.num_heads, -1)
        # (bs, num_query, num_heads, num_levels, num_points, 2)
        sampling_offsets = self.sampling_offsets(query).view(
            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2
        )
        # Softmax runs jointly over all levels and points of a head, so the
        # weights are flattened to levels*points first and unflattened after.
        attention_weights = self.attention_weights(query).view(
            bs, num_query, self.num_heads, self.num_levels * self.num_points
        )
        attention_weights = attention_weights.softmax(-1)
        attention_weights = attention_weights.view(
            bs,
            num_query,
            self.num_heads,
            self.num_levels,
            self.num_points,
        )

        # sampling_locations: (bs, num_query, num_heads, num_levels, num_points, 2)
        if reference_points.shape[-1] == 2:
            # reference_points  (bs, num_query, num_levels, 2)
            #                -> (bs, num_query, 1, num_levels, 1, 2)
            # offset_normalizer (num_levels, 2) -> (1, 1, 1, num_levels, 1, 2)
            # spatial_shapes is (h, w); swap to (w, h) to line up with the
            # two offset components.
            offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
            sampling_locations = (
                reference_points[:, :, None, :, None, :]
                + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
            )
        elif reference_points.shape[-1] == 4:
            # First two components are the reference location, the last two
            # scale the offsets.
            sampling_locations = (
                reference_points[:, :, None, :, None, :2]
                + sampling_offsets
                / self.num_points
                * reference_points[:, :, None, :, None, 2:]
                * 0.5
            )
        else:
            raise ValueError(
                "Last dim of reference_points must be 2 or 4, but get {} instead.".format(
                    reference_points.shape[-1]
                )
            )

        # Use the compiled kernel only when it can actually be reached;
        # otherwise (CPU tensors, or no `_C` object in this module) use the
        # pure-PyTorch implementation, which runs on any device.
        use_compiled_kernel = (
            torch.cuda.is_available() and value.is_cuda and self._compiled_kernel_available()
        )
        if use_compiled_kernel:
            # the original impl for fp32 training: half-precision values are
            # upcast, and the other floating inputs are cast to the same dtype
            # so the kernel never receives mixed precisions.
            kernel_value = value.to(torch.float32) if value.dtype == torch.float16 else value
            output = MultiScaleDeformableAttnFunction.apply(
                kernel_value,
                spatial_shapes,
                level_start_index,
                sampling_locations.to(kernel_value.dtype),
                attention_weights.to(kernel_value.dtype),
                self.im2col_step,
            )
        else:
            output = multi_scale_deformable_attn_pytorch(
                value, spatial_shapes, sampling_locations, attention_weights
            )

        # Return to half precision if that is what the projected value used.
        if value.dtype == torch.float16:
            output = output.to(torch.float16)

        output = self.output_proj(output)

        if not self.batch_first:
            # back to (num_query, bs, embed_dim) to match `identity`
            output = output.permute(1, 0, 2)

        return self.dropout(output) + identity
364
+
365
+
366
def create_dummy_class(klass, dependency, message=""):
    """
    Build a placeholder class for use when ``dependency`` cannot be imported.

    The placeholder raises ImportError both when it is instantiated and when
    an attribute that does not exist on it is looked up on the class itself.

    Args:
        klass (str): name of the class.
        dependency (str): name of the dependency.
        message: extra message to print
    Returns:
        class: a class object
    """
    base_text = "Cannot import '{}', therefore '{}' is not available.".format(dependency, klass)
    err = base_text + " " + message if message else base_text

    class _DummyMetaClass(type):
        # Reached only for class attributes that normal lookup cannot find.
        def __getattr__(cls, name):  # noqa: B902
            raise ImportError(err)

    class _Dummy(object, metaclass=_DummyMetaClass):
        # Any attempt to construct the placeholder fails.
        def __init__(self, *args, **kwargs):
            raise ImportError(err)

    return _Dummy
393
+
394
+
395
def create_dummy_func(func, dependency, message=""):
    """
    When a dependency of a function is not available, create a dummy function which throws
    ImportError when used.

    Args:
        func (str): name of the function.
        dependency (str or list[str]): name(s) of the dependency. A list or
            tuple is rendered as a comma-separated string in the error text.
        message: extra message to print
    Returns:
        function: a function object that accepts any arguments and always
        raises ImportError.
    """
    # Normalise BEFORE building the message; joining afterwards has no effect
    # and leaves the raw list repr in the error text.
    if isinstance(dependency, (list, tuple)):
        dependency = ",".join(dependency)

    err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, func)
    if message:
        err = err + " " + message

    def _dummy(*args, **kwargs):
        raise ImportError(err)

    return _dummy
418
+