# Copyright 2024 MIT Han Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
import os
from typing import Optional, Tuple

import torch
from torch import nn
from torch.nn import functional as F


class LiteMLA(nn.Module):
    r"""Lightweight multi-scale linear attention."""

    PAD_VAL = 1

    def __init__(
        self,
        in_dim: int,
        out_dim: int,
        heads: Optional[int] = None,
        heads_ratio: float = 1.0,
        dim: int = 32,
        kernel_func: str = "relu",
        scales: Optional[Tuple[int, ...]] = (5,),
        eps: float = 1e-15,
        use_bias: bool = False,
        norm=(None, "bn2d"),
        act=(None, None),
    ):
        heads = heads or int(out_dim // dim * heads_ratio)
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.heads = heads
        self.dim = dim
        self.scales = scales
        self.eps = eps
        # Multi-scale aggregation is disabled in this variant: no aggregation
        # convolutions are created, only the plain linear-attention branch.
        self.aggreg = None
        # `kernel_func`, `norm`, and `act` are kept for interface compatibility;
        # only the ReLU kernel is used and no extra norm/act layers are built.
        self.kernel_func = nn.ReLU(inplace=False)
        self.qkv = nn.Linear(in_dim, in_dim * 3, bias=use_bias)
        self.proj = nn.Linear(out_dim, out_dim)
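
    # attn_matmul below implements ReLU linear attention. With phi = ReLU applied
    # elementwise, the output for query token m in each head is
    #
    #     out_m = sum_n v_n * <phi(q_m), phi(k_n)> / (sum_n <phi(q_m), phi(k_n)> + eps)
    #
    # The row of ones appended to v (PAD_VAL = 1) lets one pair of matmuls produce
    # both numerator and denominator: the extra row of (v @ k) @ q accumulates
    # sum_n <phi(q_m), phi(k_n)>, which is split off and used as the normalizer.
    # Associativity of the matmuls keeps the cost linear in the token count N
    # (O(N * d^2) per head) instead of the O(N^2 * d) of softmax attention.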
    def attn_matmul(self, q, k, v: torch.Tensor) -> torch.Tensor:
        # Lightweight linear attention.
        q = self.kernel_func(q)  # B, h, h_d, N
        k = self.kernel_func(k)  # B, h, N, h_d

        # Optionally run the attention in fp32 to avoid NaN losses in
        # low-precision training.
        use_fp32_attention = getattr(self, "fp32_attention", False)
        if use_fp32_attention:
            q, k, v = q.float(), k.float(), v.float()

        # Append a row of ones to v so the normalizer comes out of the same matmuls.
        v = F.pad(v, (0, 0, 0, 1), mode="constant", value=LiteMLA.PAD_VAL)  # B, h, h_d + 1, N
        vk = torch.matmul(v, k)  # B, h, h_d + 1, h_d
        out = torch.matmul(vk, q)  # B, h, h_d + 1, N
        if out.dtype in [torch.float16, torch.bfloat16]:
            out = out.float()
        # Split off the last row as the normalizer.
        out = out[:, :, :-1] / (out[:, :, -1:] + self.eps)  # B, h, h_d, N
        return out

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (B, N, C). The reshapes below assume C == in_dim == out_dim and
        # that C is divisible by self.dim.
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, C).permute(0, 2, 3, 1)  # B, 3, C, N
        q, k, v = qkv.unbind(1)  # each B, C, N
        dtype = q.dtype
        q = q.reshape(B, C // self.dim, self.dim, N)  # B, h, h_d, N
        k = k.reshape(B, C // self.dim, self.dim, N).transpose(-1, -2)  # B, h, N, h_d
        v = v.reshape(B, C // self.dim, self.dim, N)  # B, h, h_d, N
        out = self.attn_matmul(q, k, v).to(dtype)  # B, h, h_d, N
        out = out.view(B, C, N).permute(0, 2, 1)  # B, N, C
        out = self.proj(out)
        return out

    def module_str(self) -> str:
        _str = type(self).__name__ + "("
        eps = f"{self.eps:.1E}"
        _str += f"i={self.in_dim},o={self.out_dim},h={self.heads},d={self.dim},eps={eps})"
        return _str

    def __repr__(self):
        return f"EPS{self.eps}-" + super().__repr__()