# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.

import logging
import math
import random
from functools import partial
from typing import Any, List, Optional, Sequence, Tuple, Type, TypeVar, Union

import pytest
import torch
import torch.nn.functional as F
from scipy.stats import binomtest
from torch.utils.checkpoint import checkpoint

import xformers.ops
from xformers.attn_bias_utils import create_attn_bias
from xformers.ops import fmha
from xformers.ops.fmha import ALL_BW_OPS, ALL_FW_OPS
from xformers.ops.fmha.common import AttentionFwOpBase, AttentionOpBase
from xformers.ops.fmha.dispatch import _dispatch_fw_priority_list

from .utils import (
    assert_allclose,
    cuda_only,
    disable_on_rocm,
    disable_tf32,
    pack_kv_cache,
    ref_attention_bmhk_for_test,
    ref_attention_for_test,
    rocm_only,
)

compute_capability = (0, 0)
if torch.cuda.is_available():
    compute_capability = torch.cuda.get_device_capability("cuda")
sm70_or_better_only = pytest.mark.skipif(
    compute_capability < (7, 0), reason="requires sm70+"
)
sm75_or_better_only = pytest.mark.skipif(
    compute_capability < (7, 5), reason="requires sm75+"
)
sm80_or_better_only = pytest.mark.skipif(
    compute_capability < (8, 0), reason="requires sm80+"
)
skip_if_rocm = pytest.mark.skipif(
    torch.version.hip is not None, reason="not supported on ROCm"
)
skip_if_pt_cutlass = pytest.mark.skipif(
    fmha.cutlass.USE_TORCH_CUTLASS, reason="using PT cutlass"
)
_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]

T = TypeVar(
    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
)

logger = logging.getLogger("xformers")


def _filter_unsupported_ops(ops: Sequence[T]) -> List[T]:
    return [
        op
        for op in ops
        if (
            "cpu" in op.SUPPORTED_DEVICES
            or op.CUDA_MINIMUM_COMPUTE_CAPABILITY <= compute_capability
        )
        and op.is_available()
    ]


ALL_FW_OPS_NO_UNPADDED_LSE = _filter_unsupported_ops(ALL_FW_OPS)
ALL_FW_OPS = _filter_unsupported_ops(
    ALL_FW_OPS
    + (
        [fmha.flash.FlashFwUnpaddedLSE]
        if fmha.flash.FLASH_SUPPORTS_UNPADDED_LSE
        else []
    )
)
ALL_BW_OPS = _filter_unsupported_ops(ALL_BW_OPS)


def sample_random_supported_fw(
    inp: fmha.Inputs, seed: int
) -> Type[fmha.common.AttentionFwOpBase]:
    r = random.Random(seed)
    fw_ops = list(ALL_FW_OPS)
    r.shuffle(fw_ops)
    for op in fw_ops:
        if op.supports(inp):
            return op
    raise NotImplementedError(f"Could not find a FW operator for: {inp}")


def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
    shapes = []
    for B in op._TEST_BATCH_SIZES:
        for Mq in [32, 256]:
            for Mkv in [32, 64, 256, 1024]:
                for K in op._TEST_K:
                    shapes.append((B, Mq, Mkv, 1, K, K))
        Mq = 256
        Mkv = 128
        K = 32
        H = 1
        # Weird values of parameters
        for M in [2, 3, 15, 31, 32, 34, 68, 72, 90, 132, 136]:
            shapes.append((B, M, Mkv, H, K, K))
            shapes.append((B, Mq, M, H, K, K))
        for _K in [1, 2, 3, 31, 34, 36, 38, 40, 64, 80, 160, 256 + 2, 256 + 8, 512]:
            if _K <= op.SUPPORTED_MAX_K:
                shapes.append((B, Mq, Mkv, H, _K, _K))
        # Different value for K / Kv
        if op.SUPPORTS_DIFFERENT_VALUE_EMBED:
            for _K in [32, 36, 64, 256 + 8]:
                shapes.append((B, Mq, Mkv, H, K, _K))
                shapes.append((B, Mq, Mkv, H, _K, K))
        # Exotic sizes
        for K in op._TEST_K:
            shapes.append((B, 16, 1024, H, K, K))
            shapes.append((B, 1024, 16, H, K, K))
        # Some number of heads
        for H in [3, 5, 12]:
            shapes.append((max(1, B // H), Mq, Mkv, H, K, K))
    # Filter out unsupported shapes
    shapes = [
        shape
        for shape in shapes
        if len(
            op.shape_not_supported_reasons(
                Mq=shape[1],
                Mkv=shape[2],
K=shape[4], Kv=shape[5] ) ) == 0 ] # Add some random shapes if op in [ fmha.cutlass.FwOp, fmha.cutlass.BwOp, fmha.flash.BwOp, fmha.ck.FwOp, ]: K_CHOICES = [8 * i for i in range(1, 256 // 8)] r = random.Random(0) found_count = 0 while found_count < 200: B = r.randint(1, 400) Mq = r.randint(1, 500) Mkv = r.randint(1, 500) H = r.randint(2, 11) B = max(B // H, 1) K = r.choice(K_CHOICES) Kv = r.choice(K_CHOICES) if not op.SUPPORTS_DIFFERENT_VALUE_EMBED: Kv = K if len(op.shape_not_supported_reasons(Mq, Mkv, K, Kv)): continue found_count += 1 shapes.append((B, Mq, Mkv, H, K, Kv)) return shapes def make_id(op, device, dtype, bias_type, *shape): return ( f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}" f"-{'-'.join([str(s) for s in shape])}" ) def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv( ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000 ): r = random.Random(0) combination = [] ids = [] for op in ops_list: op_count = 0 # Sort list of masks, so it's deterministic across runs LIST_MASKS = list(sorted(op.SUPPORTED_ATTN_BIAS_TYPES, key=lambda x: str(x))) for shape in generate_test_shapes_B_Mq_Mkv_H_K_Kv(op): has_one = False for device in _devices: if device not in op.SUPPORTED_DEVICES: continue for dtype in op.SUPPORTED_DTYPES: bias_type = r.choice(LIST_MASKS) # Avoid using too much memory if bias_type not in [ type(None), fmha.attn_bias.LowerTriangularMask, ]: B, Mq, Mkv, H, K, Kv = shape B = min(B, 12) if bias_type in { fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask, fmha.attn_bias.BlockDiagonalCausalLocalAttentionFromBottomRightMask, }: Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2 elif bias_type in { fmha.attn_bias.BlockDiagonalCausalWithOffsetGappyKeysMask, fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask, fmha.attn_bias.BlockDiagonalPaddedKeysMask, fmha.attn_bias.PagedBlockDiagonalCausalWithOffsetPaddedKeysMask, fmha.attn_bias.PagedBlockDiagonalPaddedKeysMask, }: Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) shape = (B, Mq, Mkv, H, K, Kv) combination.append((op, device, dtype, bias_type, *shape)) ids.append( f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}" f"-{'-'.join([str(s) for s in shape])}" ) has_one = True if has_one: op_count += 1 if op_count > max_shapes_per_op: break # Some specific shapes for which we want to run without any mask bias_type = type(None) for shape in ( # Some strides/dims don't fit on an uint16 (1, 128, 128, 300, 128, 128), (13, 1, 67, 200, 8, 8), (1, 1 + 2**16, 4, 1, 8, 8), (1, 4, 1 + 2**16, 1, 8, 8), # TODO: Some strides don't fit on an uint32 # Crashes on Flash, Errors on Cutlass # (1, 1, 64000, 300, 128, 128) ): for device in _devices: if device not in op.SUPPORTED_DEVICES: continue for dtype in op.SUPPORTED_DTYPES: combination.append((op, device, dtype, bias_type, *shape)) return { "argvalues": combination, "ids": [make_id(*c) for c in combination], } parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize( "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv", **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS), ) parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_NO_UNPADDED_LSE = ( pytest.mark.parametrize( "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv", **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS_NO_UNPADDED_LSE), ) ) parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize( "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv", **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS, max_shapes_per_op=1), ) parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = 
pytest.mark.parametrize( "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv", **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS), ) parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize( "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv", **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS, max_shapes_per_op=1), ) def _rand_partition(r: random.Random, total: int, n: int) -> List[int]: # returns list of n nonnegative integers summing to total idx = {0, total} while len(idx) < n + 1: idx.add(r.randint(1, total - 1)) s = sorted(idx) return [e - b for b, e in zip(s[:-1], s[1:])] def get_bias_grad(attn_bias, clear: bool = False) -> Optional[torch.Tensor]: tensor_with_grad: Optional[torch.Tensor] = None if isinstance(attn_bias, torch.Tensor): tensor_with_grad = attn_bias if tensor_with_grad is not None: grad = tensor_with_grad.grad if clear: tensor_with_grad.grad = None return grad return None def create_tensors( op: Optional[Type[AttentionOpBase]], device, dtype, attn_bias_type, B, q_len, kv_len, h, k, kv, *, attn_bias_requires_grad: bool = False, fmt: str = "BMK", g: int = 1, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Any]: torch.manual_seed(B * q_len + kv_len * k + kv) mask_is_bottom_right = attn_bias_type is not None and issubclass( attn_bias_type, ( fmha.attn_bias.LowerTriangularFromBottomRightMask, fmha.attn_bias.LowerTriangularFromBottomRightLocalAttentionMask, fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask, fmha.attn_bias.BlockDiagonalCausalLocalAttentionFromBottomRightMask, fmha.attn_bias.BlockDiagonalCausalLocalAttentionMask, fmha.attn_bias.LocalAttentionFromBottomRightMask, ), ) if mask_is_bottom_right and q_len > kv_len: # Bottom-right attention and local-attention masks require q_len <= kv_len kv_len = q_len if attn_bias_type is not None and issubclass( attn_bias_type, ( fmha.attn_bias.PagedBlockDiagonalGappyKeysMask, fmha.attn_bias.PagedBlockDiagonalPaddedKeysMask, ), ): page_size_choices = [256, 512] if op is not None and issubclass(op, fmha.triton_splitk.FwOp): # TODO: enable small pages for flash attention when that's implemented page_size_choices.extend([64, 128]) page_size = random.choice(page_size_choices) kv_len_paged = (kv_len + page_size - 1) // page_size * page_size else: kv_len_paged = kv_len page_size = None scale = 3 if fmt == "BMK": query = torch.randn((B * h, q_len, k), device=device, dtype=dtype) key = torch.randn((B * h, kv_len_paged, k), device=device, dtype=dtype) value = torch.randn((B * h, kv_len_paged, kv), device=device, dtype=dtype) elif fmt == "BMHK": query = torch.randn((B, q_len, h, k), device=device, dtype=dtype) key = torch.randn((B, kv_len_paged, h, k), device=device, dtype=dtype) value = torch.randn((B, kv_len_paged, h, kv), device=device, dtype=dtype) else: assert fmt == "BMGHK" query = torch.randn((B, q_len, g, h, k), device=device, dtype=dtype) key = torch.randn((B, kv_len_paged, g, 1, k), device=device, dtype=dtype) value = torch.randn((B, kv_len_paged, g, 1, kv), device=device, dtype=dtype) for x in [query, key, value]: x.mul_(scale) if fmt == "BMGHK": # Expand - after the in-place mul key = key.expand((B, kv_len_paged, g, h, k)) value = value.expand((B, kv_len_paged, g, h, k)) if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type): attn_bias_type = None attn_bias = None if attn_bias_type is not None: attn_bias = create_attn_bias( attn_bias_type, batch_size=B, num_heads=h, num_heads_groups=g, q_len=q_len, kv_len=kv_len, dtype=dtype, device=device, 
            requires_grad=attn_bias_requires_grad,
            fmt=fmt,
            op=op,
            page_size=page_size,
        )
        if isinstance(
            attn_bias,
            (
                fmha.attn_bias.BlockDiagonalMask,
                fmha.attn_bias.BlockDiagonalGappyKeysMask,
                fmha.attn_bias.BlockDiagonalPaddedKeysMask,
                fmha.attn_bias.PagedBlockDiagonalGappyKeysMask,
                fmha.attn_bias.PagedBlockDiagonalPaddedKeysMask,
            ),
        ):
            query, key, value = [
                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
            ]

    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
    if op is not None:
        reasons = op.not_supported_reasons(inputs)
        if reasons:
            err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
            # Ensure we free memory to avoid OOMs
            del query, key, value, attn_bias, inputs
            pytest.skip(err_msg)
    return query, key, value, attn_bias


def bmhk2bmk(tensor) -> torch.Tensor:
    return (
        tensor.permute((0, 2, 1, 3))
        .contiguous()
        .view([tensor.shape[0] * tensor.shape[2], tensor.shape[1], tensor.shape[3]])
    )


def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
    return tensor.reshape([-1, num_heads, tensor.shape[1], tensor.shape[2]]).permute(
        (0, 2, 1, 3)
    )


def nanify_oob_seqlen(x: torch.Tensor) -> torch.Tensor:
    align_to = 256
    if x.shape[1] % align_to == 0:
        return x
    pad = [0, 0] * x.ndim
    pad[-3] = align_to - (x.shape[1] % align_to)
    x_pad = torch.nn.functional.pad(x, pad, value=math.nan)
    return x_pad[:, : x.shape[1]]


@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
@pytest.mark.parametrize("packed", [False, True])
@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_NO_UNPADDED_LSE
def test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed, fmt, **kwargs):
    (
        op,
        device,
        dtype,
        bias_type,
        batch_size,
        q_len,
        kv_len,
        h,
        k,
        kv,
    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv

    if packed and issubclass(
        bias_type,
        (
            fmha.attn_bias.PagedBlockDiagonalPaddedKeysMask,
            fmha.attn_bias.PagedBlockDiagonalGappyKeysMask,
        ),
    ):
        pytest.skip(
            "packed doesn't make sense with paged attention, since q has different shape than k/v"
        )
    if packed and not (k == kv and q_len == kv_len):
        pytest.skip(
            f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
        )
    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(bias_type):
        pytest.skip("BMK incompatible with this bias")

    query, key, value, attn_bias = create_tensors(
        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
        fmt="BMHK" if packed else fmt,
        **kwargs,
    )

    if packed:
        c = torch.stack([query, key, value], 2)
        if fmt == "BMK":
            # bm3hk -> 3bhmk -> 3Bmk
            c = c.permute(2, 0, 3, 1, 4).view([3, -1, q_len, k])
            query, key, value = c[0], c[1], c[2]
            # Re-create bias in the right format
            attn_bias = create_attn_bias(
                bias_type=bias_type,
                batch_size=batch_size,
                num_heads=h,
                num_heads_groups=1,
                q_len=q_len,
                kv_len=kv_len,
                device=device,
                dtype=dtype,
                requires_grad=False,
                fmt=fmt,
                op=op,
            )
        elif fmt == "BMHK":
            # bm3hk -> 3 x bmhk
            query, key, value = xformers.ops.unbind(c, 2)
        else:
            assert False, f"Unsupported fmt {fmt} with packing"
        assert not query.is_contiguous()

    out = xformers.ops.memory_efficient_attention_forward(
        query, key, value, attn_bias, op=op
    )
    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
    out2 = xformers.ops.memory_efficient_attention_forward(
        nanify_oob_seqlen(query),
        nanify_oob_seqlen(key),
        nanify_oob_seqlen(value),
        attn_bias,
        op=op,
    )
    assert not out2.isnan().any(), "Output has NaNs - most likely reading out-of-bounds"
    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
        "Non-deterministic behavior",
        attn_bias,
    )

    ref = ref_attention_for_test(query, key, value, attn_bias)
    assert out.shape == ref.shape, out.shape
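    # Compare against the reference implementation using per-op, per-dtype
    # tolerances: fp16/bf16 kernels accumulate more numerical error than fp32,
    # so each op publishes its own ERROR_ATOL / ERROR_RTOL.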
    assert_allclose(
        out.float(),
        ref,
        atol=op.ERROR_ATOL[dtype],
        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
    )


@cuda_only
@pytest.mark.parametrize("k_len", [5, 6, 32])
@pytest.mark.parametrize("batch_size", [1, 4])
@pytest.mark.parametrize("kv_len", [128, 512])
@pytest.mark.parametrize("q_len", [128, 512])
def test_key_query_all_ones(q_len, kv_len, batch_size, k_len):
    device = "cuda"
    scale = 3
    # composable kernel doesn't support fp32
    dtype = torch.float16 if torch.version.hip else torch.float32
    query = torch.ones((batch_size, q_len, k_len), device=device, dtype=dtype)
    key = torch.ones((batch_size, kv_len, k_len), device=device, dtype=dtype)
    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale

    out = xformers.ops.memory_efficient_attention(query, key, value)
    # this should be equivalent to the average over value
    ref = value.mean(1, keepdim=True).expand_as(query)

    assert_allclose(out, ref, atol=1e-5)


def _block_diag_reshape_lse(
    lse: torch.Tensor, q_seqinfo: fmha.attn_bias._SeqLenInfo
) -> torch.Tensor:
    """LSE can be padded, let's remove the padding"""
    parts = []
    for slice, (start, end) in zip(lse.unbind(0), q_seqinfo.intervals()):
        parts.append(slice[:, : end - start])
    return torch.cat(parts, dim=1).unsqueeze(0)


@disable_tf32
@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
def test_logsumexp(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
    (
        op,
        device,
        dtype,
        bias_type,
        batch_size,
        q_len,
        kv_len,
        h,
        k,
        kv,
    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv

    if op is fmha.ck.FwOp:
        pytest.skip("logsumexp is not yet supported by ck-tiled fmha!")

    query, key, value, attn_bias = create_tensors(
        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
        fmt="BMHK",
    )

    _out, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
        query,
        key,
        value,
        op=op,
        attn_bias=attn_bias,
    )
    query = query.transpose(1, 2)
    key = key.transpose(1, 2)
    attn = (query.float() / k**0.5) @ key.float().transpose(-2, -1)
    if attn_bias is not None:
        if isinstance(
            attn_bias,
            (fmha.attn_bias.AttentionBias, fmha.attn_bias.AttentionBiasSubTensor),
        ):
            bias_shape = (1, 1, query.shape[2], key.shape[2])
            tensor_bias = attn_bias.materialize(
                bias_shape,
                device=query.device,
                dtype=torch.float32,
            )
        else:
            assert type(attn_bias) is torch.Tensor
            tensor_bias = attn_bias
        attn = attn + tensor_bias.float()
    ref_lse = attn.logsumexp(-1)
    if isinstance(
        attn_bias,
        (
            fmha.attn_bias.BlockDiagonalMask,
            fmha.attn_bias.BlockDiagonalGappyKeysMask,
            fmha.attn_bias.PagedBlockDiagonalPaddedKeysMask,
            fmha.attn_bias.BlockDiagonalPaddedKeysMask,
        ),
    ) and issubclass(op, (fmha.flash.FwOp, fmha.cutlass.FwOp)):
        # Sometimes LSE is returned in padded format, i.e. (B, H, MAX_LEN) instead of (H, TOTAL_LEN).
        # Unpad to compare with the reference.
        # This is the case for Flash Attention when UNPADDED_LSE=False and for CUTLASS.
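        # Illustration (hypothetical sizes): for two sequences of lengths 3 and 5,
        # a padded LSE of shape (2, H, 5) becomes an unpadded (1, H, 8) tensor once
        # _block_diag_reshape_lse concatenates the valid prefix of each block.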
        if op.UNPADDED_LSE:
            lse = lse.unsqueeze(0)
        else:
            lse = _block_diag_reshape_lse(lse, attn_bias.q_seqinfo)
    if op is fmha.cutlass.FwOp:
        # CUTLASS kernel pads the last dimension of LSE to 32
        lse = lse[:, :, : ref_lse.shape[2]]
    assert_allclose(lse, ref_lse, atol=2e-4)


@cuda_only
@pytest.mark.parametrize("op", [fmha.cutlass.FwOp, fmha.flash.FwOp])
def test_logsumexp_mqa(op):
    if not op.is_available():
        pytest.skip("not available")

    if op.CUDA_MINIMUM_COMPUTE_CAPABILITY > compute_capability:
        skip_reason = (
            f"requires device with capability >= {op.CUDA_MINIMUM_COMPUTE_CAPABILITY} "
            f"but your GPU has capability {compute_capability} (too old)"
        )
        pytest.skip(skip_reason)

    dtype = torch.float16
    s = 3
    query = torch.randn([1, 1, 32, 128], dtype=dtype, device="cuda") * s
    key = (torch.randn([1, 16, 1, 128], dtype=dtype, device="cuda") * s).expand(
        -1, -1, 32, -1
    )
    value = (torch.randn([1, 16, 1, 128], dtype=dtype, device="cuda") * s).expand(
        -1, -1, 32, -1
    )
    assert key.stride(2) == 0
    _, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
        query,
        key,
        value,
        op=op,
    )
    query, key, value = [x[0].transpose(0, 1) for x in [query, key, value]]
    attn = (query.float() / query.shape[-1] ** 0.5) @ key.float().transpose(-2, -1)
    ref_lse = attn.logsumexp(-1)
    assert_allclose(lse[0, :, 0], ref_lse[:, 0], atol=2e-4)


@disable_tf32
@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
@pytest.mark.parametrize("grad_out_contiguous", [False, True])
@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
def test_backward(
    opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
    grad_out_contiguous,
    fmt,
):
    (
        op_bw,
        device,
        dtype,
        bias_type,
        batch_size,
        q_len,
        kv_len,
        h,
        k,
        kv,
    ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
    attn_bias_requires_grad = (
        random.Random(q_len + kv_len * batch_size).randint(0, 1) > 0
    )
    query, key, value, attn_bias = create_tensors(
        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
        attn_bias_requires_grad=attn_bias_requires_grad,
        fmt=fmt,
    )

    # To understand why we do this, check the comment on the
    # `AttentionBwOpBase` class
    scale = None
    if op_bw.SUPPORTS_CUSTOM_SCALE and query.shape[-1] < 32:
        scale = (1 / 32) ** 0.5

    op_fw = (
        sample_random_supported_fw(
            fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias),
            seed=q_len * kv + kv_len * k,
        )
        if op_bw != fmha.cutlass.BwOp
        else fmha.cutlass.FwOp
    )

    if op_bw == fmha.ck.BwOp:
        op_fw = fmha.ck.FwOp
        if dtype == torch.bfloat16:
            pytest.skip(
                "CK Fmha backward for bfloat16 currently is not very accurate for some cases!"
            )
        if grad_out_contiguous is False:
            pytest.skip("CK Fmha does not support contiguous layout for grad_out!")
        if k % 2 != 0:
            pytest.skip(
                "CK Fmha currently requires the headdim size of the query input to be an even value!"
) qkv = None if ( fmt == "BMHK" and query.shape[3] == value.shape[3] and query.shape[1] == value.shape[1] ): qkv = torch.stack([query, key, value], 2) qkv.requires_grad_(True) # bm3hk -> 3 x bmhk query, key, value = xformers.ops.unbind(qkv, 2) assert not query.is_contiguous() query.requires_grad_(True) key.requires_grad_(True) value.requires_grad_(True) if not op_bw.supports(fmha.Inputs(query, key, value, attn_bias)): pytest.skip("inputs not supported") out = xformers.ops.memory_efficient_attention( query, key, value, attn_bias, scale=scale, op=(op_fw, op_bw) ) grad_out = torch.randn_like(out) if grad_out_contiguous is False: grad_out = torch.tensor([1.0], dtype=query.dtype, device=device)[ None, None, : ].expand_as(out) out.backward(grad_out) if qkv is None and op_bw == fmha.cutlass.BwOp: assert query.stride() == query.grad.stride() grads = [] if qkv is None: grads = [query.grad, key.grad, value.grad] query.grad = None key.grad = None value.grad = None else: grads = [qkv.grad] qkv.grad = None if attn_bias_requires_grad: attn_bias_grad = get_bias_grad(attn_bias, clear=True) if attn_bias_grad is not None: grads.append(attn_bias_grad) ref = ref_attention_for_test(query, key, value, attn_bias, scale=scale) ref.backward(grad_out) assert_allclose( out.float(), ref.float(), "fw pass", atol=op_fw.ERROR_ATOL[dtype], rtol=op_fw.ERROR_RTOL[dtype], ) del out del grad_out del ref atol = op_bw.ERROR_ATOL[dtype] rtol = op_bw.ERROR_RTOL[dtype] grads_ref = [] grads_name = [] if qkv is None: assert isinstance(query.grad, torch.Tensor) assert isinstance(key.grad, torch.Tensor) assert isinstance(value.grad, torch.Tensor) grads_ref = [query.grad, key.grad, value.grad] grads_name = ["query", "key", "value"] else: assert isinstance(qkv.grad, torch.Tensor) grads_ref = [qkv.grad] grads_name = ["qkv"] if attn_bias_requires_grad: attn_bias_grad = get_bias_grad(attn_bias) if attn_bias_grad is not None: grads_ref.append(attn_bias.grad) grads_name.append("bias") del query del key del value del qkv assert len(grads_ref) == len( grads ), "Wrong number of gradients (maybe bias grad didn't backprop?)" for name, calc_grad, ref_grad in zip(grads_name, grads, grads_ref): assert_allclose( calc_grad, ref_grad, msg=f"{op_fw.NAME}+{op_bw.NAME}:{name}", atol=atol, rtol=rtol, ) def _vec_binom_test(x, n, p): """ vectorized implementation of scipy.stats.binom_test this makes our tests much faster reference: https://github.com/scipy/scipy/blob/v1.8.0/scipy/stats/_morestats.py#L2609-L2702 """ import numpy as np from scipy.stats import distributions x = np.atleast_1d(x) d = distributions.binom.pmf(x, n, p)[:, None] rerr = 1 + 1e-7 # x < p * n case i = np.arange(np.ceil(p * n), n + 1) y = np.sum(distributions.binom.pmf(i, n, p) <= d * rerr, axis=1) pval1 = distributions.binom.cdf(x, n, p) + distributions.binom.sf(n - y, n, p) # other case i = np.arange(np.floor(p * n) + 1) y = np.sum(distributions.binom.pmf(i, n, p) <= d * rerr, axis=1) pval2 = distributions.binom.cdf(y - 1, n, p) + distributions.binom.sf(x - 1, n, p) pval = np.where(x < p * n, pval1, pval2) pval = np.minimum(1.0, pval) return pval def _get_drop_mask(op, batch_size, q_len, kv_len, p, device): if op == fmha.cutlass.FwOp: mask = torch.empty((batch_size, 1, q_len, kv_len), device=device) rand_uniform = torch.ops.xformers._cutlass_rand_uniform(p, mask) mask = (rand_uniform > p).to(torch.float32) mask = mask.reshape(batch_size, q_len, kv_len) elif op == fmha.ck.FwOp: mask = torch.empty((batch_size, 1, q_len, kv_len), device=device) # rand_uniform is an int8_t tensor 
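        # A key is kept when its uniform int8 draw is <= (1 - p) * 255, so the
        # integer threshold below mirrors the float keep-probability (1 - p).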
rand_uniform = torch.ops.xformers._ck_rand_uniform(p, mask) mask = (rand_uniform <= int((1.0 - p) * 255.0)).to(torch.float32) mask = mask.reshape(batch_size, q_len, kv_len) else: mask = torch.empty((batch_size, q_len, kv_len), device=device) mask = torch.ops.xformers._temp_dropout(mask, p) return mask @cuda_only @pytest.mark.parametrize("attn_bias", [None, fmha.attn_bias.LowerTriangularMask()]) @pytest.mark.parametrize("seed", [42, 124]) @pytest.mark.parametrize("p", [0.3, 0.7]) @pytest.mark.parametrize("k_len", [32]) @pytest.mark.parametrize("batch_size", [1, 2]) @pytest.mark.parametrize("kv_len", [3, 15, 32, 33, 65]) @pytest.mark.parametrize("q_len", [2, 33]) @pytest.mark.parametrize( "op", ALL_FW_OPS_NO_UNPADDED_LSE, ids=list(map(lambda t: t.NAME, ALL_FW_OPS_NO_UNPADDED_LSE)), ) def test_dropout(op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias): device = "cuda" scale = 3 dtype = torch.float if torch.version.hip and op == fmha.ck.FwOp: dtype = torch.float16 query = torch.randn((batch_size, q_len, k_len), device=device, dtype=dtype) * scale key = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale inputs_for_support_check = fmha.Inputs(query, key, value, attn_bias, p, None) if not op.supports(inputs_for_support_check): del query, key, value, attn_bias pytest.skip(f"{op.NAME}: unsupported input") torch.manual_seed(seed) out = xformers.ops.memory_efficient_attention( query, key, value, attn_bias, p, op=(op, None) ) torch.manual_seed(seed) out2 = xformers.ops.memory_efficient_attention( query, key, value, attn_bias, p, op=(op, None) ) assert_allclose(out, out2, "dropout reproducibility") torch.manual_seed(seed) mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device) ref = ref_attention_for_test(query, key, value, attn_bias, mask, p) if dtype is torch.float: assert_allclose(out, ref, atol=2e-4), f"{(out - ref).abs().max()}" else: assert_allclose(out.float(), ref, atol=2.8e-2), f"{(out - ref).abs().max()}" num_trials = 1000 p_val_tol = 1e-6 keep_prob = 1 - p masks = [] for i in range(num_trials): mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device) masks.append(mask.clone().cpu()) masks = torch.stack(masks, dim=0) p_value = binomtest(int(masks.sum()), masks.numel(), p=keep_prob).pvalue assert p_value > p_val_tol, p_value masks = masks.sum(0).flatten() p_values = _vec_binom_test(masks, num_trials, p=keep_prob) assert all(p_values > p_val_tol) def _test_dropout_backward(q_len, kv_len, batch_size, k, p, op, dtype): if dtype is torch.bfloat16 and compute_capability < (8, 0): pytest.skip("bf16 requires Sm80") if not op.is_available(): pytest.skip() scale = 3 device = "cuda" query = torch.randn((batch_size, q_len, k), device=device, dtype=dtype) * scale key = torch.randn((batch_size, kv_len, k), device=device, dtype=dtype) * scale value = torch.randn((batch_size, kv_len, k), device=device, dtype=dtype) * scale query.requires_grad_(True) key.requires_grad_(True) value.requires_grad_(True) grad_out = torch.ones_like(query) assert op.supports(fmha.Inputs(query=query, key=key, value=value, p=p)) seed = 42 torch.manual_seed(seed) out = xformers.ops.memory_efficient_attention(query, key, value, p=p, op=(op, None)) out.backward(grad_out) grad_q = query.grad grad_k = key.grad grad_v = value.grad query.grad = None key.grad = None value.grad = None torch.manual_seed(seed) mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device) ref = ref_attention_for_test(query, key, 
value, None, mask, p) ref.backward(grad_out) atol, rtol = ( fmha.AttentionBwOpBase.ERROR_ATOL[dtype], fmha.AttentionBwOpBase.ERROR_RTOL[dtype], ) assert_allclose( grad_v, value.grad, "grad_v", atol=atol, rtol=rtol, ) # TODO: Investigate why precision is worse if dtype in [torch.float16, torch.bfloat16]: atol = atol * 2 + 0.15 rtol = rtol * 2 assert_allclose( grad_q, query.grad, "grad_q", atol=atol, rtol=rtol, ) assert_allclose( grad_k, key.grad, "grad_k", atol=atol, rtol=rtol, ) @cuda_only @disable_tf32 @pytest.mark.parametrize("p", [0.3, 0.7]) @pytest.mark.parametrize("k", [5, 6, 32]) @pytest.mark.parametrize("batch_size", [1, 2]) @pytest.mark.parametrize("kv_len", [3, 15, 32, 33]) @pytest.mark.parametrize("q_len", [2, 33]) def test_dropout_backward_small_k(q_len, kv_len, batch_size, k, p): _test_dropout_backward( q_len, kv_len, batch_size, k, p, op=fmha.small_k.FwOp, dtype=torch.float32 ) @cuda_only @disable_tf32 @pytest.mark.parametrize("p", [0.000001, 0.3, 0.7]) @pytest.mark.parametrize("k", [16, 128, 256]) @pytest.mark.parametrize("batch_size", [1, 2]) @pytest.mark.parametrize("kv_len", [3, 248, 256]) @pytest.mark.parametrize("q_len", [3, 248, 256]) @pytest.mark.parametrize("dt", ["f16", "bf16", "f32"]) def test_dropout_backward_cutlass(dt, q_len, kv_len, batch_size, k, p): _test_dropout_backward( q_len, kv_len, batch_size, k, p, op=fmha.cutlass.FwOp, dtype={"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dt], ) @cuda_only @pytest.mark.parametrize("p", [0.000001, 0.3, 0.7]) @pytest.mark.parametrize("k", [16, 64, 128]) @pytest.mark.parametrize("batch_size", [1, 2]) @pytest.mark.parametrize("kv_len", [3, 248, 256]) @pytest.mark.parametrize("q_len", [3, 248, 256]) @pytest.mark.parametrize("dt", ["f16"]) def test_dropout_backward_ck(dt, q_len, kv_len, batch_size, k, p): _test_dropout_backward( q_len, kv_len, batch_size, k, p, op=fmha.ck.FwOp, dtype={"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dt], ) @cuda_only @disable_tf32 @disable_on_rocm @pytest.mark.parametrize("k_len", [32]) @pytest.mark.parametrize("batch_size", [1]) @pytest.mark.parametrize("kv_len", [3 * 32]) @pytest.mark.parametrize("q_len", [3 * 32]) def test_memory_efficient_attention_full_block_masked(q_len, kv_len, batch_size, k_len): device = "cuda" op_fw = fmha.small_k.FwOp op_bw = fmha.small_k.BwOp scale = 3 query = torch.randn((batch_size, q_len, k_len), device=device) * scale key = torch.randn((batch_size, kv_len, k_len), device=device) * scale value = torch.randn((batch_size, kv_len, k_len), device=device) * scale # in this case, most of the blocks in a row get masked attn_bias = torch.full((3, 32), float("-inf"), device=device) attn_bias[:2, :4] = 0 attn_bias = attn_bias.flatten()[None, None, :].expand(1, q_len, -1) out = xformers.ops.memory_efficient_attention( query, key, value, attn_bias, op=(op_fw, op_bw) ) ref = ref_attention_for_test(query, key, value, attn_bias) assert_allclose( out, ref, atol=op_fw.ERROR_ATOL[query.dtype], rtol=op_fw.ERROR_RTOL[query.dtype] ) query.requires_grad_(True) key.requires_grad_(True) value.requires_grad_(True) grad_out = torch.ones_like(query) out = xformers.ops.memory_efficient_attention(query, key, value, attn_bias) out.backward(grad_out) grad_q = query.grad grad_k = key.grad grad_v = value.grad query.grad = None key.grad = None value.grad = None ref = ref_attention_for_test(query, key, value, attn_bias) ref.backward(grad_out) atol = op_bw.ERROR_ATOL[query.dtype] rtol = op_bw.ERROR_RTOL[query.dtype] assert_allclose(grad_q, query.grad, 
"grad_q", atol=atol, rtol=rtol) assert_allclose(grad_k, key.grad, "grad_k", atol=atol, rtol=rtol) assert_allclose(grad_v, value.grad, "grad_v", atol=atol, rtol=rtol) @pytest.mark.parametrize("fmt", ["BMK", "BMHK"]) @parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs def test_lowlevel_api_shapes(opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt): query, key, value, attn_bias = create_tensors( *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt=fmt ) grad_out = torch.ones_like(query) query.requires_grad_(True) key.requires_grad_(True) value.requires_grad_(True) out, lse = xformers.ops.memory_efficient_attention_forward_requires_grad( query, key, value, attn_bias ) assert out.ndim == query.ndim dq, dk, dv = xformers.ops.memory_efficient_attention_backward( grad_out, out, lse, query, key, value, attn_bias ) assert dq.shape == query.shape assert dk.shape == key.shape assert dv.shape == value.shape @parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs def test_cuda_streams( opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, ): ( op, device, dtype, bias_type, batch_size, q_len, kv_len, h, k, kv, ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv if device != "cuda": pytest.skip("Not CUDA") bias_type = None opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = [ op, device, dtype, bias_type, batch_size, q_len, kv_len, h, k, kv, ] s_hipri = torch.cuda.Stream(priority=-1) s_lopri = torch.cuda.Stream(priority=0) query, key, value, attn_bias = create_tensors( *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK" ) torch.cuda.synchronize() with torch.cuda.stream(s_lopri): torch.cuda._sleep(100_000_000) # wait 100m cycles query *= 2 s_hipri.wait_stream(s_lopri) with torch.cuda.stream(s_hipri): # If the kernel is scheduled in the main stream # `query * 2` has not been executed yet out = xformers.ops.memory_efficient_attention(query, key, value, op=(op, None)) # Test that `s_lopri` is still sleeping # and that `query *= 2` has not been executed yet query2_main_stream = query * 2 torch.cuda.synchronize() # TODO: Figure out why this is failing sometimes # The sleep timer seems to be high enough already ... 
# assert torch.allclose(query2_main_stream, query), "Need to increase sleep time" del query2_main_stream ref = ref_attention_for_test(query, key, value) assert out.shape == ref.shape, out.shape assert_allclose( out.float(), ref.float(), atol=op.ERROR_ATOL[dtype], rtol=op.ERROR_RTOL.get(dtype, 1e-5), ) @disable_tf32 @parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs def test_custom_scale(opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv): p = 0.0 scale = 0.1 ( op_bw, device, dtype, _, B, q_len, kv_len, H, k, Kv, ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv torch.manual_seed(q_len + kv_len + k) if device != "cuda": pytest.skip("Not CUDA") query, key, value, attn_bias = create_tensors( *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMK" ) inputs = fmha.Inputs( query=query, key=key, value=value, attn_bias=attn_bias, scale=scale ) op_fw = sample_random_supported_fw(inputs, seed=q_len * k + kv_len * k) grad_out = query.new_ones(B * H, q_len, Kv) query.requires_grad_(True) key.requires_grad_(True) value.requires_grad_(True) reasons = op_fw.not_supported_reasons(inputs) if reasons: pytest.skip(f"{op_fw.NAME}: unsupported ({'/'.join(reasons)})") reasons = op_bw.not_supported_reasons(inputs) if reasons: pytest.skip(f"{op_bw.NAME}: unsupported ({'/'.join(reasons)})") # NOTE: we still need to scale the inputs to not blowup # the pre-softmax values (numerical stability) s = k**-0.5 out = xformers.ops.memory_efficient_attention( query * s, key, value, attn_bias, p, scale, op=(op_fw, op_bw) ) out.backward(grad_out) grad_q, grad_k, grad_v = query.grad, key.grad, value.grad query.grad = key.grad = value.grad = None ref = ref_attention_for_test(query * s, key, value, attn_bias, None, p, scale) ref.backward(grad_out) ref_grad_q, ref_grad_k, ref_grad_v = query.grad, key.grad, value.grad query.grad = key.grad = value.grad = None atol = op_fw.ERROR_ATOL[dtype] rtol = op_fw.ERROR_RTOL[dtype] assert_allclose(out.float(), ref.float(), "out", atol=atol, rtol=rtol) atol = op_bw.ERROR_ATOL[dtype] rtol = op_bw.ERROR_RTOL[dtype] assert_allclose(grad_q, ref_grad_q, "grad_q", atol=atol, rtol=rtol) assert_allclose(grad_k, ref_grad_k, "grad_k", atol=atol, rtol=rtol) assert_allclose(grad_v, ref_grad_v, "grad_v", atol=atol, rtol=rtol) def apply_attention(query, key, value, attn_bias, op_fw, proj): x = xformers.ops.memory_efficient_attention( query, key, value, attn_bias=attn_bias, op=(op_fw, None) ) x = proj(x) return x @pytest.mark.parametrize("use_reentrant", [False, True]) @parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs def test_grad_checkpointing( opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, use_reentrant, ): fmt = "BMHK" ( op, device, dtype, bias_type, batch_size, q_len, kv_len, h, k, kv, ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv if op is fmha.triton_splitk.FwOp: pytest.skip("Triton Flash Decoding doesn't support backward pass yet") if op is fmha.ck.FwOp: pytest.skip("ck-tiled FMHA doesn't supported backward pass yet") bias_type = None opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = ( op, device, dtype, bias_type, batch_size, q_len, kv_len, h, k, kv, ) query, key, value, attn_bias = create_tensors( *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt=fmt, ) qkv = None if ( fmt == "BMHK" and query.shape[3] == value.shape[3] and query.shape[1] == value.shape[1] ): qkv = torch.stack([query, key, value], 2) qkv.requires_grad_(True) # bm3hk -> 3 x bmhk query, key, value = xformers.ops.unbind(qkv, 2) assert not query.is_contiguous() query.requires_grad_(True) key.requires_grad_(True) value.requires_grad_(True) proj = torch.nn.Linear(kv, 
k, device=device, dtype=dtype) x = query for _ in range(5): x = checkpoint( apply_attention, x, key, value, attn_bias, op, proj, use_reentrant=use_reentrant, ) x.mean().backward() ALL_FW_OPS_NO_SMALLK = [op for op in ALL_FW_OPS if op is not fmha.small_k.FwOp] @pytest.mark.parametrize( "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK] ) def test_unsupported_cpu(op: Type[fmha.AttentionFwOpBase]): q = torch.empty([1, 1, 1, 32]) with pytest.raises(ValueError): fmha.memory_efficient_attention(q, q, q, op=(op, None)) @cuda_only @pytest.mark.parametrize( "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK] ) def test_unsupported_stride_lastdim(op: Type[fmha.AttentionFwOpBase]): q = torch.empty([1, 1, 32, 4], device="cuda", dtype=torch.float16).permute( 0, 3, 1, 2 ) try: fmha.memory_efficient_attention(q, q, q, op=(op, None)) except ValueError as e: if "Only work on pre-MLIR triton for now" in str(e): pytest.skip("Only work on pre-MLIR triton for now") q = q.contiguous() fmha.memory_efficient_attention(q, q, q, op=(op, None)) @cuda_only @pytest.mark.parametrize( "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK] ) def test_unsupported_stride_alignment(op: Type[fmha.AttentionFwOpBase]): q = torch.empty([1, 2, 1, 33], device="cuda", dtype=torch.float16)[:, :, :, :32] try: fmha.memory_efficient_attention(q, q, q, op=(op, None)) except ValueError as e: if "Only work on pre-MLIR triton for now" in str(e): pytest.skip("Only work on pre-MLIR triton for now") q = q.contiguous() fmha.memory_efficient_attention(q, q, q, op=(op, None)) @sm75_or_better_only def test_unsupported_dropout_combine_flash_cutlass() -> None: q = torch.empty( [1, 4, 1, 16], device="cuda", dtype=torch.float16, requires_grad=True ) with pytest.raises(ValueError): out = fmha.memory_efficient_attention( q, q, q, p=0.1, op=(fmha.cutlass.FwOp, fmha.flash.BwOp) ) out.backward(out) with pytest.raises(ValueError): out = fmha.memory_efficient_attention( q, q, q, p=0.1, op=(fmha.flash.FwOp, fmha.cutlass.BwOp) ) out.backward(out) def test_attn_bias_causal() -> None: m = -math.inf causal_mask = torch.tensor([[0, m], [0, 0], [0, 0]]) tensor_bias = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]) attn_bias = fmha.attn_bias.LowerTriangularMask() assert_allclose(attn_bias.materialize(causal_mask.shape), causal_mask, "causal") attn_bias = attn_bias.add_bias(tensor_bias) assert_allclose( attn_bias.materialize(causal_mask.shape), tensor_bias + causal_mask, "causal+tensor_bias", ) def test_attn_bias_torch_tensor() -> None: tensor_bias = torch.tensor([[1.0, 2.0, 3.0], [3.0, 4.0, 5.0]]) attn_bias = fmha.attn_bias.LowerTriangularMaskWithTensorBias(tensor_bias) m = -math.inf causal_bias = torch.tensor([[0, m, m], [0, 0, m]]) assert_allclose( attn_bias.materialize((2, 3)), causal_bias + tensor_bias, "tensor_bias+causal" ) def test_attn_bias_blockdiag() -> None: queries = [ torch.randn([1, 3, 1, 8]), torch.randn([1, 2, 1, 8]), torch.randn([1, 5, 1, 8]), ] attn_bias, q = fmha.BlockDiagonalMask.from_tensor_list(queries) # Verify mask as_tensor = attn_bias.materialize((10, 10)) assert int((as_tensor != -math.inf).sum().item()) == 3 * 3 + 2 * 2 + 5 * 5 assert_allclose(as_tensor[0:3, 0:3], torch.zeros([3, 3]), "batch0") assert_allclose(as_tensor[3:5, 3:5], torch.zeros([2, 2]), "batch1") assert_allclose(as_tensor[5:, 5:], torch.zeros([5, 5]), "batch2") # Verify we can split it back queries2 = attn_bias.split(q) assert len(queries) == len(queries2) for q1, q2 in zip(queries, queries2): 
assert_allclose(q1, q2) def test_attn_bias_blockdiag_batched() -> None: queries = [ torch.randn([1, 3, 1, 8]), torch.randn([3, 2, 1, 8]), torch.randn([1, 5, 1, 8]), ] attn_bias, q = fmha.BlockDiagonalMask.from_tensor_list(queries) # Verify mask as_tensor = attn_bias.materialize((14, 14)) assert int((as_tensor != -math.inf).sum().item()) == 3 * 3 + 3 * 2 * 2 + 5 * 5 assert_allclose(as_tensor[0:3, 0:3], torch.zeros([3, 3]), "batch0") assert_allclose(as_tensor[3:5, 3:5], torch.zeros([2, 2]), "batch1.0") assert_allclose(as_tensor[5:7, 5:7], torch.zeros([2, 2]), "batch1.1") assert_allclose(as_tensor[7:9, 7:9], torch.zeros([2, 2]), "batch1.2") assert_allclose(as_tensor[9:, 9:], torch.zeros([5, 5]), "batch2") # Verify we can split it back queries2 = attn_bias.split(q) assert len(queries) == len(queries2) for q1, q2 in zip(queries, queries2): assert_allclose(q1, q2) def test_attn_bias_blockdiag_crossattn_causal() -> None: # Q / KV have different seqlen list_q = [ torch.randn([1, 3, 1, 8]), torch.randn([2, 1, 1, 8]), ] list_k = [ torch.randn([1, 2, 1, 8]), torch.randn([2, 3, 1, 8]), ] attn_bias, q, k, _ = fmha.attn_bias.BlockDiagonalMask.from_tensor_lists_qkv( list_q, list_k ) # Verify mask as_tensor = attn_bias.materialize((q.shape[1], k.shape[1])) assert int((as_tensor != -math.inf).sum().item()) == 3 * 2 + 2 * 3 * 1 assert_allclose(as_tensor[0:3, 0:2], torch.zeros([3, 2]), "batch0") assert_allclose(as_tensor[3:4, 2:5], torch.zeros([1, 3]), "batch1.0") assert_allclose(as_tensor[4:, 5:], torch.zeros([1, 3]), "batch1.1") # Also test causal version as_tensor = attn_bias.make_causal().materialize((q.shape[1], k.shape[1])) assert_allclose( as_tensor[3:4, 2:5], fmha.attn_bias.LowerTriangularMask().materialize((1, 3)), "batch1.0[causal]", ) # Verify we can split it back list_q2 = attn_bias.split_queries(q) assert len(list_q) == len(list_q2) for q1, q2 in zip(list_q, list_q2): assert_allclose(q1, q2) with pytest.raises(ValueError): attn_bias.split_queries(k) list_k2 = attn_bias.split_kv(k) assert len(list_k) == len(list_k2) for k1, k2 in zip(list_k, list_k2): assert_allclose(k1, k2) def test_attn_bias_blockdiag_crossattn_causal_with_prefix_qk_cond() -> None: list_q = [ torch.randn([1, 3, 1, 8]), ] list_k = [ torch.randn([1, 2, 1, 8]), ] attn_bias, q, k, _ = fmha.attn_bias.BlockDiagonalMask.from_tensor_lists_qkv( list_q, list_k ) with pytest.raises(ValueError): attn_bias.make_causal_from_bottomright() def test_attn_bias_blockdiag_crossattn_causal_with_prefix() -> None: # Q / KV have different seqlen list_q = [ torch.randn([1, 2, 1, 8]), torch.randn([2, 2, 1, 8]), ] list_k = [ torch.randn([1, 2, 1, 8]), torch.randn([2, 5, 1, 8]), ] attn_bias, q, k, _ = fmha.attn_bias.BlockDiagonalMask.from_tensor_lists_qkv( list_q, list_k ) as_tensor = attn_bias.make_causal_from_bottomright().materialize( (q.shape[1], k.shape[1]) ) m = -math.inf assert_allclose( as_tensor[0:2, 0:2], torch.tensor([[0, m], [0, 0]], dtype=torch.float32), "batch1.1[causal_with_prefix]", ) assert_allclose( as_tensor[2:4, 2:7], torch.tensor([[0, 0, 0, 0, m], [0, 0, 0, 0, 0]], dtype=torch.float32), "batch2.1[causal_with_prefix]", ) assert_allclose( as_tensor[4:6, 7:12], torch.tensor([[0, 0, 0, 0, m], [0, 0, 0, 0, 0]], dtype=torch.float32), "batch2.2[causal_with_prefix]", ) @cuda_only def test_attn_bias_padded() -> None: bsize, n_heads, d, padding = 8, 3, 8, 32 torch.manual_seed(0) # Q / KV have different seqlen k = torch.randn((bsize, padding, n_heads, d), device="cuda", dtype=torch.float16) k_seqlen = [5, 8, 7, 1, 9, 3, 12, 32] other = bsize - 1 
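    # The query side built below is one sequence of n_q_first tokens followed by
    # (bsize - 1) single-token sequences; `other` is that single-token count.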
v = torch.randn((bsize, padding, n_heads, d), device="cuda", dtype=torch.float16) n_q_first = 4 q = [ torch.randn((1, n_q_first, n_heads, d), device="cuda", dtype=torch.float16), torch.randn((1, other, n_heads, d), device="cuda", dtype=torch.float16), ] q_cat = torch.cat([x.view(1, -1, n_heads, d) for x in q], dim=1) q_seqlen = [n_q_first] + [1] * other attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens( q_seqlen=q_seqlen, kv_seqlen=k_seqlen, kv_padding=padding, ) v = v.view(1, -1, n_heads, d) k = k.view(1, -1, n_heads, d) scores = (q_cat.transpose(1, 2) @ k.transpose(1, 2).transpose(2, 3)).float() assert not scores.isnan().any() mask = torch.full_like(scores, -float("inf")) for i, (slen, qlen) in enumerate(zip(k_seqlen, q_seqlen)): kseq_start = i * padding qstart = sum(q_seqlen[:i]) mask[:, :, qstart : qstart + qlen, kseq_start : kseq_start + slen] = torch.triu( mask[:, :, qstart : qstart + qlen, kseq_start : kseq_start + slen].float(), diagonal=1 + slen - qlen, ).float() scores += mask assert not scores.isnan().any() # 1,3,10,8 @ 1,3,8,256 -> 1,3,10,256 scores = torch.nn.functional.softmax(scores, -1).half() # torch.Size([1, 3, 3, 32]) @ torch.Size([1, 3, 32, 8]) output = scores @ v.transpose(1, 2) # 1,3,10,256 @ 1,3,256, 8 -> 1,3,10,8 output = output.transpose(1, 2).contiguous() fmha_output = fmha.memory_efficient_attention_forward( q_cat, k, v, attn_bias, scale=1.0 ) # assert torch.allclose(output, fmha_output) assert_allclose( output, fmha_output, atol=fmha.cutlass.FwOp.ERROR_ATOL[torch.float16], rtol=fmha.cutlass.FwOp.ERROR_RTOL[torch.float16], ) @cuda_only def test_attn_bias_to_copy() -> None: def _test_to_copy(attn_bias: torch.Tensor) -> None: assert attn_bias.device.type == "cpu", f"{attn_bias.device}" attn_bias_cuda = attn_bias.cuda() assert attn_bias_cuda.device.type == "cuda", f"{attn_bias_cuda.device}" attn_bias_fp16 = attn_bias.to(torch.float16) assert attn_bias_fp16.device.type == "cpu", f"{attn_bias_fp16.device}" assert attn_bias_fp16.dtype == torch.float16, f"{attn_bias_fp16.dtype}" attn_bias = fmha.attn_bias.LowerTriangularMask().to("cpu") _test_to_copy(attn_bias) tensor_bias = torch.tensor([[1.0, 2.0, 3.0], [3.0, 4.0, 5.0]]) attn_bias = fmha.attn_bias.LowerTriangularMaskWithTensorBias(tensor_bias).to("cpu") _test_to_copy(attn_bias) def _kv_heads_label(kv_heads: Optional[int]) -> str: if kv_heads is None: return "" if kv_heads == 1: return "mq" return f"gqa{kv_heads}" @sm70_or_better_only @pytest.mark.parametrize( "op", [ fmha.decoder.FwOp if torch.version.cuda else fmha.ck_decoder.FwOp, ], ) @pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label) @pytest.mark.parametrize("bsz,n_heads", [(1, 1), (1, 16), (1, 32), (8, 1), (4, 8)]) @pytest.mark.parametrize("padding", [32, 4096]) @pytest.mark.parametrize("dtype", ["f16", "bf16", "f32"]) def test_decoder( op, n_heads: int, kv_heads: Optional[int], padding: int, bsz: int, dtype: str, dequant: bool = False, num_queries: int = 1, d: int = 128, ) -> None: # kv_heads = 1: multiquery # kv_heads = None: neither MQA nor GQA # kv_heads > 1: BMGHK if dtype == "bf16" and compute_capability < (8, 0): raise pytest.skip("BF16 is only supported on SM80+") import triton if dequant and triton.__version__[:4] < "3.0.": raise pytest.skip("dequant needs triton updates") dtype_ = {"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dtype] torch.manual_seed(1) if kv_heads is not None and kv_heads > 1: k_shape: Tuple[int, ...] 
= (1, bsz * padding, kv_heads, n_heads, d) q_shape: Tuple[int, ...] = ( 1, bsz * num_queries, kv_heads, n_heads, d, ) else: k_shape = (1, bsz * padding, n_heads, d) q_shape = (1, bsz * num_queries, n_heads, d) # TODO: support 2 kv heads etc. k = torch.randn(k_shape, dtype=dtype_, device="cuda") k_seqlen = torch.randint(num_queries, padding + 1, (bsz,)).tolist() v = torch.randn(k_shape, dtype=dtype_, device="cuda") q = torch.randn(q_shape, dtype=dtype_, device="cuda") if dequant: k_shape = k_shape[:-1] + (d // 8 + op.NUM_GROUPS,) k = torch.zeros(k_shape, dtype=torch.int32, device="cuda") k.random_() k[..., : op.NUM_GROUPS].view(torch.float16).fill_(1.0) v = torch.zeros(k_shape, dtype=torch.int32, device="cuda") v.random_() v[..., : op.NUM_GROUPS].view(torch.float16).fill_(1.0) if kv_heads is not None: k = k[..., :1, :].expand(k_shape) v = v[..., :1, :].expand(k_shape) if skip_reasons := op.not_supported_reasons(fmha.Inputs(q, k, v)): pytest.skip("; ".join(skip_reasons)) attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens( q_seqlen=[num_queries] * bsz, kv_seqlen=k_seqlen, kv_padding=padding, ) decoder_output = fmha.memory_efficient_attention_forward( q, k, v, attn_bias, op=op, ) def dequant_cache(x): x = x[..., op.NUM_GROUPS :, None].expand(k_shape[:-1] + (d // 8, 8)) x = x // (2 ** (4 * torch.arange(8, device="cuda"))) x = (x % 16).flatten(start_dim=-2) return x.to(dtype_) + 1.0 if dequant: k = dequant_cache(k) v = dequant_cache(v) ref_output = ref_attention_for_test(q, k, v, attn_bias) assert_allclose( decoder_output.to(ref_output.dtype), ref_output, atol=op.ERROR_ATOL[dtype_] * 4, rtol=op.ERROR_RTOL[dtype_], ) @sm80_or_better_only @pytest.mark.parametrize( "op,dequant,dtype", [ (fmha.triton_splitk.FwOp_S1, False, "bf16"), (fmha.triton_splitk.FwOp_S2, False, "f16"), (fmha.triton_splitk.FwOp_S2, True, "bf16"), ( type( "S2_8", (fmha.triton_splitk.FwOp_S2,), {"NUM_GROUPS": 8, "NAME": "S2_8"} ), True, "bf16", ), ], ) @pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label) @pytest.mark.parametrize("n_heads", [16]) @pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1)]) def test_triton_splitk_decoder( op, dequant: bool, kv_heads: Optional[int], n_heads: int, padding: int, bsz: int, dtype: str, ) -> None: # We omit dequant with f16: it needs a very high tol test_decoder( op, kv_heads=kv_heads, n_heads=n_heads, padding=padding, bsz=bsz, dtype=dtype, dequant=dequant, ) @rocm_only @pytest.mark.parametrize( "op", [fmha.ck_splitk.FwOp_S1, fmha.ck_splitk.FwOp_S2, fmha.ck_splitk.FwOp_S4] ) @pytest.mark.parametrize("dtype", ["f32"]) @pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label) @pytest.mark.parametrize("n_heads", [16]) @pytest.mark.parametrize("d", [128, 256]) @pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1), (32, 1), (4096, 8)]) def test_ck_splitk_decoder( op, kv_heads: Optional[int], n_heads: int, padding: int, bsz: int, dtype: str, d: int, ) -> None: # no quantized impl compared to cuda test_decoder( op, kv_heads=kv_heads, n_heads=n_heads, padding=padding, bsz=bsz, dtype=dtype, d=d, ) @sm80_or_better_only @pytest.mark.parametrize( "op", [ fmha.triton_splitk.FwOp_S1, fmha.triton_splitk.FwOp_S2, ], ids=lambda op: f"splitk{op.SPLIT_K}", ) @pytest.mark.parametrize("multiquery", [True, False], ids=lambda x: "mq" if x else "") # n_heads=1 => it's ambiguous whether can count as multiquery @pytest.mark.parametrize("padding, bsz", [(32, 8), (44, 1)]) @pytest.mark.parametrize("dtype", ["f16", "bf16"]) 
@pytest.mark.parametrize("n_heads, num_queries", [(2, 4), (2, 5), (6, 7), (20, 3)]) def test_triton_splitk_decoder_manyqueries( op, multiquery: bool, n_heads: int, padding: int, bsz: int, dtype: str, num_queries: int, ) -> None: kv_heads = 1 if multiquery else None test_decoder( op, kv_heads=kv_heads, n_heads=n_heads, padding=padding, bsz=bsz, dtype=dtype, num_queries=num_queries, dequant=False, ) def test_attn_bias_from_seqlens() -> None: bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens([3, 5, 1]) out = bias.split(torch.randn([1, 3 + 5 + 1, 16])) assert len(out) == 3 assert tuple(out[0].shape) == (1, 3, 16) @cuda_only def test_attn_bias_blockdiag_doc() -> None: """IMPORTANT: This is the example in the doc for `BlockDiagonalMask`. If this example needs to be updated, please also update the doc """ import torch from xformers.ops import fmha if torch.version.hip: pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!") K = 16 dtype = torch.float16 device = "cuda" list_x = [ torch.randn([1, 3, 1, K], dtype=dtype, device=device), torch.randn([1, 6, 1, K], dtype=dtype, device=device), torch.randn([1, 2, 1, K], dtype=dtype, device=device), ] attn_bias, x = fmha.BlockDiagonalMask.from_tensor_list(list_x) linear = torch.nn.Linear(K, K * 3).to(device=device, dtype=dtype) # type: ignore q, k, v = linear(x).reshape([1, -1, 1, 3, K]).unbind(-2) out = fmha.memory_efficient_attention(q, k, v, attn_bias=attn_bias) list_out = attn_bias.split(out) assert tuple(list_out[0].shape) == (1, 3, 1, K) @cuda_only class TestAttnBias: @staticmethod def create_tensors( dtype, B: int = 2, Mq: int = 32, Mkv: int = 32, H: int = 3, K: int = 16, Kv: int = 16, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: return ( torch.randn([B, Mq, H, K], device="cuda", dtype=dtype) * 3, torch.randn([B, Mkv, H, K], device="cuda", dtype=dtype) * 3, torch.randn([B, Mkv, H, Kv], device="cuda", dtype=dtype) * 3, torch.randn([B, H, Mq, Mkv], device="cuda", dtype=dtype) * 3, ) @staticmethod def pad_bias(bias: torch.Tensor) -> torch.Tensor: align_to = 16 if (bias.shape[-1] % align_to) == 0: return bias pad_count = align_to - (bias.shape[-1] % align_to) return torch.nn.functional.pad(bias, [0, pad_count])[:, :, :, : bias.shape[-1]] def test_f16_biasf32(self) -> None: q, k, v, bias = self.create_tensors(torch.float16) fmha.memory_efficient_attention(q, k, v, attn_bias=bias) bias = bias.to(torch.float32) with pytest.raises((ValueError, RuntimeError)): fmha.memory_efficient_attention(q, k, v, attn_bias=bias) @disable_on_rocm def test_f32_biasf16(self) -> None: q, k, v, bias = self.create_tensors(torch.float32) fmha.memory_efficient_attention(q, k, v, attn_bias=bias) bias = bias.to(torch.float16) with pytest.raises((ValueError, RuntimeError)): fmha.memory_efficient_attention(q, k, v, attn_bias=bias) @pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) def test_wrong_alignment(self, dtype) -> None: op = fmha.cutlass.FwOp if torch.version.cuda else fmha.ck.FwOp if dtype not in op.SUPPORTED_DTYPES: pytest.skip( f"{dtype=} is not supported by {op.__module__}.{op.__qualname__}" ) q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=5) try: fmha.memory_efficient_attention(q, k, v, attn_bias=bias, op=(op, None)) return except (ValueError, RuntimeError): pass # This case is not supported, likely due to padding issues # Let's make sure it works with padding assert bias.ndim == 4, bias.shape bias_padded = self.pad_bias(bias) out = fmha.memory_efficient_attention( q, k, v, attn_bias=bias_padded, 
op=(op, None) ).float() ref_out = ref_attention_bmhk_for_test(q, k, v, bias) assert_allclose( out, ref_out, atol=op.ERROR_ATOL[dtype], rtol=op.ERROR_RTOL[dtype] ) def test_permuted_attn_bias(self) -> None: op = fmha.cutlass.FwOp dtype = torch.float16 q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=7) bias = bias.transpose(-1, -2) # now `stride(-1) != 1` # Either it works, or it raises an exception # but we should never get a CUDA error try: out = fmha.memory_efficient_attention( q, k, v, attn_bias=bias, op=(op, None) ).float() ref_out = ref_attention_bmhk_for_test(q, k, v, bias) assert_allclose( out, ref_out, atol=op.ERROR_ATOL[dtype], rtol=op.ERROR_RTOL[dtype] ) except (ValueError, RuntimeError): pass SM_AND_SHMEM_KBYTES = [ # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability (50, 64), (60, 64), (70, 96), (75, 64), (80, 163), (86, 99), (89, 99), # (90, 227), ] @cuda_only @disable_on_rocm @skip_if_pt_cutlass @pytest.mark.parametrize("dtype_str", ["f32", "f16", "bf16"]) @pytest.mark.parametrize( "sm_shmem", SM_AND_SHMEM_KBYTES, ids=[f"cc{sm}_shmem{shmem}kb" for sm, shmem in SM_AND_SHMEM_KBYTES], ) def test_has_kernel_for(sm_shmem: Tuple[int, int], dtype_str: str) -> None: dtype = {"f32": torch.float, "f16": torch.half, "bf16": torch.bfloat16}[dtype_str] sm, shmem_kbytes = sm_shmem if sm < 80 and dtype_str == "bf16": return for k in [16, 32, 64, 128, 256]: assert torch.ops.xformers._has_cutlassF_kernel_for( dtype, sm, shmem_kbytes * 1024, k ), f"k={k}" assert torch.ops.xformers._has_cutlassB_kernel_for( dtype, sm, shmem_kbytes * 1024, k ), f"k={k}" def test_window_size_materialize() -> None: seqlens = [4, 6] attn_bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens( q_seqlen=seqlens, kv_seqlen=seqlens, ).make_local_attention(2) mask = attn_bias.materialize( (1, 1, sum(seqlens), sum(seqlens)), device="cpu", dtype=torch.float32, ) true_mask = torch.log( torch.Tensor( [ [ [ [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0], ] ] ] ) ) assert torch.all(mask == true_mask) @cuda_only @pytest.mark.parametrize("Mq", [1, 512]) @pytest.mark.parametrize( "opFW_biasT", [ (op, biasT) for op in ALL_FW_OPS_NO_UNPADDED_LSE for biasT in op.SUPPORTED_ATTN_BIAS_TYPES if op.SUPPORTS_BMGHK ], ids=lambda o: f"{o[0].NAME}-{o[1].__name__}" if isinstance(o, tuple) else "", ) def test_forward_gqa(opFW_biasT, Mq: int): opFW, biasT = opFW_biasT if Mq < 512 and ( issubclass(biasT, fmha.attn_bias.LowerTriangularMask) or issubclass(biasT, fmha.attn_bias.BlockDiagonalCausalMask) ): pytest.skip("undefined upper left") B_Mq_Mkv_H_K_Kv = (3, Mq, 512, 16, 128, 128) test_forward( ( opFW, "cuda", torch.float16, biasT, *B_Mq_Mkv_H_K_Kv, ), packed=False, fmt="BMGHK", g=2, ) @cuda_only @pytest.mark.parametrize( "opBW", [ fmha.flash.BwOp, fmha.cutlass.BwOp, ], ) def test_backward_gqa(opBW): H = 8 B_Mq_Mkv_H_K_Kv = (3, 512, 512, H, 128, 128) dtype = torch.float16 query, key, value, attn_bias = create_tensors( *(opBW, "cuda", dtype, type(None), *B_Mq_Mkv_H_K_Kv), 
attn_bias_requires_grad=False, fmt="BMHK", ) op = (fmha.cutlass.FwOp, opBW) key = key[:, :, :1].expand(-1, -1, H, -1) value = value[:, :, :1].expand(-1, -1, H, -1) key.requires_grad_(True) out = fmha.memory_efficient_attention(query, key, value, attn_bias=attn_bias) out_ref = ref_attention_bmhk_for_test(query, key, value, attn_bias=attn_bias) assert_allclose( out.float(), out_ref.float(), atol=op[0].ERROR_ATOL[dtype], rtol=op[0].ERROR_RTOL[dtype], ) out.backward(query) dk = key.grad key.grad = None out_ref.backward(query) assert_allclose( dk.float(), key.grad.float(), atol=op[1].ERROR_ATOL[dtype], rtol=op[1].ERROR_RTOL[dtype], ) @cuda_only @pytest.mark.parametrize( "opFW", [op for op in ALL_FW_OPS_NO_UNPADDED_LSE if op.SUPPORTS_BMGHK] ) def test_forward_gqa_one_group(opFW): dtype = torch.float16 B, Mq, Mkv, H, K = 3, 13, 16, 5, 128 q = torch.randn([B, Mq, 1, H, K], dtype=dtype, device="cuda") * 3 k = torch.randn([B, Mkv, 1, H, K], dtype=dtype, device="cuda") * 3 v = torch.randn([B, Mkv, 1, H, K], dtype=dtype, device="cuda") * 3 supported = opFW.supports(fmha.Inputs(q, k, v)) if not supported: supported_bmhk = opFW.supports(fmha.Inputs(q[:, :, 0], k[:, :, 0], v[:, :, 0])) assert supported == supported_bmhk pytest.skip("not supported") out = fmha.memory_efficient_attention_forward(q, k, v, op=opFW) ref = ref_attention_for_test(q, k, v) assert_allclose( out.float(), ref, atol=opFW.ERROR_ATOL[dtype], rtol=opFW.ERROR_RTOL.get(dtype, 1e-5), ) @sm80_or_better_only @disable_on_rocm def test_flash_gqa_wrong_strides() -> None: op = (fmha.flash.FwOp, None) device = "cuda" B, Mq, Mkv, G, H, K = 3, 1, 512, 2, 8, 128 q = torch.empty((B, Mq, G, H, K), dtype=torch.float16, device=device) kv = torch.empty((B, Mkv, G, H, K), dtype=torch.float16, device=device) fmha.memory_efficient_attention(q, kv, kv, op=op) kv = torch.empty((B, Mkv, H, G, K), dtype=torch.float16, device=device).permute( 0, 1, 3, 2, 4 ) with pytest.raises(ValueError): fmha.memory_efficient_attention(q, kv, kv, op=op) kv = torch.empty((B, Mkv, G, 1, K), dtype=torch.float16, device=device) with pytest.raises(ValueError): fmha.memory_efficient_attention(q, kv, kv, op=op) kv = kv.expand(-1, -1, -1, H, K) fmha.memory_efficient_attention(q, kv, kv, op=op) kv = torch.empty((B, Mkv, G, H, 2 * K), dtype=torch.float16, device=device)[ :, :, :, :, :K ] fmha.memory_efficient_attention(q, kv, kv, op=op) def _dispatches_to_splitK(q, kv): return ( _dispatch_fw_priority_list(fmha.Inputs(q, kv, kv), False)[0] is fmha.triton_splitk.FwOp ) def _dispatches_to_flash_decoding(q, kv): return ( _dispatch_fw_priority_list(fmha.Inputs(q, kv, kv), False)[0] is fmha.flash.FwOp ) @disable_on_rocm def test_dispatch_decoding_bmhk() -> None: assert not _dispatches_to_splitK( torch.empty([1, 8, 1, 128]), torch.empty([1, 2048, 1, 128]) ), "Should not use SplitK with 1 head (no tensorcores)" assert _dispatches_to_flash_decoding( torch.empty([1, 8, 32, 128]), torch.empty([1, 2048, 1, 128]).expand(-1, -1, 32, -1), ), "Should use Flash-Decoding with BMHK MQA" assert not _dispatches_to_splitK( torch.empty([1, 8, 32, 128]), torch.empty([1, 2048, 32, 128]), ), "Should not use SplitK when no TensorCores" assert not _dispatches_to_splitK( torch.empty([1, 128, 32, 128]), torch.empty([1, 2048, 1, 128]).expand(-1, -1, 32, -1), ), "Should not use SplitK if q seqlen is long" assert not _dispatches_to_splitK( torch.empty([128, 8, 32, 128]), torch.empty([128, 2048, 1, 128]).expand(-1, -1, 32, -1), ), "Should not use SplitK if B is big" @disable_on_rocm def test_dispatch_decoding_bmghk() 
-> None: assert not _dispatches_to_splitK( torch.empty([1, 8, 1, 1, 128]), torch.empty([1, 2048, 1, 1, 128]) ), "Should not use SplitK with 1 head (no tensorcores)" assert _dispatches_to_flash_decoding( torch.empty([1, 8, 1, 32, 128]), torch.empty([1, 2048, 1, 1, 128]).expand(-1, -1, -1, 32, -1), ), "Should use Flash-Decoding with MQA" assert _dispatches_to_flash_decoding( torch.empty([1, 8, 4, 32, 128]), torch.empty([1, 2048, 4, 1, 128]).expand(-1, -1, -1, 32, -1), ), "Should use Flash-Decoding with GQA" assert not _dispatches_to_splitK( torch.empty([1, 8, 1, 32, 128]), torch.empty([1, 2048, 1, 32, 128]), ), "Should not use SplitK when no TensorCores" assert not _dispatches_to_splitK( torch.empty([1, 128, 1, 32, 128]), torch.empty([1, 2048, 1, 1, 128]).expand(-1, -1, -1, 32, -1), ), "Should not use SplitK if q seqlen is long" assert not _dispatches_to_splitK( torch.empty([128, 8, 1, 32, 128]), torch.empty([128, 2048, 1, 1, 128]).expand(-1, -1, -1, 32, -1), ), "Should not use SplitK if B is big" shapes_triton_splitk = [ (1, 8, 2**16, 1, 128, 128), (1, 4, 2**16, 1, 128, 128), (1, 16, 2**16, 1, 128, 128), (1, 16, 2**16, 1, 32, 32), (1, 8, 1025, 1, 128, 128), (2, 8, 4096, 1, 128, 128), (10, 8, 2**16, 1, 128, 128), (10, 15, 2**16, 1, 128, 128), (1, 3, 2**16, 1, 128, 128), (1, 3, 2**16 - 10, 1, 128, 128), (2, 3, 73, 1, 128, 128), (2, 7, 7328, 1, 128, 128), (2, 7, 7328, 1, 120, 120), (2, 7, 63, 1, 120, 120), ] op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_splitk = [ (fmha.triton_splitk.FwOp, "cuda", torch.float16, type(None), *s) for s in shapes_triton_splitk ] + [ (fmha.triton_splitk.FwOp, "cuda", torch.bfloat16, type(None), *s) for s in shapes_triton_splitk ] @pytest.mark.parametrize( "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv", op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_splitk, ids=[make_id(*c) for c in op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_splitk], ) @cuda_only def test_forward_splitk( opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed=False, fmt="BMHK", ): test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed=packed, fmt=fmt) @cuda_only @pytest.mark.parametrize( "op", [fmha.triton_splitk.FwOp, fmha.flash.FwOp, fmha.ck.FwOp], ids=lambda op: op.NAME, ) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=str) @pytest.mark.parametrize( "B_Mkv_H_K", [ (1, 2**16, 3, 128), (5, 53, 4, 64), (7, 51, 4, 256), (3, 51, 2, 512), ], ) def test_mqa_decoding(op: Type[fmha.AttentionFwOpBase], dtype, B_Mkv_H_K): B, Mkv, H, K = B_Mkv_H_K q = torch.randn([B, 1, H, K], dtype=dtype, device="cuda") * 3 k = torch.randn([B, Mkv, 1, K], dtype=dtype, device="cuda") * 3 v = torch.randn([B, Mkv, 1, K], dtype=dtype, device="cuda") * 3 k = k.expand(-1, -1, H, -1) v = v.expand(-1, -1, H, -1) if skip_reasons := op.not_supported_reasons(fmha.Inputs(q, k, v)): pytest.skip("; ".join(skip_reasons)) out = fmha.memory_efficient_attention_forward(q, k, v, op=op) ref = ref_attention_for_test(q, k, v) assert_allclose( out.float(), ref, atol=op.ERROR_ATOL[dtype], rtol=op.ERROR_RTOL.get(dtype, 1e-5), ) @parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs def test_empty_tensors_empty_query( opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, ): query, key, value, attn_bias = create_tensors( *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK", ) opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0] if torch.version.hip: pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!") query = query[:, :0] query.requires_grad_(True) key.requires_grad_(True) value.requires_grad_(True) out = 
xformers.ops.memory_efficient_attention(query, key, value, op=(opFW, None)) assert out.shape[1] == 0 out.backward(out) # dK/dV should be all zeros assert_allclose(key.grad, torch.zeros_like(key.grad), "key.grad") assert_allclose(value.grad, torch.zeros_like(value.grad), "value.grad") @parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs def test_empty_tensors_empty_kv( opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, ): query, key, value, attn_bias = create_tensors( *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK", ) opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0] if opFW == fmha.triton_splitk.FwOp: pytest.skip("triton_splitk doesn't support empty kv") if torch.version.hip: pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!") key = key[:, :0] value = value[:, :0] query.requires_grad_(True) key.requires_grad_(True) value.requires_grad_(True) out = xformers.ops.memory_efficient_attention(query, key, value, op=(opFW, None)) assert_allclose(out, torch.zeros_like(out), "out") out.backward(out) # dQ should be all zeros assert_allclose(query.grad, torch.zeros_like(query.grad), "query.grad") @parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs def test_empty_tensors_empty_b( opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, ): query, key, value, attn_bias = create_tensors( *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK", ) opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0] if torch.version.hip: pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!") query, key, value = query[:0], key[:0], value[:0] query.requires_grad_(True) key.requires_grad_(True) value.requires_grad_(True) out = xformers.ops.memory_efficient_attention(query, key, value, op=(opFW, None)) out.backward(out) def test_local_attn_bias() -> None: mask = ( fmha.attn_bias.LocalAttentionFromBottomRightMask(window_left=1, window_right=2) .materialize(shape=(4, 4)) .exp() ) expected = torch.tensor( [[1, 1, 1, 0], [1, 1, 1, 1], [0, 1, 1, 1], [0, 0, 1, 1]], dtype=torch.float32 ) assert (mask == expected).all().item() @cuda_only @disable_on_rocm @skip_if_pt_cutlass @pytest.mark.parametrize("cc", [60, 70, 80]) @pytest.mark.parametrize("maxK", [32, 64, 128, 256]) @pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) @pytest.mark.parametrize( "custom_mask_type", [ fmha.cutlass._CustomMaskType.NoCustomMask, fmha.cutlass._CustomMaskType.CausalFromTopLeft, fmha.cutlass._CustomMaskType.CausalFromBottomRight, ], ) @pytest.mark.parametrize("window_size", [0, 3, 300]) @pytest.mark.parametrize( "num_queries,num_keys", [ (30, 66), (256, 256), # Edge cases (314, 320), (32, 256), (224, 226), (5, 531), (320, 332), # for win_size=300 # Others (256, 62), (256, 63), (256, 64), (256, 65), (256, 66), ], ) def test_cutlassB_iter_order( dtype, cc: int, maxK: int, num_queries: int, num_keys: int, custom_mask_type, window_size, ) -> None: """ This tests some internals of the cutlassB kernel We test the iteration across blocks of [queries, keys] to ensure that we correctly: * Iterate over all the blocks that should be iterated * Do *not* iterate over blocks that are completely masked out * Correctly compute the number of parallel blocks that will compute the same block of dQ .. 
and we test this across variable causal masks+local attention combinations """ if ( window_size > 0 and custom_mask_type == fmha.cutlass._CustomMaskType.NoCustomMask ): pytest.skip("LocalAttention is only supported for causal") get_iteration_data = partial( torch.ops.xformers._cutlassB_iteration_data, dtype=dtype, cc=cc, maxK=maxK, num_queries=num_queries, num_keys=num_keys, custom_mask_type=custom_mask_type, window_size=window_size, ) bias = torch.zeros([num_queries, num_keys], dtype=torch.float32) if custom_mask_type != fmha.cutlass._CustomMaskType.NoCustomMask: bias = fmha.attn_bias._materialize_causal_mask( (num_queries, num_keys), dtype=torch.float32, device="cpu", window_size=None if window_size == 0 else window_size, from_bottomright=( custom_mask_type == fmha.cutlass._CustomMaskType.CausalFromBottomRight ), ) block_queries, block_keys = get_iteration_data()[:2] mask_pooled = ( F.max_pool2d(bias.unsqueeze(0), (block_queries, block_keys), ceil_mode=True) == 0 ).int()[0] attn_computed = torch.zeros_like(mask_pooled) for key_start in range(0, num_keys, block_keys): it = 0 new_key_start = key_start new_query_start = get_iteration_data(key_start=key_start)[2] try: expected_first_query = ( mask_pooled[:, key_start // block_keys].tolist().index(1) * block_queries ) assert ( new_query_start == expected_first_query ), f"Wrong first query for K={key_start}: {new_query_start} (expected {expected_first_query})" except ValueError: # Nothing to compute in this column pass while new_key_start == key_start and new_query_start < num_queries: query_start = new_query_start attn_computed[query_start // block_queries, key_start // block_keys] += 1 # print(f"Compute [{query_start}, {key_start}]") # Is there something to compute here? assert mask_pooled[ query_start // block_queries, key_start // block_keys ].item(), "Computing a block that is not needed!" new_query_start, new_key_start = get_iteration_data( key_start=key_start, query_start=query_start )[3:5] it += 1 assert it < num_queries, "" assert (attn_computed == mask_pooled)[ :, key_start // block_keys ].all(), "some blocks were not computed!" 
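    # At this point every [query, key] block that the pooled mask marks as
    # non-empty has been visited exactly once (checked column by column above),
    # and no fully-masked block was ever touched.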
# Now check that the number returned by `getNumParallelBlocksForQuery` is correct for query_start in range(0, num_queries, block_queries): num_parallel_blocks = get_iteration_data( query_start=query_start, num_splits_key=num_keys )[5] num_actual = mask_pooled[query_start // block_queries].sum().item() assert num_parallel_blocks == num_actual @sm80_or_better_only @pytest.mark.parametrize("B", [1, 5, 128]) @pytest.mark.parametrize("MAX_T", [64, 128, 2048, 4096, 8192]) @pytest.mark.parametrize( "op", [ fmha.triton_splitk.FwOp, fmha.triton_splitk.FwOp_S8, fmha.triton_splitk.FwOp_Map[48], ], ids=lambda op: op.NAME, ) @pytest.mark.parametrize("num_quant_groups", [0, 1, 8]) @pytest.mark.parametrize("page_size", [64, 128, 256]) @pytest.mark.parametrize("gappy", [False, True], ids=lambda x: "gappy" if x else "") def test_paged_attention( B, MAX_T: int, num_quant_groups: int, page_size: int, op: Type[AttentionFwOpBase], gappy: bool, ): paged_attention_run_inner( B, MAX_T, num_quant_groups, page_size, op, bench=False, gappy=gappy ) @sm80_or_better_only @disable_on_rocm @pytest.mark.parametrize("B", [1, 5, 128]) @pytest.mark.parametrize("MAX_T", [64, 128, 2048, 4096, 8192]) @pytest.mark.parametrize("page_size", [256]) def test_paged_attention_flash(B, MAX_T: int, page_size: int): # TODO: add smaller page sizes when https://github.com/Dao-AILab/flash-attention/pull/824 is merged op = fmha.flash.FwOp if ( fmha.attn_bias.PagedBlockDiagonalPaddedKeysMask not in op.SUPPORTED_ATTN_BIAS_TYPES ): pytest.skip("Not supported bias") num_quant_groups = 0 paged_attention_run_inner(B, MAX_T, num_quant_groups, page_size, op, bench=False) def paged_attention_run_inner( B: int, MAX_T: int, num_quant_groups: int, page_size: int, op: Type[AttentionFwOpBase], bench: bool, gappy: bool = False, ) -> None: import triton torch.manual_seed(10) TEST_WARMUP_MS = 500 TEST_RUN_MS = 5000 N_H_L = 8 N_KVH_L = 1 D_H = 128 D_H_KV = D_H // 8 + num_quant_groups if num_quant_groups else D_H kv_seqlens = torch.randint(low=1, high=MAX_T + 1, size=(B,)).tolist() # Paged attention requires k.shape[1] and v.shape[1] to be divisible by page_size, so pad padded_per_row_len = ((MAX_T + page_size - 1) // page_size) * page_size if gappy: make_paged_kwargs = { "paged_type": fmha.attn_bias.PagedBlockDiagonalGappyKeysMask, "notional_padding": MAX_T, } attn_bias = fmha.attn_bias.BlockDiagonalGappyKeysMask.from_seqlens( q_seqlen=[1] * B, kv_seqstarts=list(range(0, MAX_T * (B + 1), MAX_T)), kv_seqlen=kv_seqlens, ) else: make_paged_kwargs = { "paged_type": fmha.attn_bias.PagedBlockDiagonalCausalWithOffsetPaddedKeysMask, } block_type = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask attn_bias = block_type.from_seqlens( # type: ignore q_seqlen=[1] * B, kv_padding=MAX_T, kv_seqlen=kv_seqlens, ) q = torch.randn((B, 1, N_H_L, D_H), dtype=torch.bfloat16, device="cuda") if num_quant_groups: if triton.__version__[:4] < "3.0.": raise pytest.skip("dequant needs triton updates") # Using high=64 below, because with 256 both paged and non-paged paths # will produce NaNs - probably some quantization coeffitions are NaNs # after the bitwise cast. 
cache_k = torch.randint( 0, 64, (B, MAX_T, N_KVH_L, D_H_KV * 4), dtype=torch.uint8, device="cuda" ) cache_k = cache_k.view(dtype=torch.int32) cache_v = torch.randint( 0, 64, (B, MAX_T, N_KVH_L, D_H_KV * 4), dtype=torch.uint8, device="cuda" ) cache_v = cache_v.view(dtype=torch.int32) op = type( f"{op.__name__}_{num_quant_groups}", (op,), {"NUM_GROUPS": num_quant_groups}, ) else: cache_k = torch.randn( (B, MAX_T, N_KVH_L, D_H), dtype=torch.bfloat16, device="cuda" ) cache_v = torch.randn_like(cache_k) axq = q.view(1, B * 1, N_H_L, D_H) axk = cache_k.view(1, B * MAX_T, N_KVH_L, D_H_KV).expand( 1, B * MAX_T, N_H_L, D_H_KV ) axv = cache_v.view(1, B * MAX_T, N_KVH_L, D_H_KV).expand( 1, B * MAX_T, N_H_L, D_H_KV ) k_cache_size_usual = axk.numel() # First, create "wasteful" K/V cache, where every block in logical cache # has a physical representation, even if there's nothing stored there block_tables = torch.arange( B * padded_per_row_len // page_size, device="cuda", dtype=torch.int32 ).reshape(B, -1) shape_padded = (B, padded_per_row_len, N_KVH_L, D_H_KV) axk_padded = torch.empty(shape_padded, device=axk.device, dtype=axk.dtype) axv_padded = torch.empty(shape_padded, device=axv.device, dtype=axv.dtype) axk_padded[:, :MAX_T] = axk.view(B, -1, N_H_L, D_H_KV)[:, :, :1, :] axv_padded[:, :MAX_T] = axv.view(B, -1, N_H_L, D_H_KV)[:, :, :1, :] axk_padded = axk_padded.view(1, B * padded_per_row_len, N_KVH_L, D_H_KV) axv_padded = axv_padded.view(1, B * padded_per_row_len, N_KVH_L, D_H_KV) axk_padded = axk_padded.expand(-1, -1, N_H_L, -1) axv_padded = axv_padded.expand(-1, -1, N_H_L, -1) attn_bias_paged = attn_bias.make_paged( block_tables=block_tables, page_size=page_size, **make_paged_kwargs # type: ignore ) y_usual = fmha.memory_efficient_attention_forward( axq, axk, axv, attn_bias, op=op, ) if bench: g = torch.cuda.CUDAGraph() with torch.cuda.graph(g): y_usual = fmha.memory_efficient_attention_forward( axq, axk, axv, attn_bias, op=op, ) t_ms = triton.testing.do_bench( lambda g=g: g.replay(), warmup=TEST_WARMUP_MS, rep=TEST_RUN_MS, ) logger.info(f"Non-paged attention took {t_ms * 1e3:.2f}us") y_wasteful = fmha.memory_efficient_attention_forward( axq, axk_padded, axv_padded, attn_bias_paged, op=op, ) if bench: g = torch.cuda.CUDAGraph() with torch.cuda.graph(g): y_wasteful = fmha.memory_efficient_attention_forward( axq, axk_padded, axv_padded, attn_bias_paged, op=op, ) t_ms = triton.testing.do_bench( lambda g=g: g.replay(), warmup=TEST_WARMUP_MS, rep=TEST_RUN_MS, ) logger.info(f"Paged attention with wasteful K/V-cache took {t_ms * 1e3:.2f}us") torch.testing.assert_close( y_wasteful, y_usual, atol=1.0e-2, rtol=1.0e-2, ) # Now let's create a "packed" K/V cache, where only meaniningful logical blocks are mapped to physical blocks (block_tables, packed_cache_k, packed_cache_v) = pack_kv_cache( cache_k, cache_v, kv_seqlens, page_size, ) attn_bias_paged = attn_bias.make_paged( block_tables=block_tables, page_size=page_size, **make_paged_kwargs # type: ignore ) axk = packed_cache_k.view(1, -1, N_KVH_L, D_H_KV).expand(1, -1, N_H_L, D_H_KV) axv = packed_cache_v.view(1, -1, N_KVH_L, D_H_KV).expand(1, -1, N_H_L, D_H_KV) k_cache_size_packed = axk.numel() y_packed = fmha.memory_efficient_attention_forward( axq, axk, axv, attn_bias_paged, op=op, ) logger.info( f"KV-cache size reduced by {(100 * (1 - k_cache_size_packed/k_cache_size_usual)):.2f}%" ) torch.testing.assert_close(y_wasteful, y_packed) # Let's swap two blocks, and adjust two corresponding entries in the block table. 
The result shouldn't change i, j = 0, axk.shape[1] // page_size - 1 axk = axk[:, :, :1, :] axv = axv[:, :, :1, :] vals_i = axk[:, i * page_size : (i + 1) * page_size, :, :].clone() vals_j = axk[:, j * page_size : (j + 1) * page_size, :, :].clone() axk[:, i * page_size : (i + 1) * page_size, :, :] = vals_j axk[:, j * page_size : (j + 1) * page_size, :, :] = vals_i vals_i = axv[:, i * page_size : (i + 1) * page_size, :, :].clone() vals_j = axv[:, j * page_size : (j + 1) * page_size, :, :].clone() axv[:, i * page_size : (i + 1) * page_size, :, :] = vals_j axv[:, j * page_size : (j + 1) * page_size, :, :] = vals_i axk = axk.expand(-1, -1, N_H_L, -1) axv = axv.expand(-1, -1, N_H_L, -1) where_i = block_tables == i where_j = block_tables == j block_tables.masked_fill_(where_i, j) block_tables.masked_fill_(where_j, i) y_swapped = fmha.memory_efficient_attention_forward( axq, axk, axv, attn_bias_paged, op=op, ) if bench: g = torch.cuda.CUDAGraph() with torch.cuda.graph(g): y_swapped = fmha.memory_efficient_attention_forward( axq, axk, axv, attn_bias_paged, op=op, ) t_ms = triton.testing.do_bench( lambda g=g: g.replay(), warmup=TEST_WARMUP_MS, rep=TEST_RUN_MS, ) logger.info(f"Paged attention with packed K/V-cache took {t_ms * 1e3:.2f}us") torch.testing.assert_close(y_swapped, y_packed) @disable_on_rocm @sm80_or_better_only @pytest.mark.parametrize( "op", [ fmha.triton_splitk.FwOp, fmha.flash.FwOp, None, ], ids=lambda op: "None" if op is None else op.NAME, ) @pytest.mark.parametrize("G,H", [(1, 11), (7, 1), (1, 1), (7, 11), (None, 11)]) @pytest.mark.parametrize( "write_lse", (False, True), ids=lambda x: "write_lse" if x else "" ) @pytest.mark.parametrize( "stack_inputs", (False, True), ids=lambda x: "stack_inputs" if x else "" ) def test_merge_attentions_nobias( write_lse: bool, stack_inputs: bool, op: Type[AttentionFwOpBase], G: Optional[int], H: int, ): """ Merging the same attention twice shouldn't change anything. This also tests the shape of the lse output of each permitted op. 
""" B, M, Mq, K = 13, 5, 3, 128 if op is None or torch.bfloat16 in op.SUPPORTED_DTYPES: dtype = torch.bfloat16 else: dtype = next(iter(op.SUPPORTED_DTYPES)) if G is None: q = 3 * torch.rand(B, Mq, H, K, dtype=dtype, device="cuda") k = (3 * torch.rand(B, M, 1, K, dtype=dtype, device="cuda")).expand(B, M, H, K) v = (3 * torch.rand(B, M, 1, K, dtype=dtype, device="cuda")).expand(B, M, H, K) else: q = 3 * torch.rand(B, Mq, G, H, K, dtype=dtype, device="cuda") k = (3 * torch.rand(B, M, G, 1, K, dtype=dtype, device="cuda")).expand( B, M, G, H, K ) v = (3 * torch.rand(B, M, G, 1, K, dtype=dtype, device="cuda")).expand( B, M, G, H, K ) out1, lse1 = fmha.memory_efficient_attention_partial(q, k, v, op=op) assert out1.shape == q.shape M_ceil = lse1.shape[-1] assert M_ceil >= Mq assert lse1.shape == (B, H, M_ceil) if G is None else (B, G, H, M_ceil) lse1 = lse1[..., :Mq] attn_chunks = [out1, out1] lse_chunks = [lse1, lse1] attn_chunks_ = torch.stack(attn_chunks) if stack_inputs else attn_chunks lse_chunks_ = torch.stack(lse_chunks) if stack_inputs else lse_chunks out, lse = fmha.merge_attentions(attn_chunks_, lse_chunks_, write_lse=write_lse) # type: ignore assert out.shape == out1.shape assert_allclose(out1, out, rtol=1e-3, atol=1e-3, msg="out") if write_lse: assert lse is not None assert lse.shape[:-1] == lse1.shape[:-1] assert_allclose( lse1[..., :Mq] + math.log(2), lse[..., :Mq], rtol=1e-3, atol=1e-3, msg="lse" ) else: assert lse is None @disable_on_rocm @sm80_or_better_only @pytest.mark.parametrize( "op", [ pytest.param(fmha.flash.FwOp, id="flashfwd"), pytest.param((fmha.flash.FwOp, fmha.cutlass.BwOp), id="flashcutlass"), # pytest.param((fmha.triton_splitk.FwOp, fmha.cutlass.BwOp), id="splitk"), # XXX pytest.param(fmha.MemoryEfficientAttentionFlashAttentionOp, id="flash"), None, ], ) def test_merge_attentions_nobias_bwd( op: Union[Type[AttentionFwOpBase], fmha.AttentionOp] ): B, M, Mq, H, K = 13, 5, 5, 4, 128 dtype = torch.bfloat16 nparts = 3 torch.manual_seed(1) q = 3 * torch.rand(B, Mq, H, K, dtype=dtype, device="cuda") kv = [ [3 * (torch.rand(B, M, H, K, dtype=dtype, device="cuda")) for _ in range(2)] for _ in range(nparts) ] q = q.requires_grad_(True) kv = [[j.requires_grad_(True) for j in i] for i in kv] out_parts = [fmha.memory_efficient_attention_partial(q, k, v, op=op) for k, v in kv] attn_split, lse_split = [list(x) for x in zip(*out_parts)] out_merged = fmha.merge_attentions(attn_split, lse_split, write_lse=True)[0] grad_out = torch.rand_like(q) out_merged.backward(grad_out) grad_q_out = q.grad assert q.grad is not None grad_kv_out = [[j.grad for j in i] for i in kv] q = q.detach().requires_grad_(True) kv = [[j.detach().requires_grad_(True) for j in i] for i in kv] k2, v2 = [torch.cat([i[j] for i in kv], dim=1) for j in range(2)] if op is None or isinstance(op, tuple): full_op = op else: full_op = (op, None) out_full = fmha.memory_efficient_attention(q, k2, v2, op=full_op) # type: ignore out_full.backward(grad_out) assert_allclose( out_merged, out_full.to(out_merged.dtype), rtol=1e-2, atol=2e-2, msg="out" ) atol = fmha.AttentionBwOpBase.ERROR_ATOL[dtype] * 1.5 rtol = fmha.AttentionBwOpBase.ERROR_RTOL[dtype] assert_allclose(grad_q_out, q.grad, rtol=rtol, atol=atol, msg="qgrad") for i in range(nparts): for j in range(2): assert_allclose( grad_kv_out[i][j], kv[i][j].grad, rtol=rtol, atol=atol, msg=f"kvgrad {i} {j}", ) @disable_on_rocm @sm80_or_better_only @pytest.mark.parametrize( "dtype,op", [ (torch.bfloat16, fmha.triton_splitk.FwOp_S1), # Cutlass's LSE is not consistent # 
(torch.float32, fmha.cutlass.FwOp), (torch.bfloat16, fmha.flash.FwOp), ], ids=lambda o: f"{o.NAME}" if hasattr(o, "NAME") else str(o), ) @pytest.mark.parametrize("num_queries", [1]) @pytest.mark.parametrize("bmghk", [True, False], ids=lambda x: "bmghk" if x else "") def test_partial_paged( dtype: torch.dtype, op: Type[AttentionFwOpBase], num_queries: int, bmghk: bool ): B = 128 N_H_L = 8 D_H = 128 page_size = 256 G = 2 if bmghk else 1 block_tables = torch.zeros((B, 1), dtype=torch.int32, device="cuda") torch.manual_seed(1) output_dtype = torch.float32 if op.SUPPORTS_OUTPUT_DTYPE else None B_T = num_queries * B q = torch.randn((1, B_T, G, N_H_L, D_H), dtype=dtype, device="cuda") k = torch.randn((1, page_size, G, 1, D_H), dtype=dtype, device="cuda") v = torch.randn_like(k) k = k.expand(1, page_size, G, N_H_L, D_H) v = v.expand(1, page_size, G, N_H_L, D_H) if not bmghk: q = q[:, :, 0] k = k[:, :, 0] v = v[:, :, 0] attn_bias = ( fmha.attn_bias.PagedBlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens( q_seqlen=[num_queries] * B, kv_seqlen=[1] + ([100] * (B - 1)), page_size=page_size, block_tables=block_tables, ) ) if attn_bias not in op.SUPPORTED_ATTN_BIAS_TYPES: pytest.skip("Not supported bias") attn_chunk, lse_chunk = fmha.memory_efficient_attention_partial( q, k, v, attn_bias, op=op, output_dtype=output_dtype, ) if bmghk: assert attn_chunk.shape == (1, B_T, G, N_H_L, D_H) assert lse_chunk.shape == ( 1, G, N_H_L, B_T, ), f"{lse_chunk.shape=}, {(1, G, N_H_L, B_T)=}" else: assert attn_chunk.shape == (1, B_T, N_H_L, D_H) assert lse_chunk.shape == ( 1, N_H_L, B_T, ), f"{lse_chunk.shape=}, {(1, N_H_L, B_T)=}" @disable_on_rocm @sm80_or_better_only @pytest.mark.parametrize( "dtype,op", [ (torch.bfloat16, fmha.triton_splitk.FwOp_S1), (torch.bfloat16, fmha.triton_splitk.FwOp_S32), # Cutlass's LSE is not consistent # (torch.float32, fmha.cutlass.FwOp), (torch.bfloat16, fmha.flash.FwOp), ], ids=lambda o: f"{o.NAME}" if hasattr(o, "NAME") else str(o), ) @pytest.mark.parametrize("num_queries", [1, 2]) @pytest.mark.parametrize("bmghk", [True, False], ids=lambda x: "bmghk" if x else "") @pytest.mark.parametrize( "stack_inputs", (False, True), ids=lambda x: "stack_inputs" if x else "" ) def test_merge_attentions_decoding( dtype: torch.dtype, op: Type[AttentionFwOpBase], num_queries: int, bmghk: bool, stack_inputs: bool, ): """ Compute decoding attention on chunks of K/V and merge them together. Compare with computing attention on the whole K/V. """ MAX_T = 8192 B = 128 N_H_L = 8 D_H = 128 G = 2 if bmghk else 1 torch.manual_seed(1) output_dtype = torch.float32 if op.SUPPORTS_OUTPUT_DTYPE else None num_chunks = 10 chunk_starts = sorted( torch.randint(low=1, high=MAX_T // 2, size=(num_chunks,)).tolist() ) chunk_starts[0] = 0 chunk_starts.append(MAX_T) # We construct sequences so that even the last chunk has a non-empty part of every sequence # as long as the number of queries. # Otherwise the corresponding LSE will be -inf and that'll propagate to the whole sum. # It is possible to teach the kernel to ignore infinite LSEs, but in practical use cases # of merging attention, e.g. a batch of sequences with a common prefix, this condition should be satisfied. 
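    # Concretely: k_lens below is sampled with low=chunk_starts[-2] + num_queries,
    # so every sequence extends at least num_queries tokens into the last chunk.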
k_lens = torch.randint( low=chunk_starts[-2] + num_queries, high=MAX_T, size=(B,) ).tolist() q_lens = [num_queries] * B B_T = num_queries * B q = torch.randn((1, B_T, G, N_H_L, D_H), dtype=dtype, device="cuda") k = torch.randn((B, MAX_T, G, 1, D_H), dtype=dtype, device="cuda") v = torch.randn_like(k) if not bmghk: q = q[:, :, 0] # Compute per-chunk attention chunks_output = [] for i in range(num_chunks): chunk_start, chunk_end = chunk_starts[i], chunk_starts[i + 1] k_chunk = k[:, chunk_start:chunk_end, ...] v_chunk = v[:, chunk_start:chunk_end, ...] axk = k_chunk.reshape(-1, G, 1, D_H).expand(1, -1, G, N_H_L, D_H) axv = v_chunk.reshape(-1, G, 1, D_H).expand(1, -1, G, N_H_L, D_H) if not bmghk: axk = axk[:, :, 0] axv = axv[:, :, 0] bias_type = fmha.attn_bias.BlockDiagonalPaddedKeysMask if i + 1 == num_chunks: bias_type = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask attn_bias = bias_type.from_seqlens( q_seqlen=q_lens, kv_padding=chunk_end - chunk_start, kv_seqlen=[max(min(x, chunk_end) - chunk_start, 0) for x in k_lens], ) attn_chunk, lse_chunk = fmha.memory_efficient_attention_partial( q, axk, axv, attn_bias, op=op, output_dtype=output_dtype, ) if bmghk: assert attn_chunk.shape == (1, B_T, G, N_H_L, D_H) assert lse_chunk.shape == (1, G, N_H_L, B_T) else: assert attn_chunk.shape == (1, B_T, N_H_L, D_H) assert lse_chunk.shape == (1, N_H_L, B_T) chunks_output.append((attn_chunk, lse_chunk)) # Merge attention from all chunks attn_split = [attn_chunk for attn_chunk, _ in chunks_output] lse_split = [lse_chunk for _, lse_chunk in chunks_output] attn_split_ = torch.stack(attn_split) if stack_inputs else attn_split lse_split_ = torch.stack(lse_split) if stack_inputs else lse_split attn_out, lse_out = fmha.merge_attentions( attn_split_, lse_split_, output_dtype=dtype # type: ignore ) assert lse_out is not None # Compute attention on the full K/V attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens( q_seqlen=q_lens, kv_padding=MAX_T, kv_seqlen=k_lens, ) axk = k.view(1, -1, G, 1, D_H).expand(1, -1, G, N_H_L, D_H) axv = v.view(1, -1, G, 1, D_H).expand(1, -1, G, N_H_L, D_H) if not bmghk: axk = axk[:, :, 0] axv = axv[:, :, 0] attn_full, lse_full = fmha.memory_efficient_attention_partial( q, axk, axv, attn_bias, op=op, output_dtype=output_dtype, ) assert_allclose( lse_out.to(lse_full.dtype), lse_full, rtol=1e-3, atol=1e-3, msg="lse" ) assert_allclose( attn_out.to(attn_full.dtype), attn_full, rtol=1e-3, atol=1e-3, msg="out" ) attn_full2 = fmha.memory_efficient_attention_forward( q, axk, axv, attn_bias, op=op, output_dtype=output_dtype, ) assert_allclose(attn_full2, attn_full, rtol=1e-3, atol=1e-3, msg="out2") @disable_on_rocm @sm80_or_better_only @pytest.mark.parametrize( "dtype,op", [ (torch.bfloat16, fmha.triton_splitk.FwOp_S1), (torch.bfloat16, fmha.triton_splitk.FwOp_S32), ], ids=lambda o: f"{o.NAME}" if hasattr(o, "NAME") else str(o), ) @pytest.mark.parametrize("gqa", [False, True], ids=lambda x: "gqa" if x else "") def test_merge_attentions_sharedinput( dtype: torch.dtype, op: Type[AttentionFwOpBase], gqa: bool, ): """ Compute decoding attention on chunks of K/V and merge them together. Compare with computing attention on the whole K/V. 
""" MAX_T = 8192 N_H_L = 16 D_H = 128 G = 2 torch.manual_seed(1) output_dtype = torch.float32 if op.SUPPORTS_OUTPUT_DTYPE else None shared_length = 20 full_lengths = [30, 35, 40] attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens( q_seqlen=[1, 1, 1], kv_padding=MAX_T, kv_seqlen=full_lengths, ) attn_bias1 = fmha.attn_bias.BlockDiagonalPaddedKeysMask.from_seqlens( q_seqlen=[2, 1], kv_padding=MAX_T, kv_seqlen=[shared_length, 0], ) attn_bias2 = fmha.attn_bias.BlockDiagonalGappyKeysMask.from_seqlens( q_seqlen=[1, 1, 1], kv_seqstarts=[shared_length, MAX_T + shared_length, 2 * MAX_T, 3 * MAX_T], kv_seqlen=[ full_lengths[0] - shared_length, full_lengths[1] - shared_length, full_lengths[2], ], ) q = torch.randn((1, 3, G, N_H_L, D_H), dtype=dtype, device="cuda") k = torch.randn((3, MAX_T, G, 1 if gqa else N_H_L, D_H), dtype=dtype, device="cuda") v = torch.randn_like(k) k[1, :shared_length] = k[0, :shared_length] v[1, :shared_length] = v[0, :shared_length] k = k.flatten(end_dim=1)[None] v = v.flatten(end_dim=1)[None] k = k.expand((1, 3 * MAX_T, G, N_H_L, D_H)) v = v.expand((1, 3 * MAX_T, G, N_H_L, D_H)) attn_chunk1, lse_chunk1 = fmha.memory_efficient_attention_partial( q, k, v, attn_bias1, op=op, output_dtype=output_dtype, ) assert attn_chunk1.shape == (1, 3, G, N_H_L, D_H) assert lse_chunk1.shape == (1, G, N_H_L, 3) if gqa: attn_chunk1a, lse_chunk1a = fmha.memory_efficient_attention_partial( q, k.contiguous(), v, attn_bias1, op=op, output_dtype=output_dtype, ) assert attn_chunk1a.shape == (1, 3, G, N_H_L, D_H) assert lse_chunk1a.shape == (1, G, N_H_L, 3) assert_allclose( attn_chunk1a.nan_to_num(0, 0, 0), attn_chunk1.nan_to_num(0, 0, 0) ) assert_allclose(lse_chunk1a.nan_to_num(0, 0, 0), lse_chunk1.nan_to_num(0, 0, 0)) attn_chunk2, lse_chunk2 = fmha.memory_efficient_attention_partial( q, k, v, attn_bias2, op=op, output_dtype=output_dtype, ) assert attn_chunk2.shape == (1, 3, G, N_H_L, D_H) assert lse_chunk2.shape == (1, G, N_H_L, 3) # Merge attention from all chunks attn_out, lse_out = fmha.merge_attentions( [attn_chunk1, attn_chunk2], [lse_chunk1, lse_chunk2], output_dtype=dtype # type: ignore ) assert lse_out is not None # Compute attention on the full K/V attn_full, lse_full = fmha.memory_efficient_attention_partial( q, k, v, attn_bias, op=op, output_dtype=output_dtype, ) assert_allclose( attn_out.to(attn_full.dtype), attn_full, rtol=1e-2, atol=2e-3, msg="out" ) assert_allclose( lse_out.to(lse_full.dtype), lse_full, rtol=1e-3, atol=1e-3, msg="lse" ) @sm80_or_better_only @pytest.mark.parametrize("bmghk", (False, True)) @pytest.mark.parametrize( "stack_inputs", (False, True), ids=lambda x: "stack_inputs" if x else "" ) @pytest.mark.parametrize( "grad_var", ("lse", "attn", None) ) # Gradient with respect to attention, LSE, or neither def test_merge_attentions_against_ref( bmghk: bool, stack_inputs: bool, grad_var: Optional[str] ): split_k = 16 B = 12 M = 137 G = 2 if bmghk else 1 N_H_L = 8 D_H = 128 dtype = torch.float32 attn_split = torch.randn([split_k, B, M, G, N_H_L, D_H], dtype=dtype, device="cuda") lse_split = torch.randn([split_k, B, G, N_H_L, M], dtype=dtype, device="cuda") if not bmghk: attn_split = attn_split[:, :, :, 0] lse_split = lse_split[:, :, 0] if grad_var is not None: attn_split.requires_grad_(True) lse_split.requires_grad_(True) attn_out_ref, lse_out_ref = _merge_attentions_ref(attn_split, lse_split) if grad_var is not None: if grad_var == "attn": out_grad = torch.randn_like(attn_out_ref) attn_out_ref.backward(out_grad) else: out_grad = 
torch.randn_like(lse_out_ref) lse_out_ref.backward(out_grad) attn_grad_ref, lse_grad_ref = attn_split.grad, lse_split.grad attn_split = attn_split.detach().unbind(0) # type: ignore lse_split = lse_split.detach().unbind(0) # type: ignore for x in attn_split + lse_split: x.requires_grad_(True) x.retain_grad() attn_out, lse_out = fmha.merge_attentions(attn_split, lse_split) torch.testing.assert_close(lse_out, lse_out_ref, rtol=1e-4, atol=1e-4) torch.testing.assert_close(attn_out, attn_out_ref, rtol=1e-4, atol=1e-4) if grad_var is not None: if grad_var == "attn": attn_out.backward(out_grad) else: assert lse_out is not None lse_out.backward(out_grad) attn_grads = [x.grad for x in attn_split] lse_grads = [x.grad for x in lse_split] attn_grad_concat = torch.stack(attn_grads, dim=0) lse_grad_concat = torch.stack(lse_grads, dim=0) if grad_var == "lse": # LSE doesn't depend on attn_split, so when only gradient with respect to LSE is provided as input, # the output gradient with respect to attn_split is zero. # The reference implementation produced None instead of zero in this case attn_grad_ref = torch.zeros_like(attn_grad_concat) torch.testing.assert_close(lse_grad_concat, lse_grad_ref, rtol=1e-4, atol=1e-4) torch.testing.assert_close( attn_grad_concat, attn_grad_ref, rtol=1e-4, atol=1e-4 ) def _merge_attentions_ref(attn_split, lse_split): """ attn_split: [split_k, B, M, (G,) H, Kq] lse_split: [split_k, B, (G,) H, M] """ is_bmghk = len(attn_split.shape) == 6 if not is_bmghk: attn_split = attn_split.unsqueeze(3) lse_split = lse_split.unsqueeze(2) lse_split = lse_split[..., None].moveaxis(4, 2) # [split_k, B, M, G, H, 1] lse_max, _ = torch.max(lse_split, dim=0) # [B, M, G, H, 1] sumexp_normalized = torch.exp(lse_split - lse_max) # [split_k, B, M, G, H, 1] denominator = sumexp_normalized.sum(dim=0) # [B, M, G, H, 1] numerator = (sumexp_normalized * attn_split).sum(dim=0) # [B, M, G, H, K] attn_out = numerator / denominator # [B, M_ceil, G, H, Kq] lse_out = lse_max + torch.log(denominator) lse_out = lse_out.squeeze(4).permute(0, 2, 3, 1) # [B, G, H, M] if not is_bmghk: attn_out = attn_out.squeeze(2) lse_out = lse_out.squeeze(1) return attn_out, lse_out @sm80_or_better_only @skip_if_rocm # rocm doesn't support backward yet @pytest.mark.parametrize( "bias_t", [None, fmha.attn_bias.LowerTriangularMask, fmha.attn_bias.BlockDiagonalMask], ) @pytest.mark.parametrize("create_bias_inside_compiled", [False, True]) @pytest.mark.parametrize( "op", [None, (fmha.flash.FwOp, fmha.flash.BwOp), (fmha.cutlass.FwOp, fmha.flash.BwOp)], ) def test_memeff_compile(bias_t, create_bias_inside_compiled: bool, op) -> None: torch.manual_seed(0) torch._dynamo.reset_code_caches() # avoids hitting recompilation limit B, M, H, K = 1, 256, 2, 64 q, k, v, bias = create_tensors( op if op is None else op[0], "cuda", torch.float16, bias_t, B, M, M, H, K, K, fmt="BMHK", ) grad = torch.randn_like(q) if create_bias_inside_compiled: bias = None if bias_t not in [None, fmha.attn_bias.LowerTriangularMask]: pytest.skip("Can't create this mask inside compile") if bias is not None: bias.to(q.device) q.requires_grad_(True) k.requires_grad_(True) v.requires_grad_(True) def fmha_fn(q, k, v, bias): if create_bias_inside_compiled and bias_t is not None: bias = bias_t() return fmha.memory_efficient_attention(q, k, v, attn_bias=bias, op=op) # Eager reference out_ref = fmha_fn(q, k, v, bias) out_ref.backward(grad) dq_ref, dk_ref, dv_ref = q.grad, k.grad, v.grad q.grad, k.grad, v.grad = None, None, None # Compiled version fmha_c = torch.compile(fmha_fn, 
fullgraph=True, dynamic=False) out = fmha_c(q, k, v, bias) out.backward(grad) assert_allclose( out, out_ref, "out", atol=fmha.flash.FwOp.ERROR_ATOL[q.dtype], rtol=fmha.flash.FwOp.ERROR_RTOL[q.dtype], ) atol, rtol = ( fmha.flash.BwOp.ERROR_ATOL[q.dtype], fmha.flash.BwOp.ERROR_RTOL[q.dtype], ) assert_allclose(q.grad, dq_ref, "dq", atol=atol, rtol=rtol) assert_allclose(k.grad, dk_ref, "dk", atol=atol, rtol=rtol) assert_allclose(v.grad, dv_ref, "dv", atol=atol, rtol=rtol) def test_bias_lower_triangular() -> None: mask = fmha.attn_bias.LowerTriangularMask() mask.detach() def test_bias_lower_triangular_with_bias() -> None: dense_bias = torch.randn([128, 128], dtype=torch.float16, requires_grad=True) grad = torch.randn_like(dense_bias) mask = fmha.attn_bias.LowerTriangularMask() mask_biased = mask.add_bias(dense_bias) mask_biased2 = mask_biased.detach() mask_biased.backward(grad) assert dense_bias.grad is not None assert mask_biased2.grad is None assert_allclose(dense_bias.grad, grad, "dense.grad") # end of file
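# The paged-attention tests above rely on a single logical-to-physical indexing
# convention: token ``t`` of batch element ``b`` is stored in physical row
# ``block_tables[b, t // page_size] * page_size + t % page_size`` of the
# flattened K/V cache. The helper below is a minimal illustrative sketch of
# that mapping, assuming this layout; it is not part of the xformers API and is
# not used by the tests.
def _gather_paged_kv_reference(
    paged_cache: torch.Tensor,  # [1, n_physical_pages * page_size, H, D]
    block_tables: torch.Tensor,  # [B, max_pages_per_row], integer dtype
    page_size: int,
    seq_len: int,
) -> torch.Tensor:
    """Gather a contiguous [B, seq_len, H, D] view out of a paged K/V cache."""
    B = block_tables.shape[0]
    t = torch.arange(seq_len, device=paged_cache.device)
    # Physical row of every (batch, token) pair, shape [B, seq_len].
    rows = block_tables[:, t // page_size].long() * page_size + t % page_size
    flat = paged_cache[0]  # [n_physical_pages * page_size, H, D]
    return flat[rows.reshape(-1)].reshape(B, seq_len, *flat.shape[1:])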