danieldk HF Staff committed on Apr 16

Commit

b58ed97

1 Parent(s): dd2f0f9

Build

Browse files

Files changed (42) hide show

build/torch25-cxx11-cu118-x86_64-linux/flash_attn/__init__.py +343 -16
build/torch25-cxx11-cu118-x86_64-linux/flash_attn/{_flash_attn_ab4cc6a_dirty.abi3.so → _flash_attn_dd2f0f9.abi3.so} +2 -2
build/torch25-cxx11-cu118-x86_64-linux/flash_attn/_ops.py +3 -3
build/torch25-cxx11-cu121-x86_64-linux/flash_attn/{_flash_attn_ab4cc6a_dirty.abi3.so → _flash_attn_dd2f0f9.abi3.so} +2 -2
build/torch25-cxx11-cu121-x86_64-linux/flash_attn/_ops.py +3 -3
build/torch25-cxx11-cu124-x86_64-linux/flash_attn/{_flash_attn_ab4cc6a_dirty.abi3.so → _flash_attn_dd2f0f9.abi3.so} +2 -2
build/torch25-cxx11-cu124-x86_64-linux/flash_attn/_ops.py +3 -3
build/torch25-cxx98-cu118-x86_64-linux/flash_attn/{_flash_attn_ab4cc6a_dirty.abi3.so → _flash_attn_dd2f0f9.abi3.so} +2 -2
build/torch25-cxx98-cu118-x86_64-linux/flash_attn/_ops.py +3 -3
build/torch25-cxx98-cu121-x86_64-linux/flash_attn/_flash_attn_ab4cc6a_dirty.abi3.so +0 -3
build/torch25-cxx98-cu121-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so +3 -0
build/torch25-cxx98-cu121-x86_64-linux/flash_attn/_ops.py +3 -3
build/torch25-cxx98-cu124-x86_64-linux/flash_attn/_flash_attn_ab4cc6a_dirty.abi3.so +0 -3
build/torch25-cxx98-cu124-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so +3 -0
build/torch25-cxx98-cu124-x86_64-linux/flash_attn/_ops.py +3 -3
build/torch26-cxx11-cu118-x86_64-linux/flash_attn/_flash_attn_ab4cc6a_dirty.abi3.so +0 -3
build/torch26-cxx11-cu118-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so +3 -0
build/torch26-cxx11-cu118-x86_64-linux/flash_attn/_ops.py +3 -3
build/torch26-cxx11-cu124-x86_64-linux/flash_attn/_flash_attn_ab4cc6a_dirty.abi3.so +0 -3
build/torch26-cxx11-cu124-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so +3 -0
build/torch26-cxx11-cu124-x86_64-linux/flash_attn/_ops.py +3 -3
build/torch26-cxx11-cu126-x86_64-linux/flash_attn/_flash_attn_ab4cc6a_dirty.abi3.so +0 -3
build/torch26-cxx11-cu126-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so +3 -0
build/torch26-cxx11-cu126-x86_64-linux/flash_attn/_ops.py +3 -3
build/torch26-cxx98-cu118-x86_64-linux/flash_attn/_flash_attn_ab4cc6a_dirty.abi3.so +0 -3
build/torch26-cxx98-cu118-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so +3 -0
build/torch26-cxx98-cu118-x86_64-linux/flash_attn/_ops.py +3 -3
build/torch26-cxx98-cu124-x86_64-linux/flash_attn/_flash_attn_ab4cc6a_dirty.abi3.so +0 -3
build/torch26-cxx98-cu124-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so +3 -0
build/torch26-cxx98-cu124-x86_64-linux/flash_attn/_ops.py +3 -3
build/torch26-cxx98-cu126-x86_64-linux/flash_attn/_flash_attn_ab4cc6a_dirty.abi3.so +0 -3
build/torch26-cxx98-cu126-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so +3 -0
build/torch26-cxx98-cu126-x86_64-linux/flash_attn/_ops.py +3 -3
build/torch27-cxx11-cu118-x86_64-linux/flash_attn/__init__.py +364 -0
build/torch27-cxx11-cu118-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so +3 -0
build/torch27-cxx11-cu118-x86_64-linux/flash_attn/_ops.py +9 -0
build/torch27-cxx11-cu126-x86_64-linux/flash_attn/__init__.py +364 -0
build/torch27-cxx11-cu126-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so +3 -0
build/torch27-cxx11-cu126-x86_64-linux/flash_attn/_ops.py +9 -0
build/torch27-cxx11-cu128-x86_64-linux/flash_attn/__init__.py +364 -0
build/torch27-cxx11-cu128-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so +3 -0
build/torch27-cxx11-cu128-x86_64-linux/flash_attn/_ops.py +9 -0

build/torch25-cxx11-cu118-x86_64-linux/flash_attn/__init__.py CHANGED Viewed

@@ -1,25 +1,45 @@
-from typing import Optional
 import torch
 from ._ops import ops
 def mha_fwd(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
-    out: torch.Tensor,
-    alibi_slopes: torch.Tensor,
-    p_dropout: float,
-    softmax_scale: float,
-    is_causal: bool,
-    window_size_left: int,
-    window_size_right: int,
-    softcap: float,
-    return_softmax: bool,
-    gen: Optional[torch.Generator],
-) -> torch.Tensor:
-    ops.mha_fwd(
         q,
         k,
         v,
@@ -34,4 +54,311 @@ def mha_fwd(
         return_softmax,
         gen,
     )
-    return out

+from typing import Optional, List
 import torch
 from ._ops import ops
 def mha_fwd(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    return_softmax: bool = False,
+    gen: Optional[torch.Generator] = None,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention.
+    Args:
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Optional output tensor, same shape as q
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        return_softmax: Whether to return softmax weights
+        gen: Optional random number generator
+    Returns:
+        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
+    """
+    return ops.mha_fwd(
         q,
         k,
         v,
         return_softmax,
         gen,
     )
+def mha_varlen_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    seqused_k: Optional[torch.Tensor] = None,
+    leftpad_k: Optional[torch.Tensor] = None,
+    block_table: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    max_seqlen_q: int = 0,
+    max_seqlen_k: int = 0,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    zero_tensors: bool = False,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    return_softmax: bool = False,
+    gen: Optional[torch.Generator] = None,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention with variable sequence lengths.
+    Args:
+        q: Query tensor of shape [total_q, num_heads, head_size]
+        k: Key tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        v: Value tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
+        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
+        out: Optional output tensor of shape [total_q, num_heads, head_size]
+        seqused_k: Optional tensor specifying how many keys to use per batch element [batch_size]
+        leftpad_k: Optional left padding for keys of shape [batch_size]
+        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        max_seqlen_q: Maximum sequence length for queries
+        max_seqlen_k: Maximum sequence length for keys
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        zero_tensors: Whether to zero tensors before computation
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        return_softmax: Whether to return softmax weights
+        gen: Optional random number generator
+    Returns:
+        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
+    """
+    return ops.mha_varlen_fwd(
+        q,
+        k,
+        v,
+        out,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        seqused_k,
+        leftpad_k,
+        block_table,
+        alibi_slopes,
+        max_seqlen_q,
+        max_seqlen_k,
+        p_dropout,
+        softmax_scale,
+        zero_tensors,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        return_softmax,
+        gen,
+    )
+def mha_bwd(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    softmax_lse: torch.Tensor,
+    dq: Optional[torch.Tensor] = None,
+    dk: Optional[torch.Tensor] = None,
+    dv: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    deterministic: bool = False,
+    gen: Optional[torch.Generator] = None,
+    rng_state: Optional[torch.Tensor] = None,
+) -> List[torch.Tensor]:
+    """
+    Backward pass for multi-head attention.
+    Args:
+        dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
+        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
+        dq: Optional gradient tensor for queries, same shape as q
+        dk: Optional gradient tensor for keys, same shape as k
+        dv: Optional gradient tensor for values, same shape as v
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        deterministic: Whether to use deterministic algorithms
+        gen: Optional random number generator
+        rng_state: Optional RNG state from forward pass
+    Returns:
+        List of tensors: [dq, dk, dv]
+    """
+    return ops.mha_bwd(
+        dout,
+        q,
+        k,
+        v,
+        out,
+        softmax_lse,
+        dq,
+        dk,
+        dv,
+        alibi_slopes,
+        p_dropout,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        deterministic,
+        gen,
+        rng_state,
+    )
+def mha_varlen_bwd(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    softmax_lse: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    dq: Optional[torch.Tensor] = None,
+    dk: Optional[torch.Tensor] = None,
+    dv: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    max_seqlen_q: int = 0,
+    max_seqlen_k: int = 0,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    zero_tensors: bool = False,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    deterministic: bool = False,
+    gen: Optional[torch.Generator] = None,
+    rng_state: Optional[torch.Tensor] = None,
+) -> List[torch.Tensor]:
+    """
+    Backward pass for multi-head attention with variable sequence lengths.
+    Args:
+        dout: Gradient tensor of shape [total_q, num_heads, head_size]
+        q: Query tensor of shape [total_q, num_heads, head_size]
+        k: Key tensor of shape [total_k, num_heads_k, head_size]
+        v: Value tensor of shape [total_k, num_heads_k, head_size]
+        out: Output tensor from forward pass of shape [total_q, num_heads, head_size]
+        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
+        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
+        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
+        dq: Optional gradient tensor for queries, same shape as q
+        dk: Optional gradient tensor for keys, same shape as k
+        dv: Optional gradient tensor for values, same shape as v
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        max_seqlen_q: Maximum sequence length for queries
+        max_seqlen_k: Maximum sequence length for keys
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        zero_tensors: Whether to zero tensors before computation
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        deterministic: Whether to use deterministic algorithms
+        gen: Optional random number generator
+        rng_state: Optional RNG state from forward pass
+    Returns:
+        List of tensors: [dq, dk, dv]
+    """
+    return ops.mha_varlen_bwd(
+        dout,
+        q,
+        k,
+        v,
+        out,
+        softmax_lse,
+        dq,
+        dk,
+        dv,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        alibi_slopes,
+        max_seqlen_q,
+        max_seqlen_k,
+        p_dropout,
+        softmax_scale,
+        zero_tensors,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        deterministic,
+        gen,
+        rng_state,
+    )
+def mha_fwd_kvcache(
+    q: torch.Tensor,
+    kcache: torch.Tensor,
+    vcache: torch.Tensor,
+    k: Optional[torch.Tensor] = None,
+    v: Optional[torch.Tensor] = None,
+    seqlens_k: Optional[torch.Tensor] = None,
+    rotary_cos: Optional[torch.Tensor] = None,
+    rotary_sin: Optional[torch.Tensor] = None,
+    cache_batch_idx: Optional[torch.Tensor] = None,
+    leftpad_k: Optional[torch.Tensor] = None,
+    block_table: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    out: Optional[torch.Tensor] = None,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    is_rotary_interleaved: bool = False,
+    num_splits: int = 1,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention with KV cache.
+    Args:
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        kcache: Key cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        vcache: Value cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        k: Optional new keys tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
+        v: Optional new values tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
+        seqlens_k: Optional sequence lengths for keys of shape [batch_size]
+        rotary_cos: Optional rotary cosine tensor of shape [seqlen_ro, rotary_dim/2]
+        rotary_sin: Optional rotary sine tensor of shape [seqlen_ro, rotary_dim/2]
+        cache_batch_idx: Optional indices to index into the KV cache
+        leftpad_k: Optional left padding for keys of shape [batch_size]
+        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        out: Optional output tensor, same shape as q
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        is_rotary_interleaved: Whether rotary embeddings are interleaved
+        num_splits: Number of splits for computation
+    Returns:
+        List of tensors: [output, softmax_lse]
+    """
+    return ops.mha_fwd_kvcache(
+        q,
+        kcache,
+        vcache,
+        k,
+        v,
+        seqlens_k,
+        rotary_cos,
+        rotary_sin,
+        cache_batch_idx,
+        leftpad_k,
+        block_table,
+        alibi_slopes,
+        out,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        is_rotary_interleaved,
+        num_splits,
+    )

build/torch25-cxx11-cu118-x86_64-linux/flash_attn/{_flash_attn_ab4cc6a_dirty.abi3.so → _flash_attn_dd2f0f9.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:85f70ae9ee6f5b27b149808f14aedf0dbb327fcfac6e6320c48d17810009dc77
-size 1301385392

 version https://git-lfs.github.com/spec/v1
+oid sha256:14e43c95a52d7b6a974bc54b6ec30068ae8fa513583a686494caf123137dc2e5
+size 658100376

build/torch25-cxx11-cu118-x86_64-linux/flash_attn/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn_ab4cc6a_dirty
-ops = torch.ops._flash_attn_ab4cc6a_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn_ab4cc6a_dirty::{op_name}"

 import torch
+from . import _flash_attn_dd2f0f9
+ops = torch.ops._flash_attn_dd2f0f9
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn_dd2f0f9::{op_name}"

build/torch25-cxx11-cu121-x86_64-linux/flash_attn/{_flash_attn_ab4cc6a_dirty.abi3.so → _flash_attn_dd2f0f9.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:91b3c70a49f7d039bc7a238d0147dabe94cffd2485463bdd641bb74b395ada99
-size 1295653368

 version https://git-lfs.github.com/spec/v1
+oid sha256:5405ad9405b2c3ded5f971fdc7a7fdfa0531eb2f1aca2e37e396003a149b1379
+size 653617624

build/torch25-cxx11-cu121-x86_64-linux/flash_attn/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn_ab4cc6a_dirty
-ops = torch.ops._flash_attn_ab4cc6a_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn_ab4cc6a_dirty::{op_name}"

 import torch
+from . import _flash_attn_dd2f0f9
+ops = torch.ops._flash_attn_dd2f0f9
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn_dd2f0f9::{op_name}"

build/torch25-cxx11-cu124-x86_64-linux/flash_attn/{_flash_attn_ab4cc6a_dirty.abi3.so → _flash_attn_dd2f0f9.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0e074eed034da9275d49c87d904babd0a718c8e22d12cdedfae01e7c38260113
-size 1262747328

 version https://git-lfs.github.com/spec/v1
+oid sha256:a2b7aeea4adc77aefd217ccdfa7bcadcaef3dc6d0d7567f4b1c2c5f0321738fe
+size 640704152

build/torch25-cxx11-cu124-x86_64-linux/flash_attn/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn_ab4cc6a_dirty
-ops = torch.ops._flash_attn_ab4cc6a_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn_ab4cc6a_dirty::{op_name}"

 import torch
+from . import _flash_attn_dd2f0f9
+ops = torch.ops._flash_attn_dd2f0f9
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn_dd2f0f9::{op_name}"

build/torch25-cxx98-cu118-x86_64-linux/flash_attn/{_flash_attn_ab4cc6a_dirty.abi3.so → _flash_attn_dd2f0f9.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7b245e7fe66f20cef74aaab7c86d1e33913faeff9d6dae530763d4a5dd256af5
-size 1301380832

 version https://git-lfs.github.com/spec/v1
+oid sha256:73f7d75dcba8295aa14721ffc8c1aca0d86872ae03af18ab2e5149c043201d2a
+size 658091712

build/torch25-cxx98-cu118-x86_64-linux/flash_attn/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn_ab4cc6a_dirty
-ops = torch.ops._flash_attn_ab4cc6a_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn_ab4cc6a_dirty::{op_name}"

 import torch
+from . import _flash_attn_dd2f0f9
+ops = torch.ops._flash_attn_dd2f0f9
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn_dd2f0f9::{op_name}"

build/torch25-cxx98-cu121-x86_64-linux/flash_attn/_flash_attn_ab4cc6a_dirty.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:347160ae8e05c11d1a99da542ecb4c2f6dbd30627cc6002b08c107b9d3d8af3c
-size 1295640880

build/torch25-cxx98-cu121-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:534cc70a7659f0acfca6fbd135d229af62f11557f910360939f55454cd2f6ce3
+size 653605136

build/torch25-cxx98-cu121-x86_64-linux/flash_attn/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn_ab4cc6a_dirty
-ops = torch.ops._flash_attn_ab4cc6a_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn_ab4cc6a_dirty::{op_name}"

 import torch
+from . import _flash_attn_dd2f0f9
+ops = torch.ops._flash_attn_dd2f0f9
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn_dd2f0f9::{op_name}"

build/torch25-cxx98-cu124-x86_64-linux/flash_attn/_flash_attn_ab4cc6a_dirty.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c79d6703c033ea9e1bfcc6fc3006ac88a9713d8371ea3a96d70e8495c7692f68
-size 1262738936

build/torch25-cxx98-cu124-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8051fcc2f91c45d364292f8e2f04804a93b2e78844747a46afcfa926007769be
+size 640695760

build/torch25-cxx98-cu124-x86_64-linux/flash_attn/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn_ab4cc6a_dirty
-ops = torch.ops._flash_attn_ab4cc6a_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn_ab4cc6a_dirty::{op_name}"

 import torch
+from . import _flash_attn_dd2f0f9
+ops = torch.ops._flash_attn_dd2f0f9
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn_dd2f0f9::{op_name}"

build/torch26-cxx11-cu118-x86_64-linux/flash_attn/_flash_attn_ab4cc6a_dirty.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d8cc0f02a6eea5c9fe8e5bc7b0138cef9bf77c026dc26b08f878bd809799189e
-size 1301389752

build/torch26-cxx11-cu118-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:af9873164f58acc3dcea5a1ca046af1872f3a5d2061668edcec7e7802c02a0a6
+size 658100640

build/torch26-cxx11-cu118-x86_64-linux/flash_attn/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn_ab4cc6a_dirty
-ops = torch.ops._flash_attn_ab4cc6a_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn_ab4cc6a_dirty::{op_name}"

 import torch
+from . import _flash_attn_dd2f0f9
+ops = torch.ops._flash_attn_dd2f0f9
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn_dd2f0f9::{op_name}"

build/torch26-cxx11-cu124-x86_64-linux/flash_attn/_flash_attn_ab4cc6a_dirty.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fc6aaa5d51f3d329ec4d6fe7422ff8ff5223fa1a1e01644da196504534bd4fb6
-size 1262747768

build/torch26-cxx11-cu124-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:469a5bac698ba7be5a9aecd831b5ca5fd21ff37843d603d3e39888fad477d6e6
+size 640704600

build/torch26-cxx11-cu124-x86_64-linux/flash_attn/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn_ab4cc6a_dirty
-ops = torch.ops._flash_attn_ab4cc6a_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn_ab4cc6a_dirty::{op_name}"

 import torch
+from . import _flash_attn_dd2f0f9
+ops = torch.ops._flash_attn_dd2f0f9
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn_dd2f0f9::{op_name}"

build/torch26-cxx11-cu126-x86_64-linux/flash_attn/_flash_attn_ab4cc6a_dirty.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:afce8d0bc6516f4e2ade3b45453d6370ead51ab9d368786b20109544cc8b4772
-size 1273150064

build/torch26-cxx11-cu126-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:475a51fa6fe806195457f3ea76e64343bb7b0beca8be1f128d24c0672de6a5ee
+size 646613576

build/torch26-cxx11-cu126-x86_64-linux/flash_attn/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn_ab4cc6a_dirty
-ops = torch.ops._flash_attn_ab4cc6a_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn_ab4cc6a_dirty::{op_name}"

 import torch
+from . import _flash_attn_dd2f0f9
+ops = torch.ops._flash_attn_dd2f0f9
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn_dd2f0f9::{op_name}"

build/torch26-cxx98-cu118-x86_64-linux/flash_attn/_flash_attn_ab4cc6a_dirty.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0f65a659aa158221085014ade1e92475fe08871894796ca8db38ef2d2dbbcb99
-size 1301381128

build/torch26-cxx98-cu118-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2ca74dec8c6dcab25c0359d147b769d1025c46fb5a8ea81dd87ca4d03876044b
+size 658092008

build/torch26-cxx98-cu118-x86_64-linux/flash_attn/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn_ab4cc6a_dirty
-ops = torch.ops._flash_attn_ab4cc6a_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn_ab4cc6a_dirty::{op_name}"

 import torch
+from . import _flash_attn_dd2f0f9
+ops = torch.ops._flash_attn_dd2f0f9
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn_dd2f0f9::{op_name}"

build/torch26-cxx98-cu124-x86_64-linux/flash_attn/_flash_attn_ab4cc6a_dirty.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7ad532518c0a821e096e21c16bd89ec4c0b57b5b9cae92daa4c75100cfe712c6
-size 1262739232

build/torch26-cxx98-cu124-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c366befff828dfa0bccf8de7de54c5fa6b3f796d55690e5e0623e518de89e4f2
+size 640696056

build/torch26-cxx98-cu124-x86_64-linux/flash_attn/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn_ab4cc6a_dirty
-ops = torch.ops._flash_attn_ab4cc6a_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn_ab4cc6a_dirty::{op_name}"

 import torch
+from . import _flash_attn_dd2f0f9
+ops = torch.ops._flash_attn_dd2f0f9
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn_dd2f0f9::{op_name}"

build/torch26-cxx98-cu126-x86_64-linux/flash_attn/_flash_attn_ab4cc6a_dirty.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:300868f1f33c620a923efa0629916bb0afda4763af425de233e48389eede6db4
-size 1273141520

build/torch26-cxx98-cu126-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a0753c6dad0da882bae86dae5658e5915eb20c89f19cb69352f239c597b5d697
+size 646605032

build/torch26-cxx98-cu126-x86_64-linux/flash_attn/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn_ab4cc6a_dirty
-ops = torch.ops._flash_attn_ab4cc6a_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn_ab4cc6a_dirty::{op_name}"

 import torch
+from . import _flash_attn_dd2f0f9
+ops = torch.ops._flash_attn_dd2f0f9
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn_dd2f0f9::{op_name}"

build/torch27-cxx11-cu118-x86_64-linux/flash_attn/__init__.py ADDED Viewed

	@@ -0,0 +1,364 @@

+from typing import Optional, List
+import torch
+from ._ops import ops
+def mha_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    return_softmax: bool = False,
+    gen: Optional[torch.Generator] = None,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention.
+    Args:
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Optional output tensor, same shape as q
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        return_softmax: Whether to return softmax weights
+        gen: Optional random number generator
+    Returns:
+        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
+    """
+    return ops.mha_fwd(
+        q,
+        k,
+        v,
+        out,
+        alibi_slopes,
+        p_dropout,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        return_softmax,
+        gen,
+    )
+def mha_varlen_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    seqused_k: Optional[torch.Tensor] = None,
+    leftpad_k: Optional[torch.Tensor] = None,
+    block_table: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    max_seqlen_q: int = 0,
+    max_seqlen_k: int = 0,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    zero_tensors: bool = False,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    return_softmax: bool = False,
+    gen: Optional[torch.Generator] = None,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention with variable sequence lengths.
+    Args:
+        q: Query tensor of shape [total_q, num_heads, head_size]
+        k: Key tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        v: Value tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
+        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
+        out: Optional output tensor of shape [total_q, num_heads, head_size]
+        seqused_k: Optional tensor specifying how many keys to use per batch element [batch_size]
+        leftpad_k: Optional left padding for keys of shape [batch_size]
+        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        max_seqlen_q: Maximum sequence length for queries
+        max_seqlen_k: Maximum sequence length for keys
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        zero_tensors: Whether to zero tensors before computation
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        return_softmax: Whether to return softmax weights
+        gen: Optional random number generator
+    Returns:
+        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
+    """
+    return ops.mha_varlen_fwd(
+        q,
+        k,
+        v,
+        out,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        seqused_k,
+        leftpad_k,
+        block_table,
+        alibi_slopes,
+        max_seqlen_q,
+        max_seqlen_k,
+        p_dropout,
+        softmax_scale,
+        zero_tensors,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        return_softmax,
+        gen,
+    )
+def mha_bwd(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    softmax_lse: torch.Tensor,
+    dq: Optional[torch.Tensor] = None,
+    dk: Optional[torch.Tensor] = None,
+    dv: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    deterministic: bool = False,
+    gen: Optional[torch.Generator] = None,
+    rng_state: Optional[torch.Tensor] = None,
+) -> List[torch.Tensor]:
+    """
+    Backward pass for multi-head attention.
+    Args:
+        dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
+        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
+        dq: Optional gradient tensor for queries, same shape as q
+        dk: Optional gradient tensor for keys, same shape as k
+        dv: Optional gradient tensor for values, same shape as v
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        deterministic: Whether to use deterministic algorithms
+        gen: Optional random number generator
+        rng_state: Optional RNG state from forward pass
+    Returns:
+        List of tensors: [dq, dk, dv]
+    """
+    return ops.mha_bwd(
+        dout,
+        q,
+        k,
+        v,
+        out,
+        softmax_lse,
+        dq,
+        dk,
+        dv,
+        alibi_slopes,
+        p_dropout,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        deterministic,
+        gen,
+        rng_state,
+    )
+def mha_varlen_bwd(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    softmax_lse: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    dq: Optional[torch.Tensor] = None,
+    dk: Optional[torch.Tensor] = None,
+    dv: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    max_seqlen_q: int = 0,
+    max_seqlen_k: int = 0,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    zero_tensors: bool = False,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    deterministic: bool = False,
+    gen: Optional[torch.Generator] = None,
+    rng_state: Optional[torch.Tensor] = None,
+) -> List[torch.Tensor]:
+    """
+    Backward pass for multi-head attention with variable sequence lengths.
+    Args:
+        dout: Gradient tensor of shape [total_q, num_heads, head_size]
+        q: Query tensor of shape [total_q, num_heads, head_size]
+        k: Key tensor of shape [total_k, num_heads_k, head_size]
+        v: Value tensor of shape [total_k, num_heads_k, head_size]
+        out: Output tensor from forward pass of shape [total_q, num_heads, head_size]
+        softmax_lse: Log-sum-exp values from the variable-length forward pass, of shape [num_heads, total_q]
+        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
+        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
+        dq: Optional gradient tensor for queries, same shape as q
+        dk: Optional gradient tensor for keys, same shape as k
+        dv: Optional gradient tensor for values, same shape as v
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        max_seqlen_q: Maximum sequence length for queries
+        max_seqlen_k: Maximum sequence length for keys
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        zero_tensors: Whether to zero tensors before computation
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        deterministic: Whether to use deterministic algorithms
+        gen: Optional random number generator
+        rng_state: Optional RNG state from forward pass
+    Returns:
+        List of tensors: [dq, dk, dv]
+    """
+    return ops.mha_varlen_bwd(
+        dout,
+        q,
+        k,
+        v,
+        out,
+        softmax_lse,
+        dq,
+        dk,
+        dv,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        alibi_slopes,
+        max_seqlen_q,
+        max_seqlen_k,
+        p_dropout,
+        softmax_scale,
+        zero_tensors,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        deterministic,
+        gen,
+        rng_state,
+    )
+def mha_fwd_kvcache(
+    q: torch.Tensor,
+    kcache: torch.Tensor,
+    vcache: torch.Tensor,
+    k: Optional[torch.Tensor] = None,
+    v: Optional[torch.Tensor] = None,
+    seqlens_k: Optional[torch.Tensor] = None,
+    rotary_cos: Optional[torch.Tensor] = None,
+    rotary_sin: Optional[torch.Tensor] = None,
+    cache_batch_idx: Optional[torch.Tensor] = None,
+    leftpad_k: Optional[torch.Tensor] = None,
+    block_table: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    out: Optional[torch.Tensor] = None,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    is_rotary_interleaved: bool = False,
+    num_splits: int = 1,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention with KV cache.
+    Args:
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        kcache: Key cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        vcache: Value cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        k: Optional new keys tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
+        v: Optional new values tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
+        seqlens_k: Optional sequence lengths for keys of shape [batch_size]
+        rotary_cos: Optional rotary cosine tensor of shape [seqlen_ro, rotary_dim/2]
+        rotary_sin: Optional rotary sine tensor of shape [seqlen_ro, rotary_dim/2]
+        cache_batch_idx: Optional indices to index into the KV cache
+        leftpad_k: Optional left padding for keys of shape [batch_size]
+        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        out: Optional output tensor, same shape as q
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        is_rotary_interleaved: Whether rotary embeddings are interleaved
+        num_splits: Number of splits for computation
+    Returns:
+        List of tensors: [output, softmax_lse]
+    """
+    return ops.mha_fwd_kvcache(
+        q,
+        kcache,
+        vcache,
+        k,
+        v,
+        seqlens_k,
+        rotary_cos,
+        rotary_sin,
+        cache_batch_idx,
+        leftpad_k,
+        block_table,
+        alibi_slopes,
+        out,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        is_rotary_interleaved,
+        num_splits,
+    )

build/torch27-cxx11-cu118-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dcb1788a80f0624ec6532ea3abdbd1ef504364006129ef4564d131f2a44dc916
+size 658100920

build/torch27-cxx11-cu118-x86_64-linux/flash_attn/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _flash_attn_dd2f0f9
+ops = torch.ops._flash_attn_dd2f0f9
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_flash_attn_dd2f0f9::{op_name}"

build/torch27-cxx11-cu126-x86_64-linux/flash_attn/__init__.py ADDED Viewed

	@@ -0,0 +1,364 @@

+from typing import Optional, List
+import torch
+from ._ops import ops
+def mha_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    return_softmax: bool = False,
+    gen: Optional[torch.Generator] = None,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention.
+    Args:
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Optional output tensor, same shape as q
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        return_softmax: Whether to return softmax weights
+        gen: Optional random number generator
+    Returns:
+        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
+    """
+    return ops.mha_fwd(
+        q,
+        k,
+        v,
+        out,
+        alibi_slopes,
+        p_dropout,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        return_softmax,
+        gen,
+    )
+def mha_varlen_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    seqused_k: Optional[torch.Tensor] = None,
+    leftpad_k: Optional[torch.Tensor] = None,
+    block_table: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    max_seqlen_q: int = 0,
+    max_seqlen_k: int = 0,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    zero_tensors: bool = False,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    return_softmax: bool = False,
+    gen: Optional[torch.Generator] = None,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention with variable sequence lengths.
+    Args:
+        q: Query tensor of shape [total_q, num_heads, head_size]
+        k: Key tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        v: Value tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
+        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
+        out: Optional output tensor of shape [total_q, num_heads, head_size]
+        seqused_k: Optional tensor specifying how many keys to use per batch element [batch_size]
+        leftpad_k: Optional left padding for keys of shape [batch_size]
+        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        max_seqlen_q: Maximum sequence length for queries
+        max_seqlen_k: Maximum sequence length for keys
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        zero_tensors: Whether to zero tensors before computation
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        return_softmax: Whether to return softmax weights
+        gen: Optional random number generator
+    Returns:
+        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
+    """
+    return ops.mha_varlen_fwd(
+        q,
+        k,
+        v,
+        out,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        seqused_k,
+        leftpad_k,
+        block_table,
+        alibi_slopes,
+        max_seqlen_q,
+        max_seqlen_k,
+        p_dropout,
+        softmax_scale,
+        zero_tensors,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        return_softmax,
+        gen,
+    )
+def mha_bwd(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    softmax_lse: torch.Tensor,
+    dq: Optional[torch.Tensor] = None,
+    dk: Optional[torch.Tensor] = None,
+    dv: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    deterministic: bool = False,
+    gen: Optional[torch.Generator] = None,
+    rng_state: Optional[torch.Tensor] = None,
+) -> List[torch.Tensor]:
+    """
+    Backward pass for multi-head attention.
+    Args:
+        dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
+        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
+        dq: Optional gradient tensor for queries, same shape as q
+        dk: Optional gradient tensor for keys, same shape as k
+        dv: Optional gradient tensor for values, same shape as v
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        deterministic: Whether to use deterministic algorithms
+        gen: Optional random number generator
+        rng_state: Optional RNG state from forward pass
+    Returns:
+        List of tensors: [dq, dk, dv]
+    """
+    return ops.mha_bwd(
+        dout,
+        q,
+        k,
+        v,
+        out,
+        softmax_lse,
+        dq,
+        dk,
+        dv,
+        alibi_slopes,
+        p_dropout,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        deterministic,
+        gen,
+        rng_state,
+    )
+def mha_varlen_bwd(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    softmax_lse: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    dq: Optional[torch.Tensor] = None,
+    dk: Optional[torch.Tensor] = None,
+    dv: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    max_seqlen_q: int = 0,
+    max_seqlen_k: int = 0,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    zero_tensors: bool = False,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    deterministic: bool = False,
+    gen: Optional[torch.Generator] = None,
+    rng_state: Optional[torch.Tensor] = None,
+) -> List[torch.Tensor]:
+    """
+    Backward pass for multi-head attention with variable sequence lengths.
+    Args:
+        dout: Gradient tensor of shape [total_q, num_heads, head_size]
+        q: Query tensor of shape [total_q, num_heads, head_size]
+        k: Key tensor of shape [total_k, num_heads_k, head_size]
+        v: Value tensor of shape [total_k, num_heads_k, head_size]
+        out: Output tensor from forward pass of shape [total_q, num_heads, head_size]
+        softmax_lse: Log-sum-exp values from the variable-length forward pass, of shape [num_heads, total_q]
+        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
+        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
+        dq: Optional gradient tensor for queries, same shape as q
+        dk: Optional gradient tensor for keys, same shape as k
+        dv: Optional gradient tensor for values, same shape as v
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        max_seqlen_q: Maximum sequence length for queries
+        max_seqlen_k: Maximum sequence length for keys
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        zero_tensors: Whether to zero tensors before computation
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        deterministic: Whether to use deterministic algorithms
+        gen: Optional random number generator
+        rng_state: Optional RNG state from forward pass
+    Returns:
+        List of tensors: [dq, dk, dv]
+    """
+    return ops.mha_varlen_bwd(
+        dout,
+        q,
+        k,
+        v,
+        out,
+        softmax_lse,
+        dq,
+        dk,
+        dv,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        alibi_slopes,
+        max_seqlen_q,
+        max_seqlen_k,
+        p_dropout,
+        softmax_scale,
+        zero_tensors,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        deterministic,
+        gen,
+        rng_state,
+    )
+def mha_fwd_kvcache(
+    q: torch.Tensor,
+    kcache: torch.Tensor,
+    vcache: torch.Tensor,
+    k: Optional[torch.Tensor] = None,
+    v: Optional[torch.Tensor] = None,
+    seqlens_k: Optional[torch.Tensor] = None,
+    rotary_cos: Optional[torch.Tensor] = None,
+    rotary_sin: Optional[torch.Tensor] = None,
+    cache_batch_idx: Optional[torch.Tensor] = None,
+    leftpad_k: Optional[torch.Tensor] = None,
+    block_table: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    out: Optional[torch.Tensor] = None,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    is_rotary_interleaved: bool = False,
+    num_splits: int = 1,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention with KV cache.
+    Args:
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        kcache: Key cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        vcache: Value cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        k: Optional new keys tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
+        v: Optional new values tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
+        seqlens_k: Optional sequence lengths for keys of shape [batch_size]
+        rotary_cos: Optional rotary cosine tensor of shape [seqlen_ro, rotary_dim/2]
+        rotary_sin: Optional rotary sine tensor of shape [seqlen_ro, rotary_dim/2]
+        cache_batch_idx: Optional indices to index into the KV cache
+        leftpad_k: Optional left padding for keys of shape [batch_size]
+        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        out: Optional output tensor, same shape as q
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        is_rotary_interleaved: Whether rotary embeddings are interleaved
+        num_splits: Number of splits for computation
+    Returns:
+        List of tensors: [output, softmax_lse]
+    """
+    return ops.mha_fwd_kvcache(
+        q,
+        kcache,
+        vcache,
+        k,
+        v,
+        seqlens_k,
+        rotary_cos,
+        rotary_sin,
+        cache_batch_idx,
+        leftpad_k,
+        block_table,
+        alibi_slopes,
+        out,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        is_rotary_interleaved,
+        num_splits,
+    )

build/torch27-cxx11-cu126-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e7da95f182ca7f57b45cfe9387045d78397c312283d8a5eecd9bce96e6888ea8
+size 646613312

build/torch27-cxx11-cu126-x86_64-linux/flash_attn/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _flash_attn_dd2f0f9
+ops = torch.ops._flash_attn_dd2f0f9
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_flash_attn_dd2f0f9::{op_name}"

build/torch27-cxx11-cu128-x86_64-linux/flash_attn/__init__.py ADDED Viewed

	@@ -0,0 +1,364 @@

+from typing import Optional, List
+import torch
+from ._ops import ops
+def mha_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    return_softmax: bool = False,
+    gen: Optional[torch.Generator] = None,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention.
+    Args:
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Optional output tensor, same shape as q
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        return_softmax: Whether to return softmax weights
+        gen: Optional random number generator
+    Returns:
+        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
+    """
+    return ops.mha_fwd(
+        q,
+        k,
+        v,
+        out,
+        alibi_slopes,
+        p_dropout,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        return_softmax,
+        gen,
+    )
+def mha_varlen_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    seqused_k: Optional[torch.Tensor] = None,
+    leftpad_k: Optional[torch.Tensor] = None,
+    block_table: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    max_seqlen_q: int = 0,
+    max_seqlen_k: int = 0,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    zero_tensors: bool = False,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    return_softmax: bool = False,
+    gen: Optional[torch.Generator] = None,
+) -> List[torch.Tensor]:
+    """
+    Run the attention forward op on variable-length (packed) sequences.
+    This is a thin wrapper: every argument is handed positionally to
+    ``ops.mha_varlen_fwd``. The op expects ``out`` directly after ``v``,
+    i.e. ahead of the cu_seqlens tensors, which is not the order used by
+    this function's own signature.
+    Args:
+        q: Queries, shape [total_q, num_heads, head_size].
+        k: Keys, shape [total_k, num_heads_k, head_size], or
+            [num_blocks, page_block_size, num_heads_k, head_size].
+        v: Values, shape [total_k, num_heads_k, head_size], or
+            [num_blocks, page_block_size, num_heads_k, head_size].
+        cu_seqlens_q: Cumulative query sequence lengths, shape [batch_size+1].
+        cu_seqlens_k: Cumulative key sequence lengths, shape [batch_size+1].
+        out: Optional output tensor, shape [total_q, num_heads, head_size].
+        seqused_k: Optional per-batch-element count of keys to use,
+            shape [batch_size].
+        leftpad_k: Optional left padding for keys, shape [batch_size].
+        block_table: Optional block table, shape
+            [batch_size, max_num_blocks_per_seq].
+        alibi_slopes: Optional ALiBi slopes, shape [num_heads] or
+            [batch_size, num_heads].
+        max_seqlen_q: Maximum query sequence length.
+        max_seqlen_k: Maximum key sequence length.
+        p_dropout: Dropout probability.
+        softmax_scale: Scale factor for softmax (this wrapper defaults to 1.0).
+        zero_tensors: Whether to zero tensors before computation.
+        is_causal: Whether to use causal attention.
+        window_size_left: Left context window size (-1 for unlimited).
+        window_size_right: Right context window size (-1 for unlimited).
+        softcap: Soft cap for attention weights.
+        return_softmax: Whether to return softmax weights.
+        gen: Optional random number generator.
+    Returns:
+        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
+    """
+    return ops.mha_varlen_fwd(
+        # Inputs plus the optional output tensor (op order: out comes first).
+        q, k, v, out,
+        # Sequence-boundary and key-selection tensors.
+        cu_seqlens_q, cu_seqlens_k, seqused_k, leftpad_k,
+        block_table, alibi_slopes,
+        # Scalar settings.
+        max_seqlen_q, max_seqlen_k,
+        p_dropout, softmax_scale, zero_tensors, is_causal,
+        window_size_left, window_size_right, softcap,
+        return_softmax, gen,
+    )
+def mha_bwd(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    softmax_lse: torch.Tensor,
+    dq: Optional[torch.Tensor] = None,
+    dk: Optional[torch.Tensor] = None,
+    dv: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    deterministic: bool = False,
+    gen: Optional[torch.Generator] = None,
+    rng_state: Optional[torch.Tensor] = None,
+) -> List[torch.Tensor]:
+    """
+    Run the attention backward op on batched (fixed-length) inputs.
+    This is a thin wrapper: every argument is handed positionally to
+    ``ops.mha_bwd`` in the same order as this function's signature.
+    Args:
+        dout: Output gradient, shape [batch_size, seqlen_q, num_heads, head_size].
+        q: Queries, shape [batch_size, seqlen_q, num_heads, head_size].
+        k: Keys, shape [batch_size, seqlen_k, num_heads_k, head_size].
+        v: Values, shape [batch_size, seqlen_k, num_heads_k, head_size].
+        out: Forward-pass output, shape
+            [batch_size, seqlen_q, num_heads, head_size].
+        softmax_lse: Forward-pass log-sum-exp values, shape
+            [batch_size, num_heads, seqlen_q].
+        dq: Optional gradient tensor for queries, same shape as q.
+        dk: Optional gradient tensor for keys, same shape as k.
+        dv: Optional gradient tensor for values, same shape as v.
+        alibi_slopes: Optional ALiBi slopes, shape [num_heads] or
+            [batch_size, num_heads].
+        p_dropout: Dropout probability.
+        softmax_scale: Scale factor for softmax (this wrapper defaults to 1.0).
+        is_causal: Whether to use causal attention.
+        window_size_left: Left context window size (-1 for unlimited).
+        window_size_right: Right context window size (-1 for unlimited).
+        softcap: Soft cap for attention weights.
+        deterministic: Whether to use deterministic algorithms.
+        gen: Optional random number generator.
+        rng_state: Optional RNG state from the forward pass.
+    Returns:
+        List of tensors: [dq, dk, dv]
+    """
+    return ops.mha_bwd(
+        # Incoming gradient and the tensors from the forward pass.
+        dout, q, k, v, out, softmax_lse,
+        # Optional gradient tensors.
+        dq, dk, dv,
+        alibi_slopes,
+        # Scalar settings and RNG inputs.
+        p_dropout, softmax_scale, is_causal,
+        window_size_left, window_size_right, softcap,
+        deterministic, gen, rng_state,
+    )
+def mha_varlen_bwd(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    softmax_lse: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    dq: Optional[torch.Tensor] = None,
+    dk: Optional[torch.Tensor] = None,
+    dv: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    max_seqlen_q: int = 0,
+    max_seqlen_k: int = 0,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    zero_tensors: bool = False,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    deterministic: bool = False,
+    gen: Optional[torch.Generator] = None,
+    rng_state: Optional[torch.Tensor] = None,
+) -> List[torch.Tensor]:
+    """
+    Backward pass for multi-head attention with variable sequence lengths.
+    Thin wrapper that forwards every argument positionally to
+    ``ops.mha_varlen_bwd``. The op expects ``dq``, ``dk`` and ``dv`` ahead of
+    the cu_seqlens tensors, which is not the order used by this function's
+    own signature.
+    Tensors use the packed layout of ``mha_varlen_fwd``: sequences are
+    concatenated along the first dimension and delimited by ``cu_seqlens_q``
+    / ``cu_seqlens_k``; there is no separate batch dimension.
+    Args:
+        dout: Gradient tensor of shape [total_q, num_heads, head_size]
+        q: Query tensor of shape [total_q, num_heads, head_size]
+        k: Key tensor of shape [total_k, num_heads_k, head_size]
+        v: Value tensor of shape [total_k, num_heads_k, head_size]
+        out: Output tensor from forward pass of shape [total_q, num_heads, head_size]
+        softmax_lse: Log-sum-exp values from forward pass (presumably of
+            shape [num_heads, total_q] for the packed layout; confirm
+            against the kernel this build wraps)
+        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
+        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
+        dq: Optional gradient tensor for queries, same shape as q
+        dk: Optional gradient tensor for keys, same shape as k
+        dv: Optional gradient tensor for values, same shape as v
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        max_seqlen_q: Maximum sequence length for queries
+        max_seqlen_k: Maximum sequence length for keys
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        zero_tensors: Whether to zero tensors before computation
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        deterministic: Whether to use deterministic algorithms
+        gen: Optional random number generator
+        rng_state: Optional RNG state from forward pass
+    Returns:
+        List of tensors: [dq, dk, dv]
+    """
+    # Positional order below follows the op, not this signature:
+    # dq/dk/dv are passed before cu_seqlens_q/cu_seqlens_k.
+    return ops.mha_varlen_bwd(
+        dout,
+        q,
+        k,
+        v,
+        out,
+        softmax_lse,
+        dq,
+        dk,
+        dv,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        alibi_slopes,
+        max_seqlen_q,
+        max_seqlen_k,
+        p_dropout,
+        softmax_scale,
+        zero_tensors,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        deterministic,
+        gen,
+        rng_state,
+    )
+def mha_fwd_kvcache(
+    q: torch.Tensor,
+    kcache: torch.Tensor,
+    vcache: torch.Tensor,
+    k: Optional[torch.Tensor] = None,
+    v: Optional[torch.Tensor] = None,
+    seqlens_k: Optional[torch.Tensor] = None,
+    rotary_cos: Optional[torch.Tensor] = None,
+    rotary_sin: Optional[torch.Tensor] = None,
+    cache_batch_idx: Optional[torch.Tensor] = None,
+    leftpad_k: Optional[torch.Tensor] = None,
+    block_table: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    out: Optional[torch.Tensor] = None,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    is_rotary_interleaved: bool = False,
+    num_splits: int = 1,
+) -> List[torch.Tensor]:
+    """
+    Run the attention forward op against a key/value cache.
+    This is a thin wrapper: every argument is handed positionally to
+    ``ops.mha_fwd_kvcache`` in the same order as this function's signature.
+    Args:
+        q: Queries, shape [batch_size, seqlen_q, num_heads, head_size].
+        kcache: Key cache, shape
+            [batch_size_c, seqlen_k, num_heads_k, head_size], or
+            [num_blocks, page_block_size, num_heads_k, head_size].
+        vcache: Value cache, shape
+            [batch_size_c, seqlen_k, num_heads_k, head_size], or
+            [num_blocks, page_block_size, num_heads_k, head_size].
+        k: Optional new keys, shape
+            [batch_size, seqlen_knew, num_heads_k, head_size].
+        v: Optional new values, shape
+            [batch_size, seqlen_knew, num_heads_k, head_size].
+        seqlens_k: Optional key sequence lengths, shape [batch_size].
+        rotary_cos: Optional rotary cosine tensor, shape
+            [seqlen_ro, rotary_dim/2].
+        rotary_sin: Optional rotary sine tensor, shape
+            [seqlen_ro, rotary_dim/2].
+        cache_batch_idx: Optional indices to index into the KV cache.
+        leftpad_k: Optional left padding for keys, shape [batch_size].
+        block_table: Optional block table, shape
+            [batch_size, max_num_blocks_per_seq].
+        alibi_slopes: Optional ALiBi slopes, shape [num_heads] or
+            [batch_size, num_heads].
+        out: Optional output tensor, same shape as q.
+        softmax_scale: Scale factor for softmax (this wrapper defaults to 1.0).
+        is_causal: Whether to use causal attention.
+        window_size_left: Left context window size (-1 for unlimited).
+        window_size_right: Right context window size (-1 for unlimited).
+        softcap: Soft cap for attention weights.
+        is_rotary_interleaved: Whether rotary embeddings are interleaved.
+        num_splits: Number of splits for computation.
+    Returns:
+        List of tensors: [output, softmax_lse]
+    """
+    return ops.mha_fwd_kvcache(
+        # Queries and the cache tensors.
+        q, kcache, vcache,
+        # Optional new keys/values and their lengths.
+        k, v, seqlens_k,
+        # Rotary tensors.
+        rotary_cos, rotary_sin,
+        # Cache indexing, padding, paging and bias tensors.
+        cache_batch_idx, leftpad_k, block_table, alibi_slopes,
+        out,
+        # Scalar settings.
+        softmax_scale, is_causal,
+        window_size_left, window_size_right, softcap,
+        is_rotary_interleaved, num_splits,
+    )

build/torch27-cxx11-cu128-x86_64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ba8c31bf3488a6f0a93e2d5d83d28a27daa26c156ed357ba5443ac66e3809fc
+size 1502967480

build/torch27-cxx11-cu128-x86_64-linux/flash_attn/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _flash_attn_dd2f0f9
+# Handle to the op namespace named after the compiled extension imported
+# above; presumably that import is what registers the ops with torch
+# (confirm against the extension's registration code).
+ops = torch.ops._flash_attn_dd2f0f9
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+    The result has the form ``_flash_attn_dd2f0f9::<op_name>``.
+    """
+    qualified_name = "_flash_attn_dd2f0f9::{}".format(op_name)
+    return qualified_name