from typing import Any

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.nn import RMSNorm

from .config import DiaConfig


def _normalize_axes(axes: tuple[int, ...], ndim: int) -> tuple[int, ...]:
    return tuple(ax if ax >= 0 else ndim + ax for ax in axes)


def _str_to_dtype(dtype_str: str) -> torch.dtype | None:
    # Allow None for default behavior
    if dtype_str is None or dtype_str.lower() == "none":
        return None
    if dtype_str == "float32":
        return torch.float32
    elif dtype_str == "float16":
        return torch.float16
    elif dtype_str == "bfloat16":
        return torch.bfloat16
    else:
        raise ValueError(f"Unsupported dtype string: {dtype_str}")


class DenseGeneral(nn.Module):
    """
    PyTorch equivalent of flax.linen.DenseGeneral with shapes defined at init.

    Stores weights (`kernel`) in the same layout as Jax and uses torch.tensordot
    for the generalized matrix multiplication. Weight/bias shapes are calculated
    and parameters created during initialization based on config.
    `load_weights` validates shapes and copies data.

    Attributes:
        axis (Tuple[int, ...]): Input axis or axes to contract.
        in_shapes (Tuple[int, ...]): Sizes of the input dimensions specified by `axis`.
        out_features (Tuple[int, ...]): Shape of the output features (non-contracted dims).
        use_bias (bool): Whether to add a bias term.
        weight (nn.Parameter): The kernel parameter.
        bias (Optional[nn.Parameter]): The bias parameter (if use_bias=True).
    """

    def __init__(
        self,
        in_shapes: tuple[int, ...],
        out_features: tuple[int, ...],
        axis: tuple[int, ...] = (-1,),
        dtype: torch.dtype | None = None,
        weight_dtype: torch.dtype | None = None,
        device: torch.device | None = None,
    ):
        super().__init__()
        self.in_shapes = in_shapes
        self.out_features = out_features
        self.axis = axis
        self.dtype = dtype
        self.kernel_shape = self.in_shapes + self.out_features

        factory_kwargs = {"device": device, "dtype": weight_dtype}
        self.weight = nn.Parameter(torch.empty(self.kernel_shape, **factory_kwargs))
        self.register_parameter("bias", None)

    def forward(self, inputs: Tensor) -> Tensor:
        norm_axis = _normalize_axes(self.axis, inputs.ndim)
        kernel_contract_axes = tuple(range(len(norm_axis)))

        output = torch.tensordot(
            inputs.float(),
            self.weight.float(),
            dims=(norm_axis, kernel_contract_axes),
        ).to(inputs.dtype)
        return output


def get_activation_fn(activation_string: str) -> nn.Module:  # Return Module instance
    """Maps activation string to PyTorch activation function module."""
    if activation_string == "gelu":
        return nn.GELU()
    elif activation_string == "relu":
        return nn.ReLU()
    elif activation_string == "silu" or activation_string == "swish":
        return nn.SiLU()
    elif activation_string == "linear":
        return nn.Identity()
    else:
        raise ValueError(f"Unsupported activation function: {activation_string}")


class MlpBlock(nn.Module):
    """MLP block using DenseGeneral."""

    def __init__(
        self,
        config: DiaConfig,
        embed_dim: int,
        intermediate_dim: int,
        dropout_rate: float,
        activations: list[str] = ["silu", "linear"],
        use_pre_norm: bool = False,
    ):
        super().__init__()
        self.use_pre_norm = use_pre_norm
        num_activations = len(activations)
        compute_dtype = _str_to_dtype(config.training.dtype)
        weight_dtype = _str_to_dtype(config.model.weight_dtype)
        self.dtype = compute_dtype
        # Assume default device for now, could be passed in config

        if use_pre_norm:
            self.pre_norm = RMSNorm(
                embed_dim,
                eps=config.model.normalization_layer_epsilon,
                dtype=torch.float32,
            )

        self.wi_fused = DenseGeneral(
            in_shapes=(embed_dim,),
            out_features=(
                num_activations,
                intermediate_dim,
            ),
            axis=(-1,),
            dtype=compute_dtype,
            weight_dtype=weight_dtype,
        )

        self.activation_fn_0 = get_activation_fn(activations[0])  # silu
        self.activation_fn_1 = get_activation_fn(activations[1])  # linear

        self.dropout = nn.Dropout(dropout_rate)

        # Output layer using DenseGeneral
        self.wo = DenseGeneral(
            in_shapes=(intermediate_dim,),
            out_features=(embed_dim,),
            axis=(-1,),
            dtype=compute_dtype,
            weight_dtype=weight_dtype,
        )

    def forward(self, x: torch.Tensor, deterministic: bool) -> torch.Tensor:
        """Forward pass."""
        if self.use_pre_norm and hasattr(self, "pre_norm"):
            x = self.pre_norm(x)

        fused_x = self.wi_fused(x)

        gate_input = fused_x[..., 0, :]
        up_input = fused_x[..., 1, :]

        gate = self.activation_fn_0(gate_input)
        up = self.activation_fn_1(up_input)
        hidden = torch.mul(gate, up).to(self.dtype)

        if not deterministic:
            hidden = self.dropout(hidden)

        output = self.wo(hidden)
        return output


class RotaryEmbedding(nn.Module):
    """Rotary Position Embedding (RoPE) implementation in PyTorch."""

    def __init__(
        self,
        embedding_dims: int,
        min_timescale: int = 1,
        max_timescale: int = 10000,
        dtype: torch.dtype = torch.float32,
    ):
        super().__init__()
        if embedding_dims % 2 != 0:
            raise ValueError("Embedding dim must be even for RoPE.")
        self.embedding_dims = embedding_dims
        self.min_timescale = min_timescale
        self.max_timescale = max_timescale
        self.dtype = dtype

        half_embedding_dim = embedding_dims // 2
        fraction = (2.0 * torch.arange(0, half_embedding_dim)) / embedding_dims
        self.register_buffer(
            "timescale",
            self.min_timescale * (self.max_timescale / self.min_timescale) ** fraction,
            persistent=False,
        )

    def extra_repr(self) -> str:
        s = f"{self.timescale.shape}"
        return s

    def forward(self, inputs: torch.Tensor, position: torch.Tensor):
        """Applies RoPE."""
        position = position.unsqueeze(-1).unsqueeze(-1)
        timescale = self.timescale.to(inputs.device)
        sinusoid_inp = position / timescale
        sin = torch.sin(sinusoid_inp).to(inputs.dtype)
        cos = torch.cos(sinusoid_inp).to(inputs.dtype)
        first_half, second_half = torch.chunk(inputs, 2, dim=-1)
        first_part = first_half * cos - second_half * sin
        second_part = second_half * cos + first_half * sin
        return torch.cat((first_part, second_part), dim=-1)


class KVCache:
    def __init__(self, num_heads, max_len, head_dim, device, k=None, v=None):
        self.k = torch.zeros((2, num_heads, max_len, head_dim), device=device) if k is None else k
        self.v = torch.zeros((2, num_heads, max_len, head_dim), device=device) if v is None else v
        self.current_idx = 0
        self.max_len = max_len

    def get_kv_for_attention(self, current_k, current_v):
        if self.current_idx == 0:
            return current_k, current_v
        else:
            past_k = self.k[:, :, : self.current_idx, :]
            past_v = self.v[:, :, : self.current_idx, :]
            attn_k = torch.cat((past_k, current_k), dim=2)
            attn_v = torch.cat((past_v, current_v), dim=2)
            return attn_k, attn_v

    def update_cache(self, k, v):
        assert self.current_idx < self.max_len
        self.k[:, :, self.current_idx : self.current_idx + 1, :] = k
        self.v[:, :, self.current_idx : self.current_idx + 1, :] = v
        self.current_idx += 1

    def prefill_kv(self, k, v):
        prefill_len = k.shape[2]
        assert prefill_len <= self.max_len
        self.k[:, :, :prefill_len, :] = k
        self.v[:, :, :prefill_len, :] = v
        self.current_idx = prefill_len


class Attention(nn.Module):
    """Attention using DenseGeneral."""

    def __init__(
        self,
        config: DiaConfig,
        q_embed_dim: int,
        kv_embed_dim: int,
        num_query_heads: int,
        num_kv_heads: int,
        head_dim: int,
        dropout_rate: float,
        is_cross_attn: bool = False,
        out_embed_dim: int | None = None,
    ):
        super().__init__()
        self.num_query_heads = num_query_heads
        self.num_kv_heads = num_kv_heads
        self.head_dim = head_dim
        self.is_cross_attn = is_cross_attn
        self.dropout_rate = dropout_rate
        compute_dtype = _str_to_dtype(config.training.dtype)
        weight_dtype = _str_to_dtype(config.model.weight_dtype)
        self.output_dim = out_embed_dim if out_embed_dim is not None else q_embed_dim
        self.projected_query_dim = num_query_heads * head_dim
        if num_query_heads % num_kv_heads != 0:
            raise ValueError(f"num_query_heads ({num_query_heads}) must be divisible by num_kv_heads ({num_kv_heads})")
        self.num_gqa_groups = num_query_heads // num_kv_heads

        # --- Projection Layers using DenseGeneral ---
        self.q_proj = DenseGeneral(
            in_shapes=(q_embed_dim,),
            out_features=(num_query_heads, head_dim),
            axis=(-1,),
            dtype=compute_dtype,
            weight_dtype=weight_dtype,
        )
        self.k_proj = DenseGeneral(
            in_shapes=(kv_embed_dim,),
            out_features=(num_kv_heads, head_dim),
            axis=(-1,),
            dtype=compute_dtype,
            weight_dtype=weight_dtype,
        )
        self.v_proj = DenseGeneral(
            in_shapes=(kv_embed_dim,),
            out_features=(num_kv_heads, head_dim),
            axis=(-1,),
            dtype=compute_dtype,
            weight_dtype=weight_dtype,
        )
        self.o_proj = DenseGeneral(
            in_shapes=(num_query_heads, head_dim),
            out_features=(self.output_dim,),
            axis=(-2, -1),
            dtype=compute_dtype,
            weight_dtype=weight_dtype,
        )

        # --- Rotary Embedding ---
        self.rotary_emb = RotaryEmbedding(
            embedding_dims=self.head_dim,
            min_timescale=config.model.rope_min_timescale,
            max_timescale=config.model.rope_max_timescale,
            dtype=compute_dtype,
        )

    def forward(
        self,
        Xq: torch.Tensor,  # (B, T, D) T = 1 in AR generation
        Xkv: torch.Tensor,  # (B, S, E) S = 1 in AR generation
        q_positions: torch.Tensor,  # (B, T)
        kv_positions: torch.Tensor | None = None,  # (B, S)
        deterministic: bool = True,
        attn_mask: torch.Tensor | None = None,  # None in Decoder Self Attention, Valid mask in Others
        cache: KVCache | None = None,  # None in Encoder, KVCache in Decoder
        prefill: bool = False,  # True only when prefilling KV Cache
    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor] | None]:
        """
        Performs attention calculation with optional KV caching.

        Args:
            Xq: Query tensor (B, T, D). T=1 during single-step decoding.
            Xkv: Key/Value source tensor (B, S, E). S=1 during single-step decoding for self-attn.
            q_positions: Positions for queries (B, T).
            kv_positions: Positions for keys/values (B, S). If None, uses q_positions.
            deterministic: If True, disable dropout.
            attn_mask: Attention mask.
            cache: KVCache.
            prefill: If True, use prefill mode.

        Returns:
            A tuple containing:
            - output: The attention output tensor (B, T, output_dim).
            - present_kv: The K/V state to be cached for the next step ((B, N, S_new, H), (B, N, S_new, H)). For self-attn, S_new = S_past + S. For cross-attn, S_new = S_kv.
        """
        if kv_positions is None:
            kv_positions = q_positions
        original_dtype = Xq.dtype

        Xq_BxTxNxH = self.q_proj(Xq)
        Xq_BxTxNxH = self.rotary_emb(Xq_BxTxNxH, position=q_positions)
        Xq_BxNxTxH = Xq_BxTxNxH.transpose(1, 2)

        # Input values into attention calculation
        attn_k: torch.Tensor | None = None
        attn_v: torch.Tensor | None = None
        new_kv_cache: tuple[torch.Tensor, torch.Tensor] | None = None

        # Decoder Cross Attention
        if self.is_cross_attn:
            # Directly use cache (no need to check index)
            attn_k, attn_v = cache.k, cache.v
            if attn_k.shape[1] != self.num_query_heads or attn_v.shape[1] != self.num_query_heads:
                raise ValueError(
                    f"Cross-attention cache head dimension ({attn_k.shape[1]}) "
                    f"does not match num_query_heads ({self.num_query_heads}). "
                    "Cache should be pre-repeated for GQA."
                )
        # Self Attention
        else:
            Xk_BxSxKxH = self.k_proj(Xkv)  # (B, S, K, H)
            Xv_BxSxKxH = self.v_proj(Xkv)  # (B, S, K, H)
            Xk_BxSxKxH = self.rotary_emb(Xk_BxSxKxH, position=kv_positions)  # (B, S, K, H)

            Xk_BxKxSxH = Xk_BxSxKxH.transpose(1, 2)  # (B, K, S, H)
            Xv_BxKxSxH = Xv_BxSxKxH.transpose(1, 2)  # (B, K, S, H)
            # S=1 for Decode Step

            if self.num_gqa_groups > 1:
                Xk_BxNxSxH = Xk_BxKxSxH.repeat_interleave(self.num_gqa_groups, dim=1)
                Xv_BxNxSxH = Xv_BxKxSxH.repeat_interleave(self.num_gqa_groups, dim=1)
            else:
                Xk_BxNxSxH = Xk_BxKxSxH
                Xv_BxNxSxH = Xv_BxKxSxH

            # Encoder Self Attention
            if cache is None:
                attn_k = Xk_BxNxSxH
                attn_v = Xv_BxNxSxH
            # Decoder Self Attention
            else:
                # In prefill mode, we fill in cache until prefill length
                if prefill:
                    attn_k, attn_v = Xk_BxNxSxH, Xv_BxNxSxH
                    cache.prefill_kv(attn_k, attn_v)
                # In decode step, we add current K/V to cache step by step
                else:
                    new_kv_cache = Xk_BxNxSxH, Xv_BxNxSxH
                    attn_k, attn_v = cache.get_kv_for_attention(Xk_BxNxSxH, Xv_BxNxSxH)

        attn_output = F.scaled_dot_product_attention(
            Xq_BxNxTxH,
            attn_k,
            attn_v,
            attn_mask=attn_mask,
            dropout_p=self.dropout_rate if not deterministic else 0.0,
            scale=1.0,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()  # (B, T, N, H)
        output = self.o_proj(attn_output)

        return output.to(original_dtype), new_kv_cache


class EncoderLayer(nn.Module):
    """Transformer Encoder Layer using DenseGeneral."""

    def __init__(self, config: DiaConfig):
        super().__init__()
        self.config = config
        model_config = config.model
        enc_config = config.model.encoder
        embed_dim = enc_config.n_embd

        self.pre_sa_norm = RMSNorm(
            embed_dim,
            eps=model_config.normalization_layer_epsilon,
            dtype=torch.float32,
        )
        self.self_attention = Attention(
            config=config,
            q_embed_dim=embed_dim,
            kv_embed_dim=embed_dim,
            num_query_heads=enc_config.n_head,
            num_kv_heads=enc_config.n_head,
            head_dim=enc_config.head_dim,
            dropout_rate=model_config.dropout,
            is_cross_attn=False,
            out_embed_dim=embed_dim,
        )
        self.post_sa_norm = RMSNorm(
            embed_dim,
            eps=model_config.normalization_layer_epsilon,
            dtype=torch.float32,
        )
        self.mlp = MlpBlock(
            config=config,
            embed_dim=embed_dim,
            intermediate_dim=enc_config.n_hidden,
            activations=enc_config.mlp_activations,
            dropout_rate=model_config.dropout,
            use_pre_norm=enc_config.use_pre_norm,
        )
        self.dropout = nn.Dropout(model_config.dropout)

    def forward(
        self,
        x: torch.Tensor,
        src_positions: torch.Tensor | None = None,
        deterministic: bool = True,
        attn_mask: torch.Tensor | None = None,
    ) -> torch.Tensor:
        residual = x
        x_norm = self.pre_sa_norm(x)

        sa_out, _ = self.self_attention(
            Xq=x_norm,
            Xkv=x_norm,
            q_positions=src_positions,
            kv_positions=src_positions,
            deterministic=deterministic,
            attn_mask=attn_mask,
        )
        x = residual + sa_out

        residual = x
        x_norm = self.post_sa_norm(x)
        mlp_out = self.mlp(x_norm, deterministic=deterministic)
        x = residual + mlp_out

        if not deterministic:
            x = self.dropout(x)
        return x


class Encoder(nn.Module):
    """Transformer Encoder Stack using DenseGeneral."""

    def __init__(self, config: DiaConfig):
        super().__init__()
        self.config = config
        model_config = config.model
        enc_config = config.model.encoder
        compute_dtype = _str_to_dtype(config.training.dtype)

        self.embedding = nn.Embedding(
            model_config.src_vocab_size,
            enc_config.n_embd,
            dtype=compute_dtype,
        )
        self.dropout = nn.Dropout(model_config.dropout)
        self.layers = nn.ModuleList([EncoderLayer(config=config) for _ in range(enc_config.n_layer)])
        self.norm = RMSNorm(
            enc_config.n_embd,
            eps=model_config.normalization_layer_epsilon,
            dtype=torch.float32,
        )

    def forward(
        self,
        x_ids: torch.Tensor,
        src_positions: torch.Tensor | None = None,
        deterministic: bool = True,
        attn_mask: torch.Tensor | None = None,
    ) -> torch.Tensor:
        x = self.embedding(x_ids)

        if not deterministic:
            x = self.dropout(x)

        for layer in self.layers:
            x = layer(
                x,
                src_positions=src_positions,
                deterministic=deterministic,
                attn_mask=attn_mask,
            )
        x = self.norm(x)
        if not deterministic:
            x = self.dropout(x)
        return x


class DecoderLayer(nn.Module):
    """Transformer Decoder Layer using DenseGeneral."""

    def __init__(self, config: DiaConfig):
        super().__init__()
        self.config = config
        model_config = config.model
        dec_config = config.model.decoder
        enc_config = config.model.encoder
        dec_embed_dim = dec_config.n_embd
        enc_embed_dim = enc_config.n_embd

        # Norms
        self.pre_sa_norm = RMSNorm(
            dec_embed_dim,
            eps=model_config.normalization_layer_epsilon,
            dtype=torch.float32,
        )
        self.pre_ca_norm = RMSNorm(
            dec_embed_dim,
            eps=model_config.normalization_layer_epsilon,
            dtype=torch.float32,
        )
        self.pre_mlp_norm = RMSNorm(
            dec_embed_dim,
            eps=model_config.normalization_layer_epsilon,
            dtype=torch.float32,
        )

        # Self-Attention (GQA) with Causal Masking
        self.self_attention = Attention(
            config=config,
            q_embed_dim=dec_embed_dim,
            kv_embed_dim=dec_embed_dim,
            num_query_heads=dec_config.gqa_query_heads,
            num_kv_heads=dec_config.kv_heads,
            head_dim=dec_config.gqa_head_dim,
            dropout_rate=model_config.dropout,
            is_cross_attn=False,
            out_embed_dim=dec_embed_dim,
        )
        # Cross-Attention (MHA)
        self.cross_attention = Attention(
            config=config,
            q_embed_dim=dec_embed_dim,
            kv_embed_dim=enc_embed_dim,  # Note kv_embed_dim
            num_query_heads=dec_config.cross_query_heads,
            num_kv_heads=dec_config.cross_query_heads,
            head_dim=dec_config.cross_head_dim,
            dropout_rate=model_config.dropout,
            is_cross_attn=True,
            out_embed_dim=dec_embed_dim,
        )
        # MLP
        self.mlp = MlpBlock(
            config=config,
            embed_dim=dec_embed_dim,
            intermediate_dim=dec_config.n_hidden,
            activations=dec_config.mlp_activations,
            dropout_rate=model_config.dropout,
            use_pre_norm=dec_config.use_pre_norm,
        )

    def forward(
        self,
        x: torch.Tensor,
        encoder_out: torch.Tensor,
        tgt_positions: torch.Tensor,
        src_positions: torch.Tensor | None,
        deterministic: bool,
        self_attn_mask: torch.Tensor,
        cross_attn_mask: torch.Tensor,
        self_attn_cache: KVCache,
        cross_attn_cache: KVCache,
        prefill: bool = False,
    ) -> torch.Tensor:
        residual = x
        x_norm = self.pre_sa_norm(x)

        sa_out, new_kv_cache = self.self_attention(
            Xq=x_norm,  # (2, 1, D)
            Xkv=x_norm,  # (2, 1, D)
            q_positions=tgt_positions,  # (2, 1)
            kv_positions=tgt_positions,  # (2, 1)
            deterministic=deterministic,
            attn_mask=self_attn_mask,  # (2, 1, 1, S_max)
            cache=self_attn_cache,
            prefill=prefill,
        )

        x = residual + sa_out

        # 2. Cross-Attention
        residual = x
        x_norm = self.pre_ca_norm(x)
        ca_out, _ = self.cross_attention(
            Xq=x_norm,
            Xkv=encoder_out,
            q_positions=tgt_positions,
            kv_positions=src_positions,
            deterministic=deterministic,
            attn_mask=cross_attn_mask,
            cache=cross_attn_cache,
        )
        x = residual + ca_out

        # 3. MLP
        residual = x
        x_norm = self.pre_mlp_norm(x)
        mlp_out = self.mlp(x_norm, deterministic=deterministic)
        x = residual + mlp_out

        return x, new_kv_cache


class Decoder(nn.Module):
    """Transformer Decoder Stack using DenseGeneral."""

    def __init__(self, config: DiaConfig):
        super().__init__()
        self.config = config
        model_config = config.model
        dec_config = config.model.decoder
        train_config = config.training
        data_config = config.data
        compute_dtype = _str_to_dtype(config.training.dtype)
        weight_dtype = _str_to_dtype(config.model.weight_dtype)
        self.num_channels = data_config.channels
        self.num_layers = dec_config.n_layer

        self.embeddings = nn.ModuleList(
            [
                nn.Embedding(model_config.tgt_vocab_size, dec_config.n_embd, dtype=compute_dtype)
                for _ in range(self.num_channels)
            ]
        )
        self.dropout = nn.Dropout(model_config.dropout)
        self.layers = nn.ModuleList([DecoderLayer(config=config) for _ in range(self.num_layers)])
        self.norm = RMSNorm(
            dec_config.n_embd,
            eps=model_config.normalization_layer_epsilon,
            dtype=torch.float32,
        )

        # Final Logits Projection using DenseGeneral
        self.logits_dense = DenseGeneral(
            in_shapes=(dec_config.n_embd,),
            out_features=(self.num_channels, model_config.tgt_vocab_size),
            axis=(-1,),
            dtype=(torch.float32 if train_config.logits_dot_in_fp32 else compute_dtype),
            weight_dtype=weight_dtype,
        )
        self.logits_in_fp32 = train_config.logits_dot_in_fp32

    def precompute_cross_attention_kv(
        self,
        max_len: int,
        encoder_out: torch.Tensor,  # (B, S, E)
        src_positions: torch.Tensor | None,  # (B, S)
    ) -> list[KVCache]:
        """
        Computes the Key and Value tensors for cross-attention for each layer from the encoder output.
        """
        per_layer_kv_cache: list[KVCache] = []

        for layer in self.layers:
            cross_attn_module = layer.cross_attention
            k_proj = cross_attn_module.k_proj(encoder_out)
            v_proj = cross_attn_module.v_proj(encoder_out)

            k_proj = cross_attn_module.rotary_emb(k_proj, position=src_positions)
            k = k_proj.transpose(1, 2)
            v = v_proj.transpose(1, 2)

            per_layer_kv_cache.append(
                KVCache(
                    cross_attn_module.num_kv_heads,
                    max_len,
                    cross_attn_module.head_dim,
                    k.device,
                    k=k,
                    v=v,
                )
            )

        return per_layer_kv_cache

    def decode_step(
        self,
        tgt_ids_Bx1xC: torch.Tensor,  # [B, 1, C]
        tgt_pos_Bx1: torch.Tensor,  # [B, 1]
        encoder_out: torch.Tensor,  # [B, S, E]
        self_attn_mask: Any,  # None
        cross_attn_mask: torch.Tensor,  # [B, 1, 1, S]
        self_attention_cache: list[KVCache],
        cross_attention_cache: list[KVCache],
    ) -> torch.Tensor:
        """
        Performs a single decoding step, managing KV caches layer by layer.

        Returns:
            A tuple containing:
            - logits_Bx1xCV: The final output logits for the current step (B, 1, C*V), cast to float32.
        """
        assert self_attn_mask is None, "Self-attention mask should be None, kept for pattern"

        x = None
        for i in range(self.num_channels):
            channel_tokens = tgt_ids_Bx1xC[..., i]
            channel_embed = self.embeddings[i](channel_tokens)
            x = channel_embed if x is None else x + channel_embed

        new_cache = []

        for i, layer in enumerate(self.layers):
            self_cache = self_attention_cache[i]
            cross_cache = cross_attention_cache[i]
            x, new_kv_cache = layer(
                x,  # (2, 1, D)
                encoder_out,  # (2, S, E)
                src_positions=None,  # CA KV is already computed
                tgt_positions=tgt_pos_Bx1,  # (2, 1)
                deterministic=True,
                self_attn_mask=None,
                cross_attn_mask=cross_attn_mask,
                self_attn_cache=self_cache,
                cross_attn_cache=cross_cache,
            )
            new_cache.append(new_kv_cache)

        x = self.norm(x)
        logits_Bx1xCxV = self.logits_dense(x)

        return logits_Bx1xCxV.to(torch.float32), new_cache

    def forward(
        self,
        tgt_ids_BxTxC: torch.Tensor,
        encoder_out: torch.Tensor,
        tgt_positions: torch.Tensor,
        src_positions: torch.Tensor,
        deterministic: bool,
        self_attn_mask: torch.Tensor,
        cross_attn_mask: torch.Tensor,
        self_attention_cache: list[KVCache],
        cross_attention_cache: list[KVCache],
    ) -> torch.Tensor:
        """
        Forward pass for the Decoder stack, managing KV caches.

        Args:
            tgt_ids_BxTxC: Target token IDs (B, T, C).
            encoder_out: Output from the encoder (B, S, E).
            tgt_positions: Positions for target sequence (B, T).
            src_positions: Positions for source sequence (B, S).
            deterministic: Disable dropout if True.
            self_attn_mask: Mask for self-attention.
            cross_attn_mask: Mask for cross-attention.
            past_key_values: List containing the self-attention KV cache for each layer
                             from the previous decoding step. `len(past_key_values)` should
                             equal `num_layers`.
            precomputed_cross_attn_kv: A single tuple containing the pre-computed K/V cache
                                      derived from `encoder_out`. This is passed identically
                                      to all layers.

        Returns:
            A tuple containing:
            - logits: The final output logits (B, T, C * V), cast to float32.
            - present_key_values: A list containing the updated self-attention KV cache
                                 for each layer for the *current* decoding step.
        """
        _, _, num_channels_in = tgt_ids_BxTxC.shape
        assert num_channels_in == self.num_channels, "Input channels mismatch"

        # Embeddings
        x = None
        for i in range(self.num_channels):
            channel_tokens = tgt_ids_BxTxC[..., i]
            channel_embed = self.embeddings[i](channel_tokens)
            x = channel_embed if x is None else x + channel_embed

        if not deterministic:
            x = self.dropout(x)

        for i, layer in enumerate(self.layers):
            x, _ = layer(
                x,
                encoder_out,
                tgt_positions=tgt_positions,
                src_positions=src_positions,
                deterministic=deterministic,
                self_attn_mask=self_attn_mask,
                cross_attn_mask=cross_attn_mask,
                self_attn_cache=self_attention_cache[i],
                cross_attn_cache=cross_attention_cache[i],
                prefill=True,
            )

        # Final Norm
        x = self.norm(x)
        logits_BxTxCxV = self.logits_dense(x)

        return logits_BxTxCxV.to(torch.float32)


class DiaModel(nn.Module):
    """PyTorch Dia Model using DenseGeneral."""

    def __init__(self, config: DiaConfig):
        super().__init__()
        self.config = config
        self.encoder = Encoder(config)
        self.decoder = Decoder(config)

    def forward(
        self,
        src_BxS: torch.Tensor,
        tgt_BxTxC: torch.Tensor,
        src_positions: torch.Tensor | None = None,
        tgt_positions: torch.Tensor | None = None,
        enc_self_attn_mask: torch.Tensor | None = None,
        dec_self_attn_mask: torch.Tensor | None = None,
        dec_cross_attn_mask: torch.Tensor | None = None,
        enable_dropout: bool = True,
    ):
        deterministic = not enable_dropout

        # --- Encoder Pass ---
        encoder_out = self.encoder(
            x_ids=src_BxS,
            src_positions=src_positions,
            deterministic=deterministic,
            attn_mask=enc_self_attn_mask,
        )

        # --- Decoder Pass ---
        logits, _ = self.decoder(
            tgt_ids_BxTxC=tgt_BxTxC,
            encoder_out=encoder_out,
            tgt_positions=tgt_positions,
            src_positions=src_positions,
            deterministic=deterministic,
            self_attn_mask=dec_self_attn_mask,
            cross_attn_mask=dec_cross_attn_mask,
            precomputed_cross_attn_kv=None,
        )

        return logits