# model
#
# File size: 34,490 Bytes

"""Miscovery model implementation."""

import torch
import math
import torch.nn as nn
from transformers import PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import Seq2SeqLMOutput
from .configuration_miscovery import CustomTransformerConfig


class RMSNorm(nn.Module):
    """Root-mean-square normalisation with a learnable per-feature gain.

    Args:
        dim: Size of the last (feature) dimension of the inputs.
        eps: Constant added to the mean square before taking the square
            root, for numerical stability.
    """

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        """Divide ``x`` by the RMS of its last dimension and apply ``weight``."""
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        denom = torch.sqrt(mean_square + self.eps)
        return self.weight * (x / denom)


class LayerScale(nn.Module):
    """Learnable per-feature scaling of a tensor's last dimension.

    Args:
        dim: Number of features (length of the ``gamma`` vector).
        init_values: Initial value of every entry of ``gamma``.
    """

    def __init__(self, dim, init_values=1e-5):
        super().__init__()
        initial_gamma = torch.ones(dim) * init_values
        self.gamma = nn.Parameter(initial_gamma)

    def forward(self, x):
        """Return ``x`` multiplied element-wise (broadcast) by ``gamma``."""
        scaled = self.gamma * x
        return scaled


class RotaryEmbedding(nn.Module):
    """Cache of cosine/sine tables for rotary position embeddings.

    Args:
        dim: Per-head feature size the tables are built for.
        max_position_embeddings: Number of positions pre-computed at
            construction time; the cache grows on demand in ``forward``.
        base: Base of the geometric frequency progression.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000.0):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

        self.max_seq_len_cached = max_position_embeddings
        self._update_cos_sin_cache(max_position_embeddings)

    def _update_cos_sin_cache(self, seq_len):
        """(Re)build the ``cos_cached`` / ``sin_cached`` buffers for ``seq_len`` positions."""
        # Build the position index directly in the dtype of ``inv_freq`` so the
        # outer product never mixes integer and floating-point operands.
        t = torch.arange(
            seq_len, device=self.inv_freq.device, dtype=self.inv_freq.dtype
        )
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        # Non-persistent: the tables are derived data and stay out of checkpoints.
        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)

    def forward(self, x, seq_len=None):
        """Return ``(cos, sin)`` tables of shape ``(1, 1, seq_len, dim)``.

        Args:
            x: Tensor whose second-to-last axis is the sequence axis; it is
                only inspected when ``seq_len`` is not given.
            seq_len: Number of positions required. Defaults to
                ``x.shape[-2]``.
        """
        if seq_len is None:
            # The attention module passes (batch, heads, seq, depth) tensors, so
            # the sequence axis is the second-to-last one (axis 1 is the heads
            # axis). ``-2`` is also correct for (batch, seq, dim) inputs.
            seq_len = x.shape[-2]

        if seq_len > self.max_seq_len_cached:
            self.max_seq_len_cached = seq_len
            self._update_cos_sin_cache(seq_len)

        return (
            self.cos_cached[:, :, :seq_len, ...],
            self.sin_cached[:, :, :seq_len, ...]
        )


def rotate_half(x):
    """Swap the two halves of the last dimension, negating the second half.

    For ``x = [a, b]`` (split along the last axis) the result is ``[-b, a]``.
    """
    lower, upper = torch.chunk(x, chunks=2, dim=-1)
    return torch.cat([upper.neg(), lower], dim=-1)


def apply_rotary_embeddings(q, k, cos, sin):
    """Apply rotary position embeddings to query and key tensors.

    Args:
        q: Query tensor; axis 2 is treated as the sequence axis.
        k: Key tensor; axis 2 is treated as the sequence axis.
        cos: Cosine table, sliced along axis 2 to each tensor's length.
        sin: Sine table, sliced along axis 2 to each tensor's length.

    Returns:
        Tuple ``(q_embed, k_embed)`` of the rotated query and key.
    """

    def _rotate(states):
        # Each tensor uses the first ``length`` rows of the shared tables.
        length = states.shape[2]
        cos_part = cos[:, :, :length, :]
        sin_part = sin[:, :, :length, :]
        return (states * cos_part) + (rotate_half(states) * sin_part)

    return _rotate(q), _rotate(k)


# Optional dependency probe: try to import the fused attention kernel from the
# `flash_attn` package. FLASH_ATTENTION_AVAILABLE records whether the import
# succeeded; MultiHeadAttention reads it to decide whether its flash path may
# be enabled. When the import fails, `flash_attn_func` is left undefined.
try:
    from flash_attn import flash_attn_func
    FLASH_ATTENTION_AVAILABLE = True
except ImportError:
    FLASH_ATTENTION_AVAILABLE = False


def create_decoder_mask(tgt, pad_idx):
    """Build the boolean self-attention mask for decoder inputs.

    A position (query ``i``, key ``j``) is ``True`` when ``j <= i`` and the
    token at key position ``j`` is not ``pad_idx``.

    Args:
        tgt: Token-id tensor; axis 1 is the sequence axis.
        pad_idx: Id of the padding token.

    Returns:
        Boolean tensor of shape ``(batch, 1, tgt_len, tgt_len)``.
    """
    steps = tgt.size(1)
    key_is_real = (tgt != pad_idx).unsqueeze(1).unsqueeze(2)
    lower_triangle = torch.ones((steps, steps), device=tgt.device).tril().bool()
    return key_is_real & lower_triangle.unsqueeze(0)


class MultiHeadAttention(nn.Module):
    """Multi-head attention with rotary position embeddings and LayerScale.

    Args:
        d_model: Model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability for attention weights and the output.
        max_len: Number of positions pre-computed by the rotary cache.
        use_flash_attn: Request the fused ``flash_attn`` kernel. It is only
            honoured when the package imported successfully.
    """

    def __init__(self, d_model, num_heads, dropout=0.1, max_len=2048, use_flash_attn=False):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.use_flash_attn = use_flash_attn and FLASH_ATTENTION_AVAILABLE

        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.depth = d_model // num_heads

        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_model)

        self.attention_dropout = nn.Dropout(dropout)
        self.output_dropout = nn.Dropout(dropout)
        self.layer_scale = LayerScale(d_model, init_values=1e-5)
        self.rotary_emb = RotaryEmbedding(self.depth, max_position_embeddings=max_len)

    def split_heads(self, x, batch_size):
        """Reshape ``(batch, seq, d_model)`` into ``(batch, heads, seq, depth)``."""
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.transpose(1, 2)

    def _standard_attention(self, q, k, v, mask):
        """Scaled dot-product attention on ``(batch, heads, seq, depth)`` tensors."""
        attn_logits = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.depth)
        if mask is not None:
            # ``mask`` is boolean with True marking positions that may be attended.
            attn_logits = attn_logits.masked_fill(~mask, float('-inf'))
        attn_weights = self.attention_dropout(torch.nn.functional.softmax(attn_logits, dim=-1))
        return torch.matmul(attn_weights, v)

    def _flash_attention(self, q, k, v):
        """Run the fused kernel; inputs and output are ``(batch, heads, seq, depth)``."""
        # The kernel works on (batch, seq, heads, depth) and applies dropout
        # itself, so dropout must be switched off explicitly outside training.
        dropout_p = self.attention_dropout.p if self.training else 0.0
        fused = flash_attn_func(
            q.transpose(1, 2),
            k.transpose(1, 2),
            v.transpose(1, 2),
            dropout_p=dropout_p,
            causal=False
        )
        return fused.transpose(1, 2)

    def forward(self, query, key=None, value=None, mask=None):
        """Attend from ``query`` to ``key``/``value``.

        Args:
            query: Tensor of shape ``(batch, q_len, d_model)``.
            key: Optional key source; defaults to ``query`` (self-attention).
            value: Optional value source; defaults to ``query``.
            mask: Optional boolean mask broadcastable to
                ``(batch, heads, q_len, k_len)``; True keeps a position.

        Returns:
            Tensor of shape ``(batch, q_len, d_model)`` after the output
            projection, LayerScale and output dropout.
        """
        batch_size = query.shape[0]

        if key is None:
            key = query
        if value is None:
            value = query

        q = self.split_heads(self.q_proj(query), batch_size)
        k = self.split_heads(self.k_proj(key), batch_size)
        v = self.split_heads(self.v_proj(value), batch_size)

        max_seq_len = max(q.shape[2], k.shape[2])
        cos, sin = self.rotary_emb(q, seq_len=max_seq_len)
        q, k = apply_rotary_embeddings(q, k, cos, sin)

        # ``flash_attn_func`` takes no arbitrary attention mask and needs
        # half-precision CUDA tensors, so the fused path is limited to
        # unmasked calls that satisfy those requirements.
        can_use_flash = (
            self.use_flash_attn
            and mask is None
            and q.is_cuda
            and q.dtype in (torch.float16, torch.bfloat16)
        )

        attn_output = None
        if can_use_flash:
            try:
                attn_output = self._flash_attention(q, k, v)
            except Exception as exc:  # best-effort: fall back to the reference path
                import warnings

                warnings.warn(
                    f"flash attention failed ({exc!r}); using standard attention"
                )
                attn_output = None

        if attn_output is None:
            attn_output = self._standard_attention(q, k, v, mask)

        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        output = self.out_proj(attn_output)
        output = self.layer_scale(output)
        return self.output_dropout(output)


class FeedForward(nn.Module):
    """Sigmoid-gated feed-forward block followed by LayerScale.

    Args:
        d_model: Input and output width.
        d_ff: Hidden width; defaults to ``4 * d_model`` when ``None``.
        dropout: Dropout probability applied to the gated hidden activations.
    """

    def __init__(self, d_model, d_ff=None, dropout=0.1):
        super().__init__()
        hidden_dim = 4 * d_model if d_ff is None else d_ff

        self.w1 = nn.Linear(d_model, hidden_dim)
        self.w2 = nn.Linear(d_model, hidden_dim)
        self.w3 = nn.Linear(hidden_dim, d_model)
        self.dropout = nn.Dropout(dropout)
        self.layer_scale = LayerScale(d_model, init_values=1e-5)

    def forward(self, x):
        """Compute ``layer_scale(w3(dropout(w1(x) * sigmoid(w2(x)))))``."""
        values = self.w1(x)
        gate = torch.sigmoid(self.w2(x))
        hidden = self.dropout(values * gate)
        return self.layer_scale(self.w3(hidden))


class TransformerEncoderLayer(nn.Module):
    """Pre-norm encoder block: self-attention then feed-forward, each residual.

    Args:
        d_model: Model width.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability used throughout the block.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.norm1 = RMSNorm(d_model)
        self.norm2 = RMSNorm(d_model)
        self.self_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is forwarded to self-attention."""
        # Attention sub-layer: normalise, attend, add back to the residual stream.
        x = x + self.dropout1(self.self_attention(self.norm1(x), mask=mask))
        # Feed-forward sub-layer, same residual pattern.
        x = x + self.dropout2(self.feed_forward(self.norm2(x)))
        return x


class TransformerEncoder(nn.Module):
    """Token embedding followed by a stack of encoder layers and a final norm.

    Args:
        d_model: Model width.
        num_heads: Number of attention heads per layer.
        d_ff: Hidden width of each feed-forward sub-layer.
        num_layers: Number of stacked encoder layers.
        vocab_size: Size of the embedding table.
        max_len: Accepted for signature compatibility; not used here.
        dropout: Dropout probability.
    """

    def __init__(self, d_model, num_heads, d_ff, num_layers, vocab_size, max_len, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.embed_scale = math.sqrt(d_model)
        self.embed_dropout = nn.Dropout(dropout)

        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerEncoderLayer(d_model, num_heads, d_ff, dropout))
        self.layers = nn.ModuleList(blocks)
        self.norm = RMSNorm(d_model)

    def forward(self, x, mask=None):
        """Encode token ids ``x``; ``mask`` is passed to every layer."""
        hidden = self.embed_dropout(self.embedding(x) * self.embed_scale)
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)


class TransformerDecoderLayer(nn.Module):
    """Pre-norm decoder block: self-attention, cross-attention, feed-forward.

    Args:
        d_model: Model width.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability used throughout the block.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.norm1 = RMSNorm(d_model)
        self.norm2 = RMSNorm(d_model)
        self.norm3 = RMSNorm(d_model)

        self.self_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        """Run the block.

        Args:
            x: Decoder hidden states.
            enc_output: Encoder output used as key and value in cross-attention.
            src_mask: Mask forwarded to cross-attention.
            tgt_mask: Mask forwarded to self-attention.
        """
        # 1) Masked self-attention over the decoder states.
        x = x + self.dropout1(self.self_attention(self.norm1(x), mask=tgt_mask))

        # 2) Cross-attention: queries from the decoder, keys/values from the encoder.
        attended = self.cross_attention(
            query=self.norm2(x),
            key=enc_output,
            value=enc_output,
            mask=src_mask
        )
        x = x + self.dropout2(attended)

        # 3) Position-wise feed-forward.
        x = x + self.dropout3(self.feed_forward(self.norm3(x)))
        return x


class TransformerDecoder(nn.Module):
    """Token embedding, a stack of decoder layers, final norm and vocab projection.

    Args:
        d_model: Model width.
        num_heads: Number of attention heads per layer.
        d_ff: Hidden width of each feed-forward sub-layer.
        num_layers: Number of stacked decoder layers.
        vocab_size: Size of the embedding table and of the output projection.
        max_len: Accepted for signature compatibility; not used here.
        dropout: Dropout probability.
    """

    def __init__(self, d_model, num_heads, d_ff, num_layers, vocab_size, max_len, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.embed_scale = math.sqrt(d_model)
        self.embed_dropout = nn.Dropout(dropout)

        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerDecoderLayer(d_model, num_heads, d_ff, dropout))
        self.layers = nn.ModuleList(blocks)
        self.norm = RMSNorm(d_model)
        self.output_projection = nn.Linear(d_model, vocab_size)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        """Decode token ids ``x`` against ``enc_output`` and return vocabulary logits."""
        hidden = self.embed_dropout(self.embedding(x) * self.embed_scale)
        for block in self.layers:
            hidden = block(hidden, enc_output, src_mask, tgt_mask)
        return self.output_projection(self.norm(hidden))


class Transformer(nn.Module):
    """Encoder-decoder transformer that derives its masks from the padding id.

    Args:
        d_model: Model width.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layers.
        num_encoder_layers: Depth of the encoder stack.
        num_decoder_layers: Depth of the decoder stack.
        vocab_size: Vocabulary size shared by encoder and decoder.
        max_len: Forwarded to the encoder and decoder constructors.
        pad_idx: Token id treated as padding when building masks.
        dropout: Dropout probability.
    """

    def __init__(self, d_model, num_heads, d_ff, num_encoder_layers, num_decoder_layers,
                 vocab_size, max_len, pad_idx, dropout=0.1):
        super().__init__()
        shared_tail = (vocab_size, max_len, dropout)
        self.encoder = TransformerEncoder(d_model, num_heads, d_ff, num_encoder_layers, *shared_tail)
        self.decoder = TransformerDecoder(d_model, num_heads, d_ff, num_decoder_layers, *shared_tail)
        self.pad_idx = pad_idx

    def create_masks(self, src, tgt):
        """Return ``(src_pad_mask, tgt_mask)`` for source and target token ids.

        ``src_pad_mask`` marks non-padding source tokens; ``tgt_mask`` combines
        the target's non-padding key positions with a lower-triangular mask.
        Non-tensor inputs are converted with ``torch.tensor``.
        """
        src, tgt = (
            ids if isinstance(ids, torch.Tensor) else torch.tensor(ids)
            for ids in (src, tgt)
        )

        src_pad_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)
        tgt_keys = (tgt != self.pad_idx).unsqueeze(1).unsqueeze(2)

        steps = tgt.size(1)
        causal = torch.ones((steps, steps), device=tgt.device).tril().bool()

        return src_pad_mask, tgt_keys & causal.unsqueeze(0)

    def forward(self, src, tgt):
        """Encode ``src``, decode ``tgt`` against it and return the decoder output."""
        src_mask, tgt_mask = self.create_masks(src, tgt)
        memory = self.encoder(src, src_mask)
        return self.decoder(tgt, memory, src_mask, tgt_mask)


class CustomTransformerModel(PreTrainedModel):
    config_class = CustomTransformerConfig
    main_input_name = "input_ids"
    
    def __init__(self, config):
        """Build the underlying ``Transformer`` from ``config``.

        Reads ``d_model``, ``num_heads``, ``d_ff``, ``num_encoder_layers``,
        ``num_decoder_layers``, ``vocab_size``, ``max_position_embeddings``,
        ``pad_token_id`` and ``dropout`` from the configuration object.
        """
        super().__init__(config)

        transformer_kwargs = dict(
            d_model=config.d_model,
            num_heads=config.num_heads,
            d_ff=config.d_ff,
            num_encoder_layers=config.num_encoder_layers,
            num_decoder_layers=config.num_decoder_layers,
            vocab_size=config.vocab_size,
            max_len=config.max_position_embeddings,
            pad_idx=config.pad_token_id,
            dropout=config.dropout,
        )
        self.model = Transformer(**transformer_kwargs)

        # Top-level aliases of the encoder/decoder sub-modules. They are
        # registered under these names as well, so the attribute names must
        # not change.
        self.encoder = self.model.encoder
        self.decoder = self.model.decoder

        # Keep the configuration reachable as an attribute.
        self.config = config

        # Expose the tokenizer-level generation loop under a second name.
        self.generate_response = self.custom_generate

    def forward(
            self,
            input_ids=None,
            decoder_input_ids=None,
            attention_mask=None,
            decoder_attention_mask=None,
            labels=None,
            label_smoothing=0.1,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            **kwargs
    ):
        """Run the encoder-decoder and optionally compute the LM loss.

        Args:
            input_ids: Source token ids fed to the encoder.
            decoder_input_ids: Target-side token ids fed to the decoder. When
                omitted they are derived from ``labels`` via ``_shift_right``,
                or fall back to ``input_ids`` if no labels are given.
            attention_mask: Accepted for API compatibility; masks are derived
                from the padding token id instead.
            decoder_attention_mask: Accepted for API compatibility; unused.
            labels: Target token ids for the loss. Positions equal to
                ``config.pad_token_id`` are ignored.
            label_smoothing: Label-smoothing factor; values ``<= 0`` disable it.
            output_attentions: Accepted for API compatibility; unused.
            output_hidden_states: Accepted for API compatibility; unused.
            return_dict: Return a ``Seq2SeqLMOutput`` instead of a tuple;
                defaults to ``config.use_return_dict``.

        Returns:
            ``Seq2SeqLMOutput`` with ``loss`` and ``logits``, or, when
            ``return_dict`` is false, ``(loss, logits)`` / ``logits``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Track how the decoder inputs relate to the labels so that the loss
        # applies exactly one shift between inputs and targets.
        inputs_shifted_from_labels = False
        if decoder_input_ids is None:
            if labels is not None:
                decoder_input_ids = self._shift_right(labels)
                inputs_shifted_from_labels = True
            else:
                decoder_input_ids = input_ids

        outputs = self.model(src=input_ids, tgt=decoder_input_ids)

        loss = None
        if labels is not None:
            # A smoothing factor of 0.0 is plain cross-entropy, so one
            # constructor call covers both cases.
            loss_fct = nn.CrossEntropyLoss(
                ignore_index=self.config.pad_token_id,
                label_smoothing=label_smoothing if label_smoothing > 0 else 0.0
            )

            if inputs_shifted_from_labels:
                # Inputs were already shifted right, so the logits at position
                # t predict labels[t]; shifting again would train every
                # position on the token two steps ahead.
                loss_logits, loss_targets = outputs, labels
            else:
                # Caller-supplied decoder inputs are assumed to be aligned with
                # ``labels`` (logits at t predict labels[t + 1]).
                loss_logits = outputs[:, :-1, :]
                loss_targets = labels[:, 1:]

            loss = loss_fct(
                loss_logits.reshape(-1, loss_logits.size(-1)),
                loss_targets.reshape(-1)
            )

        if not return_dict:
            return (loss, outputs) if loss is not None else outputs

        return Seq2SeqLMOutput(
            loss=loss,
            logits=outputs,
            past_key_values=None,
            decoder_hidden_states=None,
            decoder_attentions=None,
            cross_attentions=None,
            encoder_last_hidden_state=None,
            encoder_hidden_states=None,
            encoder_attentions=None,
        )

    def _shift_right(self, input_ids):
        """Return ``input_ids`` shifted one step right with BOS in column 0.

        The last token of every row is dropped and ``config.bos_token_id`` is
        written into the first position.
        """
        shifted = torch.zeros_like(input_ids)
        shifted[:, 1:] = input_ids[:, :-1]
        shifted[:, 0] = self.config.bos_token_id
        return shifted

    def prepare_inputs_for_generation(
            self,
            decoder_input_ids,
            past_key_values=None,
            attention_mask=None,
            use_cache=None,
            encoder_outputs=None,
            **kwargs
    ):
        """Assemble the inputs for one decoding step.

        Encoder outputs are computed here only when none were supplied and
        source ``input_ids`` are present in ``kwargs``. When
        ``past_key_values`` is given, only the last decoder token is kept.

        Returns:
            Dict with keys ``decoder_input_ids``, ``encoder_outputs``,
            ``attention_mask``, ``use_cache``, ``past_key_values`` and
            ``input_ids`` (the latter may be ``None``).
        """
        source_ids = kwargs.get("input_ids")

        # With a cache, earlier decoder positions do not need to be re-fed.
        if past_key_values is not None:
            decoder_input_ids = decoder_input_ids[:, -1:]

        must_encode = encoder_outputs is None and source_ids is not None
        if must_encode:
            pad_mask = (source_ids != self.config.pad_token_id).unsqueeze(1).unsqueeze(2)
            encoder_outputs = self.model.encoder(source_ids, pad_mask)

        prepared = {}
        prepared["decoder_input_ids"] = decoder_input_ids
        prepared["encoder_outputs"] = encoder_outputs
        prepared["attention_mask"] = attention_mask
        prepared["use_cache"] = use_cache
        prepared["past_key_values"] = past_key_values
        # Passed through so callers can rebuild the source padding mask.
        prepared["input_ids"] = source_ids
        return prepared

    def _reorder_cache(self, past, beam_idx):
        """Return ``past`` unchanged.

        No key/value cache is implemented in this model, so there is nothing
        to reorder by ``beam_idx``.
        """
        unchanged = past
        return unchanged
        
    def generate(
            self,
            input_ids=None,
            attention_mask=None,
            max_length=20,
            min_length=0,
            do_sample=True,
            early_stopping=False,
            num_beams=1,
            temperature=1.0,
            top_k=50,
            top_p=0.95,
            repetition_penalty=1.0,
            bad_words_ids=None,
            bos_token_id=None,
            pad_token_id=None,
            eos_token_id=None,
            length_penalty=1.0,
            no_repeat_ngram_size=0,
            encoder_no_repeat_ngram_size=0,
            num_return_sequences=1,
            decoder_start_token_id=None,
            use_cache=True,
            **model_kwargs
    ):
        """Generate token ids with greedy search, sampling or beam search.

        Strategy selection: greedy when ``num_beams == 1`` and not
        ``do_sample``; sampling whenever ``do_sample`` is true; beam search
        otherwise.

        Args:
            input_ids: Source token ids. May be ``None`` only when
                ``encoder_outputs`` is supplied in ``model_kwargs``.
            max_length: Maximum length of the generated sequence, including
                the start token.
            do_sample, num_beams, temperature, top_k, top_p,
            repetition_penalty, length_penalty, early_stopping: Forwarded to
                the selected strategy.
            bos_token_id, pad_token_id, eos_token_id: Default to the values
                in ``self.config``.
            decoder_start_token_id: First decoder token; defaults to
                ``bos_token_id``.
            attention_mask, min_length, bad_words_ids, no_repeat_ngram_size,
            encoder_no_repeat_ngram_size, num_return_sequences, use_cache:
                Accepted for API compatibility; not used by this
                implementation.
            **model_kwargs: May carry precomputed ``encoder_outputs``.

        Returns:
            Tensor of generated decoder token ids, one row per input.

        Raises:
            ValueError: If neither ``input_ids`` nor ``encoder_outputs`` is
                given, or no start token id can be determined.
        """
        # Set defaults from config if not provided
        if pad_token_id is None:
            pad_token_id = self.config.pad_token_id
        if bos_token_id is None:
            bos_token_id = self.config.bos_token_id
        if eos_token_id is None:
            eos_token_id = self.config.eos_token_id

        if input_ids is None and "encoder_outputs" not in model_kwargs:
            raise ValueError("Either `input_ids` or `encoder_outputs` must be provided.")

        # Encode the source once; every decoding step reuses the result.
        if "encoder_outputs" not in model_kwargs:
            src_pad_mask = (input_ids != pad_token_id).unsqueeze(1).unsqueeze(2)
            model_kwargs["encoder_outputs"] = self.model.encoder(input_ids, src_pad_mask)

        # Hand the source ids to the search loops so they can mask padded
        # encoder positions in cross-attention.
        if input_ids is not None:
            model_kwargs["input_ids"] = input_ids

        if decoder_start_token_id is None:
            decoder_start_token_id = bos_token_id
        if decoder_start_token_id is None:
            raise ValueError(
                "No start token: set `decoder_start_token_id` or `bos_token_id`."
            )

        if input_ids is not None:
            batch_size, id_dtype, device = input_ids.shape[0], input_ids.dtype, input_ids.device
        else:
            encoder_outputs = model_kwargs["encoder_outputs"]
            batch_size, id_dtype, device = encoder_outputs.shape[0], torch.long, encoder_outputs.device

        decoder_input_ids = torch.full(
            (batch_size, 1),
            decoder_start_token_id,
            dtype=id_dtype,
            device=device
        )

        # Simple greedy decoding
        if num_beams == 1 and not do_sample:
            return self._greedy_search(
                decoder_input_ids,
                max_length=max_length,
                pad_token_id=pad_token_id,
                eos_token_id=eos_token_id,
                **model_kwargs
            )

        # Temperature sampling
        if do_sample:
            return self._sample(
                decoder_input_ids,
                max_length=max_length,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                pad_token_id=pad_token_id,
                eos_token_id=eos_token_id,
                **model_kwargs
            )

        # Beam search (simple implementation)
        return self._beam_search(
            decoder_input_ids,
            max_length=max_length,
            num_beams=num_beams,
            length_penalty=length_penalty,
            early_stopping=early_stopping,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            **model_kwargs
        )
    
    def _greedy_search(
            self,
            decoder_input_ids,
            max_length,
            pad_token_id=None,
            eos_token_id=None,
            **model_kwargs
    ):
        """Greedy decoding: append the arg-max token at every step.

        Args:
            decoder_input_ids: Initial decoder tokens, one row per sequence.
            max_length: Maximum total decoder length; at most
                ``max_length - 1`` tokens are appended.
            pad_token_id: Token written for rows that already emitted EOS and
                used to build the decoder mask.
            eos_token_id: Token that marks a row as finished; ``None``
                disables early termination.
            **model_kwargs: ``encoder_outputs`` and, optionally, the source
                ``input_ids`` used to mask padded encoder positions.

        Returns:
            Tensor of decoder token ids including the initial tokens.
        """
        batch_size = decoder_input_ids.shape[0]

        # 1 while a row is still generating, 0 once it has produced EOS.
        unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=decoder_input_ids.device)

        # Encoder outputs and the source mask do not change between steps, so
        # resolve them once. ``input_ids`` may legitimately be None, in which
        # case cross-attention runs without a source mask.
        model_inputs = self.prepare_inputs_for_generation(
            decoder_input_ids,
            **model_kwargs
        )
        encoder_outputs = model_inputs["encoder_outputs"]
        source_ids = model_inputs.get("input_ids")
        src_mask = None
        if source_ids is not None:
            src_mask = (source_ids != pad_token_id).unsqueeze(1).unsqueeze(2)

        for _ in range(max_length - 1):
            tgt_mask = create_decoder_mask(decoder_input_ids, pad_token_id)
            outputs = self.model.decoder(decoder_input_ids, encoder_outputs, src_mask, tgt_mask)

            # Logits of the last position decide the next token.
            next_tokens = torch.argmax(outputs[:, -1, :], dim=-1)

            if eos_token_id is not None:
                # Finished rows keep receiving the padding token.
                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
                unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long())

            decoder_input_ids = torch.cat([decoder_input_ids, next_tokens.unsqueeze(-1)], dim=-1)

            # Stop when all sequences are finished
            if unfinished_sequences.max() == 0:
                break

        return decoder_input_ids
    
    def _sample(
            self,
            decoder_input_ids,
            max_length,
            temperature=1.0,
            top_k=50,
            top_p=0.95,
            repetition_penalty=1.0,
            pad_token_id=None,
            eos_token_id=None,
            **model_kwargs
    ):
        """Temperature sampling with repetition penalty, top-k and top-p filtering.

        Args:
            decoder_input_ids: Initial decoder tokens, one row per sequence.
            max_length: Maximum total decoder length; at most
                ``max_length - 1`` tokens are appended.
            temperature: Positive divisor applied to the logits.
            top_k: Keep only the ``top_k`` highest logits (``<= 0`` disables).
            top_p: Nucleus threshold (``>= 1.0`` disables).
            repetition_penalty: Penalty for tokens already present in the
                decoder sequence (``1.0`` disables).
            pad_token_id: Token written for rows that already emitted EOS and
                used to build the decoder mask.
            eos_token_id: Token that marks a row as finished; ``None``
                disables early termination.
            **model_kwargs: ``encoder_outputs`` and, optionally, the source
                ``input_ids`` used to mask padded encoder positions.

        Returns:
            Tensor of decoder token ids including the initial tokens.

        Raises:
            ValueError: If ``temperature`` is not strictly positive.
        """
        if temperature <= 0:
            raise ValueError(f"`temperature` must be > 0, got {temperature}")

        batch_size = decoder_input_ids.shape[0]

        # 1 while a row is still generating, 0 once it has produced EOS.
        unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=decoder_input_ids.device)

        # Encoder outputs and the source mask are loop-invariant. ``input_ids``
        # may be None, in which case cross-attention runs unmasked.
        model_inputs = self.prepare_inputs_for_generation(
            decoder_input_ids,
            **model_kwargs
        )
        encoder_outputs = model_inputs["encoder_outputs"]
        source_ids = model_inputs.get("input_ids")
        src_mask = None
        if source_ids is not None:
            src_mask = (source_ids != pad_token_id).unsqueeze(1).unsqueeze(2)

        for _ in range(max_length - 1):
            tgt_mask = create_decoder_mask(decoder_input_ids, pad_token_id)
            outputs = self.model.decoder(decoder_input_ids, encoder_outputs, src_mask, tgt_mask)

            # Division yields a fresh tensor, so later edits never touch ``outputs``.
            next_token_logits = outputs[:, -1, :] / temperature

            if repetition_penalty != 1.0:
                # Penalise every token already in the sequence exactly once.
                # Dividing a negative logit would raise its probability, so
                # negative scores are multiplied instead.
                seen_scores = torch.gather(next_token_logits, 1, decoder_input_ids)
                seen_scores = torch.where(
                    seen_scores < 0,
                    seen_scores * repetition_penalty,
                    seen_scores / repetition_penalty,
                )
                next_token_logits = next_token_logits.scatter(1, decoder_input_ids, seen_scores)

            if top_k > 0:
                k = min(top_k, next_token_logits.size(-1))
                kth_best = torch.topk(next_token_logits, k)[0][..., -1, None]
                next_token_logits = next_token_logits.masked_fill(
                    next_token_logits < kth_best, -float("Inf")
                )

            if top_p < 1.0:
                sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
                cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)

                # Remove tokens with cumulative probability above the threshold
                sorted_indices_to_remove = cumulative_probs > top_p

                # Shift right so the first token crossing the threshold is kept.
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0

                # Map the removal flags back to vocabulary order.
                indices_to_remove = sorted_indices_to_remove.scatter(
                    dim=-1, index=sorted_indices, src=sorted_indices_to_remove
                )
                next_token_logits = next_token_logits.masked_fill(indices_to_remove, -float("Inf"))

            probs = torch.softmax(next_token_logits, dim=-1)
            next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)

            if eos_token_id is not None:
                # Finished rows keep receiving the padding token.
                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
                unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long())

            decoder_input_ids = torch.cat([decoder_input_ids, next_tokens.unsqueeze(-1)], dim=-1)

            # Stop when all sequences are finished
            if unfinished_sequences.max() == 0:
                break

        return decoder_input_ids
        
    def _beam_search(
            self,
            decoder_input_ids,
            max_length,
            num_beams=4,
            length_penalty=1.0,
            early_stopping=False,
            pad_token_id=None,
            eos_token_id=None,
            **model_kwargs
    ):
        """Simplified beam search returning the best hypothesis per input.

        Every input row is expanded to ``num_beams`` hypotheses. A hypothesis
        that emits ``eos_token_id`` is frozen: it keeps its score and is
        extended with ``pad_token_id`` only. At the end the hypothesis with
        the highest ``score / length ** length_penalty`` is returned per row.

        Args:
            decoder_input_ids: Initial decoder tokens, one row per input.
            max_length: Maximum total decoder length; at most
                ``max_length - 1`` tokens are appended.
            num_beams: Number of hypotheses kept per input.
            length_penalty: Exponent of the length normalisation.
            early_stopping: Stop as soon as the top-scoring beam of every
                input is finished; otherwise run until all beams are finished
                or ``max_length`` is reached.
            pad_token_id: Padding token used for frozen beams and masks.
            eos_token_id: Token that finishes a hypothesis; ``None`` disables
                finishing.
            **model_kwargs: ``encoder_outputs`` and, optionally, the source
                ``input_ids`` used to mask padded encoder positions.

        Returns:
            Tensor with one generated row per input.
        """
        batch_size = decoder_input_ids.shape[0]
        device = decoder_input_ids.device

        encoder_outputs = model_kwargs.get("encoder_outputs")
        source_ids = model_kwargs.get("input_ids")

        # The decoder runs on batch_size * num_beams rows, so everything it
        # attends to must be repeated beam-wise as well.
        if encoder_outputs is not None:
            encoder_outputs = encoder_outputs.repeat_interleave(num_beams, dim=0)
        src_mask = None
        if source_ids is not None:
            src_mask = (source_ids != pad_token_id).unsqueeze(1).unsqueeze(2)
            src_mask = src_mask.repeat_interleave(num_beams, dim=0)

        decoder_input_ids = decoder_input_ids.repeat_interleave(num_beams, dim=0)

        # All beams of a row start identical; giving only the first a usable
        # score prevents the first step from selecting duplicates.
        beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=device)
        beam_scores[:, 1:] = -1e9
        beam_scores = beam_scores.view(-1)

        finished = torch.zeros(batch_size * num_beams, dtype=torch.bool, device=device)
        # Offset that converts a within-row beam index into a flat row index.
        batch_offsets = (torch.arange(batch_size, device=device) * num_beams).unsqueeze(1)
        filler_token = pad_token_id if pad_token_id is not None else eos_token_id

        for _ in range(max_length - 1):
            tgt_mask = create_decoder_mask(decoder_input_ids, pad_token_id)
            outputs = self.model.decoder(decoder_input_ids, encoder_outputs, src_mask, tgt_mask)

            next_token_scores = torch.log_softmax(outputs[:, -1, :].float(), dim=-1)
            vocab_size = next_token_scores.size(-1)

            if filler_token is not None and finished.any():
                # Frozen beams may only continue with the filler token at zero
                # cost, which keeps their accumulated score unchanged.
                frozen_scores = torch.full_like(next_token_scores, float("-inf"))
                frozen_scores[:, filler_token] = 0.0
                next_token_scores = torch.where(
                    finished.unsqueeze(1), frozen_scores, next_token_scores
                )

            candidate_scores = (next_token_scores + beam_scores[:, None]).view(
                batch_size, num_beams * vocab_size
            )
            next_scores, next_tokens = torch.topk(
                candidate_scores, num_beams, dim=1, largest=True, sorted=True
            )

            # Integer division avoids float rounding on large flat indices.
            next_beam_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
            next_token_indices = next_tokens % vocab_size

            flat_beam_indices = (next_beam_indices + batch_offsets).view(-1)
            decoder_input_ids = torch.cat([
                decoder_input_ids[flat_beam_indices],
                next_token_indices.view(-1, 1)
            ], dim=-1)
            beam_scores = next_scores.view(-1)

            finished = finished[flat_beam_indices]
            if eos_token_id is not None:
                finished = finished | next_token_indices.view(-1).eq(eos_token_id)

            done_per_row = finished.view(batch_size, num_beams)
            # topk returns beams sorted by score, so column 0 is the current best.
            if done_per_row.all() or (early_stopping and done_per_row[:, 0].all()):
                break

        # Length-normalised selection of the best hypothesis per input row.
        if pad_token_id is not None:
            lengths = (decoder_input_ids != pad_token_id).sum(dim=-1)
        else:
            lengths = torch.full(
                (decoder_input_ids.shape[0],), decoder_input_ids.shape[1], device=device
            )
        lengths = lengths.clamp(min=1).to(beam_scores.dtype)
        normalised = (beam_scores / lengths.pow(length_penalty)).view(batch_size, num_beams)
        best_beam = normalised.argmax(dim=1)

        return decoder_input_ids[best_beam + batch_offsets.squeeze(1)]

    def custom_generate(
            self,
            prompt,
            tokenizer,
            max_length=512,
            device='cuda',
            temperature=1.0,
            top_k=50,
            top_p=0.95,
            repetition_penalty=1.0,
            do_sample=True
    ):
        """Generate text for a single prompt and return the decoded string.

        The model is switched to eval mode. The prompt is tokenised, padded
        and truncated to 512 tokens. Decoding starts from the tokenizer's BOS
        token (falling back to CLS, then to id 1) and stops at an EOS/SEP
        token, when the same token is produced four times in a row, or after
        ``max_length`` generated tokens.

        Args:
            prompt: Input text (a single string).
            tokenizer: Tokenizer providing ``__call__``, ``decode`` and the
                special-token id attributes read below.
            max_length: Maximum number of tokens to generate.
            device: Device the input tensors are moved to; the model must
                already live there.
            temperature: Divisor applied to the logits.
            top_k: Keep only the ``top_k`` highest logits (``<= 0`` disables).
            top_p: Nucleus threshold (``>= 1.0`` disables).
            repetition_penalty: Penalty for already generated tokens
                (``1.0`` disables).
            do_sample: Sample from the filtered distribution; otherwise take
                the arg-max.

        Returns:
            The generated text with special tokens removed.
        """
        self.eval()

        input_ids = tokenizer(
            prompt,
            return_tensors="pt",
            max_length=512,
            padding='max_length',
            truncation=True
        )["input_ids"].to(device)

        bos_token_id = tokenizer.bos_token_id
        if bos_token_id is None:
            if tokenizer.cls_token_id is not None:
                bos_token_id = tokenizer.cls_token_id
            else:
                bos_token_id = 1  # Fallback

        decoder_input = torch.tensor([[bos_token_id]], device=device)
        generated_tokens = [bos_token_id]

        # Tokens that end generation.
        stop_tokens = {
            token_id
            for token_id in (tokenizer.eos_token_id, tokenizer.sep_token_id)
            if token_id is not None
        }

        # Inference only: skip autograd bookkeeping for the whole loop.
        with torch.no_grad():
            # The source never changes, so encode it once instead of once per
            # generated token (dropout is off in eval mode, so the result is
            # the same).
            src_mask, _ = self.model.create_masks(input_ids, decoder_input)
            enc_output = self.model.encoder(input_ids, src_mask)

            for _ in range(max_length):
                _, tgt_mask = self.model.create_masks(input_ids, decoder_input)
                dec_output = self.model.decoder(decoder_input, enc_output, src_mask, tgt_mask)

                # Apply temperature
                next_token_logits = dec_output[:, -1, :].squeeze(0) / temperature

                # Apply repetition penalty once per distinct generated token.
                # Negative logits are multiplied, because dividing them would
                # make the token more likely.
                if repetition_penalty != 1.0:
                    seen = torch.tensor(sorted(set(generated_tokens)), device=next_token_logits.device)
                    seen_scores = next_token_logits[seen]
                    next_token_logits[seen] = torch.where(
                        seen_scores < 0,
                        seen_scores * repetition_penalty,
                        seen_scores / repetition_penalty,
                    )

                # Filter with top-k (clamped to the vocabulary size)
                if top_k > 0:
                    k = min(top_k, next_token_logits.size(-1))
                    indices_to_remove = next_token_logits < torch.topk(next_token_logits, k)[0][..., -1, None]
                    next_token_logits[indices_to_remove] = float('-inf')

                # Filter with top-p (nucleus sampling)
                if top_p < 1.0:
                    sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
                    cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)

                    # Remove tokens with cumulative probability above the threshold
                    sorted_indices_to_remove = cumulative_probs > top_p

                    # Shift right so the first token crossing the threshold is kept.
                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                    sorted_indices_to_remove[..., 0] = 0

                    indices_to_remove = sorted_indices[sorted_indices_to_remove]
                    next_token_logits[indices_to_remove] = float('-inf')

                # Sample or greedy selection
                if do_sample:
                    probs = torch.softmax(next_token_logits, dim=-1)
                    next_token = torch.multinomial(probs, num_samples=1)
                else:
                    next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(0)

                next_token = next_token.unsqueeze(0)
                decoder_input = torch.cat([decoder_input, next_token], dim=1)
                token_id = next_token.item()
                generated_tokens.append(token_id)

                # Check stop condition
                if token_id in stop_tokens:
                    break

                # Stop if the same token was generated 4 times in a row.
                if len(generated_tokens) >= 4 and len(set(generated_tokens[-4:])) == 1:
                    break

        output_text = tokenizer.decode(decoder_input[0].tolist(), skip_special_tokens=True)
        return output_text