"""Miscovery model configuration"""
from transformers.configuration_utils import PretrainedConfig
class CustomTransformerConfig(PretrainedConfig):
"""
Configuration class for Miscovery custom transformer model.
"""
model_type = "miscovery"
def __init__(
self,
vocab_size=100000,
d_model=768,
num_heads=12,
d_ff=3072,
num_encoder_layers=12,
num_decoder_layers=12,
max_position_embeddings=2048,
dropout=0.1,
pad_token_id=0,
bos_token_id=2,
eos_token_id=3,
use_flash_attn=False,
**kwargs
):
"""
Initialize config for the Miscovery model.
Args:
vocab_size: Size of vocabulary
d_model: Dimension of model embeddings
num_heads: Number of attention heads
d_ff: Dimension of feed-forward layer
num_encoder_layers: Number of encoder layers
num_decoder_layers: Number of decoder layers
max_position_embeddings: Maximum sequence length
dropout: Dropout rate
pad_token_id: Token ID for padding
bos_token_id: Token ID for beginning of sequence
eos_token_id: Token ID for end of sequence
use_flash_attn: Whether to use flash attention
"""
self.vocab_size = vocab_size
self.d_model = d_model
self.num_heads = num_heads
self.d_ff = d_ff
self.num_encoder_layers = num_encoder_layers
self.num_decoder_layers = num_decoder_layers
self.max_position_embeddings = max_position_embeddings
self.dropout = dropout
self.use_flash_attn = use_flash_attn
# Make sure to call the parent class's __init__
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs,
) |