configuration_miscovery.py
"""Miscovery model configuration"""
from transformers.configuration_utils import PretrainedConfig
class CustomTransformerConfig(PretrainedConfig):
"""
Configuration class for Miscovery custom transformer model.
"""
model_type = "miscovery"
def __init__(
self,
vocab_size=100000,
d_model=768,
num_heads=12,
d_ff=3072,
num_encoder_layers=12,
num_decoder_layers=12,
max_position_embeddings=2048,
dropout=0.1,
pad_token_id=0,
bos_token_id=2,
eos_token_id=3,
use_flash_attn=False,
**kwargs
):
"""
Initialize config for the Miscovery model.
Args:
vocab_size: Size of vocabulary
d_model: Dimension of model embeddings
num_heads: Number of attention heads
d_ff: Dimension of feed-forward layer
num_encoder_layers: Number of encoder layers
num_decoder_layers: Number of decoder layers
max_position_embeddings: Maximum sequence length
dropout: Dropout rate
pad_token_id: Token ID for padding
bos_token_id: Token ID for beginning of sequence
eos_token_id: Token ID for end of sequence
use_flash_attn: Whether to use flash attention
"""
self.vocab_size = vocab_size
self.d_model = d_model
self.num_heads = num_heads
self.d_ff = d_ff
self.num_encoder_layers = num_encoder_layers
self.num_decoder_layers = num_decoder_layers
self.max_position_embeddings = max_position_embeddings
self.dropout = dropout
self.use_flash_attn = use_flash_attn
# Make sure to call the parent class's __init__
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs,
)
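
A minimal usage sketch, not part of the original file: assuming the module is importable as configuration_miscovery (matching the filename above) and that the config is meant to work with transformers' Auto classes, it can be registered, instantiated, and round-tripped to disk as shown below. The overridden hyperparameters and the local path are illustrative only.

# Usage sketch (assumption: not part of the repository file).
from transformers import AutoConfig

from configuration_miscovery import CustomTransformerConfig

# Register the config so AutoConfig can resolve the "miscovery"
# model_type to this class.
AutoConfig.register("miscovery", CustomTransformerConfig)

# Instantiate with a smaller layout and round-trip it through
# PretrainedConfig's save/load machinery.
config = CustomTransformerConfig(d_model=512, num_heads=8, d_ff=2048)
config.save_pretrained("./miscovery-config")
reloaded = CustomTransformerConfig.from_pretrained("./miscovery-config")
assert reloaded.d_model == 512 and reloaded.use_flash_attn is False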