"""Miscovery model configuration"""

from transformers.configuration_utils import PretrainedConfig


class CustomTransformerConfig(PretrainedConfig):
    """
    Configuration class for the Miscovery custom transformer model.
    """

    model_type = "miscovery"

    def __init__(
        self,
        vocab_size=100000,
        d_model=768,
        num_heads=12,
        d_ff=3072,
        num_encoder_layers=12,
        num_decoder_layers=12,
        max_position_embeddings=2048,
        dropout=0.1,
        pad_token_id=0,
        bos_token_id=2,
        eos_token_id=3,
        use_flash_attn=False,
        **kwargs,
    ):
        """
        Initialize the configuration for the Miscovery model.

        Args:
            vocab_size: Size of the vocabulary
            d_model: Dimension of the model embeddings
            num_heads: Number of attention heads
            d_ff: Dimension of the feed-forward layer
            num_encoder_layers: Number of encoder layers
            num_decoder_layers: Number of decoder layers
            max_position_embeddings: Maximum sequence length
            dropout: Dropout rate
            pad_token_id: Token ID used for padding
            bos_token_id: Token ID marking the beginning of a sequence
            eos_token_id: Token ID marking the end of a sequence
            use_flash_attn: Whether to use flash attention
        """
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_ff = d_ff
        self.num_encoder_layers = num_encoder_layers
        self.num_decoder_layers = num_decoder_layers
        self.max_position_embeddings = max_position_embeddings
        self.dropout = dropout
        self.use_flash_attn = use_flash_attn

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )