from ..vision.siglip_config import SigLipConfig
from ..language.language_config import LanguageModelConfig


class MultiModalConfig:
    """Top-level configuration bundling a vision config and a text config.

    The constructor builds a ``SigLipConfig`` and a ``LanguageModelConfig``
    from keyword mappings and then derives a few cross-model values from
    them (see the attribute notes below).

    Args:
        vision_config: Mapping of keyword arguments forwarded to
            ``SigLipConfig``. ``None`` is treated as an empty mapping, so
            ``SigLipConfig`` is built from its own defaults.
        text_config: Mapping of keyword arguments forwarded to
            ``LanguageModelConfig`` (together with ``pad_token_id``).
            ``None`` is treated as an empty mapping.
        ignore_index: Stored as ``self.ignore_index``.
        image_token_index: Stored as ``self.image_token_index``.
        vocab_size: Accepted for interface compatibility. The stored
            ``self.vocab_size`` is always taken from the constructed text
            config, so this argument does not affect the final value.
        projection_dim: Stored as ``self.projection_dim`` and also copied
            onto the vision config.
        hidden_size: Stored as ``self.hidden_size``.
        pad_token_id: Stored as ``self.pad_token_id`` and forwarded to
            ``LanguageModelConfig``.
        **kwargs: Accepted and ignored.

    Raises:
        TypeError: If a sub-config class rejects the supplied keyword
            arguments (for example, ``text_config`` already contains a
            ``pad_token_id`` key, which would be passed twice).
    """

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        ignore_index=-100,
        image_token_index=256000,
        vocab_size=257152,
        projection_dim=2048,
        hidden_size=2048,
        pad_token_id=None,
        **kwargs,
    ):
        super().__init__()

        # The signature advertises None as a valid default; unpacking None
        # with ** raises TypeError, so normalise to an empty mapping first.
        if vision_config is None:
            vision_config = {}
        if text_config is None:
            text_config = {}

        self.ignore_index = ignore_index
        self.image_token_index = image_token_index
        self.projection_dim = projection_dim
        self.hidden_size = hidden_size
        self.is_encoder_decoder = False
        self.pad_token_id = pad_token_id

        self.vision_config = SigLipConfig(**vision_config)
        self.text_config = LanguageModelConfig(
            **text_config, pad_token_id=pad_token_id
        )

        # The text config is the source of truth for the vocabulary size.
        self.vocab_size = self.text_config.vocab_size

        # Patches per image side (image_size // patch_size), squared.
        patches_per_side = (
            self.vision_config.image_size // self.vision_config.patch_size
        )
        self.text_config.num_image_tokens = patches_per_side ** 2

        self.vision_config.projection_dim = projection_dim