# --------------------------------------------------------
# NVIDIA
# Copyright (c) 2025 NVIDIA
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import copy
from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
logger = logging.get_logger(__name__)


class Eagle2_5_VLConfig(PretrainedConfig):
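    """
    Configuration class for Eagle2.5-VL. It composes a `SiglipVisionConfig`
    for the vision tower with a `Qwen2Config` (or `LlamaConfig`) for the
    language model, together with the image-tiling, LoRA, and other
    multimodal options defined below.
    """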
model_type = 'eagle_2_5_vl'
is_composition = True
sub_configs = {"vision_config": SiglipVisionConfig, "text_config": Qwen2Config}

    def __init__(
self,
vision_config=None,
text_config=None,
use_backbone_lora=0,
use_llm_lora=0,
pad2square=False,
select_layer=-4,
force_image_size=None,
downsample_ratio=0.5,
template=None,
dynamic_image_size=False,
use_thumbnail=False,
loss_version='v1',
min_dynamic_tiles=1,
max_dynamic_tiles=6,
mlp_checkpoint=False,
initializer_range=0.02,
_attn_implementation='flash_attention_2',
_attn_implementation_autoset=False,
        llm_config=None,  # accepted but currently unused
image_token_index=None,
**kwargs):
super().__init__(**kwargs)
        if vision_config is None:
            vision_config = {'model_type': 'siglip_vision_model'}
            logger.info('vision_config is None. Initializing the SiglipVisionConfig with default values.')
        if text_config is None:
            text_config = {'architectures': ['Qwen2ForCausalLM']}
            logger.info('text_config is None. Initializing the text config with default values (`Qwen2Config`).')
if vision_config['model_type'] == 'siglip_vision_model':
self.vision_config = SiglipVisionConfig(**vision_config)
else:
raise ValueError('Unsupported model_type: {}'.format(vision_config['model_type']))
if text_config['architectures'][0] == 'LlamaForCausalLM':
self.text_config = LlamaConfig(**text_config)
elif text_config['architectures'][0] == 'Qwen2ForCausalLM':
self.text_config = Qwen2Config(**text_config)
else:
raise ValueError('Unsupported architecture: {}'.format(text_config['architectures'][0]))
self.use_backbone_lora = use_backbone_lora
self.use_llm_lora = use_llm_lora
self.mlp_checkpoint = mlp_checkpoint
self.pad2square = pad2square
self.select_layer = select_layer
self.force_image_size = force_image_size
self.downsample_ratio = downsample_ratio
self.template = template
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.loss_version = loss_version
self.initializer_range = initializer_range
self.min_dynamic_tiles = min_dynamic_tiles
self.max_dynamic_tiles = max_dynamic_tiles
self.tie_word_embeddings = self.text_config.tie_word_embeddings
self._attn_implementation = _attn_implementation
self._attn_implementation_autoset = _attn_implementation_autoset
self.image_token_index = image_token_index
logger.info(f'min_dynamic_tiles: {self.min_dynamic_tiles}')
logger.info(f'max_dynamic_tiles: {self.max_dynamic_tiles}')

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Overrides the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
        """
output = copy.deepcopy(self.__dict__)
        # Replace the nested config objects with their dict representations.
        output['vision_config'] = self.vision_config.to_dict()
output['text_config'] = self.text_config.to_dict()
output['model_type'] = self.__class__.model_type
output['use_backbone_lora'] = self.use_backbone_lora
output['use_llm_lora'] = self.use_llm_lora
output['pad2square'] = self.pad2square
output['select_layer'] = self.select_layer
output['force_image_size'] = self.force_image_size
output['downsample_ratio'] = self.downsample_ratio
output['template'] = self.template
output['dynamic_image_size'] = self.dynamic_image_size
output['use_thumbnail'] = self.use_thumbnail
output['min_dynamic_tiles'] = self.min_dynamic_tiles
output['max_dynamic_tiles'] = self.max_dynamic_tiles
output['tie_word_embeddings'] = self.tie_word_embeddings
output['_attn_implementation'] = self._attn_implementation
output['_attn_implementation_autoset'] = self._attn_implementation_autoset
return output
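

# A minimal usage sketch: constructing the config with its defaults (Siglip
# vision tower + Qwen2 language model, per the fallbacks in `__init__`) and
# round-tripping it through `to_dict`.
if __name__ == '__main__':
    config = Eagle2_5_VLConfig()
    print(type(config.vision_config).__name__)  # SiglipVisionConfig
    print(type(config.text_config).__name__)    # Qwen2Config
    print(config.to_dict()['model_type'])       # eagle_2_5_vl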