# --------------------------------------------------------
# NVIDIA
# Copyright (c) 2025 NVIDIA
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------

import copy

from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
logger = logging.get_logger(__name__)


class Eagle2_5_VLConfig(PretrainedConfig):
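    """Configuration class for the Eagle 2.5 VL model.

    Composes a SigLIP vision encoder config (`vision_config`) with a Qwen2 or
    Llama text config (`text_config`), plus the tiling, downsampling, and LoRA
    options used by the multimodal model.
    """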
    model_type = 'eagle_2_5_vl'
    is_composition = True
    sub_configs = {"vision_config": SiglipVisionConfig, "text_config": Qwen2Config}
    def __init__(
            self,
            vision_config=None,
            text_config=None,
            use_backbone_lora=0,
            use_llm_lora=0,
            pad2square=False,
            select_layer=-4,
            force_image_size=None,
            downsample_ratio=0.5,
            template=None,
            dynamic_image_size=False,
            use_thumbnail=False,
            loss_version='v1',
            min_dynamic_tiles=1,
            max_dynamic_tiles=6,
            mlp_checkpoint=False,
            initializer_range=0.02,
            _attn_implementation='flash_attention_2',
            _attn_implementation_autoset=False,
            llm_config=None,  # accepted but not used by this class
            image_token_index=None,
            **kwargs):
        super().__init__(**kwargs)

        if vision_config is None:
            vision_config = {'model_type': 'siglip_vision_model'}
            logger.info('vision_config is None. Initializing the SiglipVisionConfig with default values.')

        if text_config is None:
            text_config = {'architectures': ['Qwen2ForCausalLM']}
            logger.info('text_config is None. Initializing the text config with default values (`Qwen2ForCausalLM` architecture).')

        if vision_config['model_type'] == 'siglip_vision_model':
            self.vision_config = SiglipVisionConfig(**vision_config)
        else:
            raise ValueError('Unsupported model_type: {}'.format(vision_config['model_type']))

        if text_config['architectures'][0] == 'LlamaForCausalLM':
            self.text_config = LlamaConfig(**text_config)
        elif text_config['architectures'][0] == 'Qwen2ForCausalLM':
            self.text_config = Qwen2Config(**text_config)
        else:
            raise ValueError('Unsupported architecture: {}'.format(text_config['architectures'][0]))
        self.use_backbone_lora = use_backbone_lora
        self.use_llm_lora = use_llm_lora
        self.mlp_checkpoint = mlp_checkpoint
        self.pad2square = pad2square
        self.select_layer = select_layer
        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail = use_thumbnail
        self.loss_version = loss_version
        self.initializer_range = initializer_range
        self.min_dynamic_tiles = min_dynamic_tiles
        self.max_dynamic_tiles = max_dynamic_tiles
        self.tie_word_embeddings = self.text_config.tie_word_embeddings
        self._attn_implementation = _attn_implementation
        self._attn_implementation_autoset = _attn_implementation_autoset
        self.image_token_index = image_token_index
        logger.info(f'min_dynamic_tiles: {self.min_dynamic_tiles}')
        logger.info(f'max_dynamic_tiles: {self.max_dynamic_tiles}')

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
        """
        output = copy.deepcopy(self.__dict__)
        output['vision_config'] = self.vision_config.to_dict()
        output['text_config'] = self.text_config.to_dict()
        output['model_type'] = self.__class__.model_type
        output['use_backbone_lora'] = self.use_backbone_lora
        output['use_llm_lora'] = self.use_llm_lora
        output['pad2square'] = self.pad2square
        output['select_layer'] = self.select_layer
        output['force_image_size'] = self.force_image_size
        output['downsample_ratio'] = self.downsample_ratio
        output['template'] = self.template
        output['dynamic_image_size'] = self.dynamic_image_size
        output['use_thumbnail'] = self.use_thumbnail
        output['min_dynamic_tiles'] = self.min_dynamic_tiles
        output['max_dynamic_tiles'] = self.max_dynamic_tiles
        output['tie_word_embeddings'] = self.tie_word_embeddings
        output['_attn_implementation'] = self._attn_implementation
        output['_attn_implementation_autoset'] = self._attn_implementation_autoset

        return output
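

# Minimal usage sketch: construct the config from explicit sub-config dicts and
# round-trip it through `to_dict()`. The hidden sizes and the image token id
# below are placeholder values, not tied to any released checkpoint.
if __name__ == "__main__":
    config = Eagle2_5_VLConfig(
        vision_config={'model_type': 'siglip_vision_model', 'hidden_size': 1152},
        text_config={'architectures': ['Qwen2ForCausalLM'], 'hidden_size': 2048},
        max_dynamic_tiles=12,
        image_token_index=151667,  # placeholder; use the tokenizer's actual image token id
    )
    print(config.model_type, config.to_dict()['max_dynamic_tiles'])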