cyrilvallez (HF Staff) committed
Commit 698b586 · verified · 1 Parent(s): 0af439b

Upload folder using huggingface_hub

chat_template.jinja ADDED
@@ -0,0 +1 @@
1
+ {% for message in messages %}{{ '<|' + message['role'] + '|>' }}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<|image|>' }}{% elif content['type'] == 'audio' %}{{ '<|audio|>' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% endif %}{{ '<|end|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}
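For orientation, a minimal, hypothetical sketch (not part of this commit) of rendering the template above through the standard `apply_chat_template` API; the checkpoint id and `trust_remote_code=True` are assumptions.

```python
# Minimal sketch, assuming the template above is picked up as the repo's chat template
# and that the tokenizer loads with trust_remote_code=True (the checkpoint id is an assumption).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    },
]

# The template wraps each turn as <|role|>...<|end|>, inserts one <|image|> or <|audio|>
# placeholder per media item, and appends <|assistant|> when add_generation_prompt=True.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```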
config.json CHANGED
@@ -1,82 +1,47 @@
1
  {
2
- "_name_or_path": "Phi-4-multimodal-instruct",
3
  "architectures": [
4
- "Phi4MMForCausalLM"
5
  ],
6
  "attention_bias": false,
7
  "attention_dropout": 0.0,
8
- "audio_processor": {
9
- "config": {
10
- "activation": "swish",
11
- "activation_checkpointing": {
12
- "interval": 1,
13
- "module": "transformer",
14
- "offload": false
15
- },
16
- "attention_dim": 1024,
17
- "attention_heads": 16,
18
- "batch_norm": false,
19
- "bias_in_glu": true,
20
- "causal": true,
21
- "chunk_size": -1,
22
- "cnn_layer_norm": true,
23
- "conv_activation": "swish",
24
- "conv_glu_type": "swish",
25
- "depthwise_multiplier": 1,
26
- "depthwise_seperable_out_channel": 1024,
27
- "dropout_rate": 0.0,
28
- "encoder_embedding_config": {
29
- "input_size": 80
30
- },
31
- "ext_pw_kernel_size": 1,
32
- "ext_pw_out_channel": 1024,
33
- "input_layer": "nemo_conv",
34
- "input_size": 80,
35
- "kernel_size": 3,
36
- "left_chunk": 18,
37
- "linear_units": 1536,
38
- "nemo_conv_settings": {
39
- "conv_channels": 1024
40
- },
41
- "num_blocks": 24,
42
- "relative_attention_bias_args": {
43
- "t5_bias_max_distance": 500,
44
- "type": "t5"
45
- },
46
- "time_reduction": 8
47
- },
48
- "name": "cascades"
49
- },
50
- "auto_map": {
51
- "AutoConfig": "configuration_phi4mm.Phi4MMConfig",
52
- "AutoModelForCausalLM": "modeling_phi4mm.Phi4MMForCausalLM",
53
- "AutoTokenizer": "Xenova/gpt-4o"
54
  },
55
  "bos_token_id": 199999,
56
- "embd_layer": {
57
- "audio_embd_layer": {
58
- "compression_rate": 8,
59
- "downsample_rate": 1,
60
- "embedding_cls": "audio",
61
- "enable_gradient_checkpointing": true,
62
- "projection_cls": "mlp",
63
- "use_conv_downsample": false,
64
- "use_qformer": false
65
- },
66
- "embedding_cls": "image_audio",
67
- "image_embd_layer": {
68
- "crop_size": 448,
69
- "embedding_cls": "tune_image",
70
- "enable_gradient_checkpointing": true,
71
- "hd_transform_order": "sub_glb",
72
- "image_token_compression_cls": "avg_pool_2d",
73
- "projection_cls": "mlp",
74
- "use_hd_transform": true,
75
- "with_learnable_separator": true
76
- }
77
- },
78
  "embd_pdrop": 0.0,
79
- "eos_token_id": 199999,
80
  "full_attn_mod": 1,
81
  "hidden_act": "silu",
82
  "hidden_size": 3072,
@@ -84,21 +49,9 @@
84
  "intermediate_size": 8192,
85
  "interpolate_factor": 1,
86
  "lm_head_bias": false,
87
- "vision_lora": {
88
- "dp": 0.0,
89
- "layer": "layers.*((self_attn\\.(qkv_proj|o_proj))|(mlp\\.(gate_up|down)_proj))",
90
- "lora_alpha": 512,
91
- "r": 256
92
- },
93
- "speech_lora": {
94
- "dp": 0.01,
95
- "layer": "((layers.*self_attn\\.(qkv|o)_proj)|(layers.*mlp\\.(gate_up|down)_proj))",
96
- "lora_alpha": 640,
97
- "r": 320
98
- },
99
  "max_position_embeddings": 131072,
100
  "mlp_bias": false,
101
- "model_type": "phi4mm",
102
  "num_attention_heads": 24,
103
  "num_hidden_layers": 32,
104
  "num_key_value_heads": 8,
@@ -214,8 +167,23 @@
214
  "sliding_window": 262144,
215
  "tie_word_embeddings": true,
216
  "torch_dtype": "bfloat16",
217
- "transformers_version": "4.46.1",
218
  "use_cache": true,
219
- "vocab_size": 200064,
220
- "_attn_implementation": "flash_attention_2"
221
  }
 
1
  {
2
+ "auto_map": {
3
+ "AutoConfig": "configuration_phi4_multimodal.Phi4MultimodalConfig",
4
+ "AutoModelForCausalLM": "modeling_phi4_multimodal.Phi4MultimodalForCausalLM"
5
+ },
6
  "architectures": [
7
+ "Phi4MultimodalForCausalLM"
8
  ],
9
  "attention_bias": false,
10
  "attention_dropout": 0.0,
11
+ "audio_config": {
12
+ "activation": "swish",
13
+ "audio_token_id": 200011,
14
+ "bias_max_distance": 500,
15
+ "bias_symmetric": false,
16
+ "chunk_size": -1,
17
+ "conv_activation": "swish",
18
+ "conv_glu_type": "swish",
19
+ "depthwise_multiplier": 1,
20
+ "depthwise_seperable_out_channel": 1024,
21
+ "downsample_rate": 1,
22
+ "dropout_rate": 0.0,
23
+ "ext_pw_out_channel": 1024,
24
+ "feature_layer": -2,
25
+ "hidden_size": 1024,
26
+ "initializer_range": 0.02,
27
+ "input_size": 80,
28
+ "intermediate_size": 1536,
29
+ "kernel_size": 3,
30
+ "left_chunk": 18,
31
+ "model_type": "phi4_multimodal_audio",
32
+ "nemo_activation": "relu",
33
+ "nemo_conv_channels": 1024,
34
+ "nemo_final_size": 10,
35
+ "num_attention_heads": 16,
36
+ "num_blocks": 24,
37
+ "time_reduction": 8
38
  },
39
  "bos_token_id": 199999,
40
  "embd_pdrop": 0.0,
41
+ "eos_token_id": [
42
+ 199999,
43
+ 200020
44
+ ],
45
  "full_attn_mod": 1,
46
  "hidden_act": "silu",
47
  "hidden_size": 3072,
 
49
  "intermediate_size": 8192,
50
  "interpolate_factor": 1,
51
  "lm_head_bias": false,
52
  "max_position_embeddings": 131072,
53
  "mlp_bias": false,
54
+ "model_type": "phi4_multimodal",
55
  "num_attention_heads": 24,
56
  "num_hidden_layers": 32,
57
  "num_key_value_heads": 8,
 
167
  "sliding_window": 262144,
168
  "tie_word_embeddings": true,
169
  "torch_dtype": "bfloat16",
170
+ "transformers_version": "4.52.0.dev0",
171
  "use_cache": true,
172
+ "vision_config": {
173
+ "attention_dropout": 0.0,
174
+ "crop_size": 448,
175
+ "feature_layer": -2,
176
+ "hidden_act": "gelu_pytorch_tanh",
177
+ "hidden_size": 1152,
178
+ "image_size": 448,
179
+ "image_token_id": 200010,
180
+ "intermediate_size": 4304,
181
+ "layer_norm_eps": 1e-06,
182
+ "model_type": "phi4_multimodal_vision",
183
+ "num_attention_heads": 16,
184
+ "num_channels": 3,
185
+ "num_hidden_layers": 27,
186
+ "patch_size": 14
187
+ },
188
+ "vocab_size": 200064
189
  }
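As a sanity check on the new layout, a hedged sketch of loading this config through the `auto_map` entries above; the repo id and `trust_remote_code=True` are assumptions.

```python
# Hedged sketch: AutoConfig resolves configuration_phi4_multimodal.Phi4MultimodalConfig
# via the "auto_map" above when trust_remote_code=True; the repo id is an assumption.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("microsoft/Phi-4-multimodal-instruct", trust_remote_code=True)
print(type(config).__name__)             # Phi4MultimodalConfig
print(config.vision_config.hidden_size)  # 1152, from "vision_config" above
print(config.audio_config.hidden_size)   # 1024, from "audio_config" above
```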
configuration_phi4_multimodal.py ADDED
@@ -0,0 +1,484 @@
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_phi4_multimodal.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # Copyright 2025 Microsoft and the HuggingFace Inc. team. All rights reserved.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ import math
22
+
23
+ from transformers.configuration_utils import PretrainedConfig
24
+
25
+
26
+ class Phi4MultimodalVisionConfig(PretrainedConfig):
27
+ r"""
28
+ This is the configuration class to store the configuration of a [`Phi4MultimodalVisionModel`]. It is used to instantiate a
29
+ Phi4Multimodal vision encoder according to the specified arguments, defining the model architecture. Instantiating a
30
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of
31
+ [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) architecture.
32
+
33
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
34
+ documentation from [`PretrainedConfig`] for more information.
35
+
36
+ Args:
37
+ hidden_size (`int`, *optional*, defaults to 1152):
38
+ Dimensionality of the encoder layers and the pooler layer.
39
+ intermediate_size (`int`, *optional*, defaults to 4304):
40
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
41
+ num_hidden_layers (`int`, *optional*, defaults to 27):
42
+ Number of hidden layers in the Transformer encoder.
43
+ num_attention_heads (`int`, *optional*, defaults to 16):
44
+ Number of attention heads for each attention layer in the Transformer encoder.
45
+ num_channels (`int`, *optional*, defaults to 3):
46
+ Number of channels in the input images.
47
+ image_size (`int`, *optional*, defaults to 448):
48
+ The size (resolution) of each image.
49
+ patch_size (`int`, *optional*, defaults to 14):
50
+ The size (resolution) of each patch.
51
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
52
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
53
+ `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
54
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
55
+ The epsilon used by the layer normalization layers.
56
+ attention_dropout (`float`, *optional*, defaults to 0.0):
57
+ The dropout ratio for the attention probabilities.
58
+ crop_size (`int`, *optional*, defaults to 448):
59
+ Crop size for the input images.
60
+ image_token_id (`int`, *optional*, defaults to 200010):
61
+ The image token id.
62
+ feature_layer (`int`, *optional*, defaults to -2):
63
+ The index of the layer of the encoder from which to extract image features.
64
+
65
+ Example:
66
+
67
+ ```python
68
+ >>> from transformers import Phi4MultimodalVisionConfig
69
+
70
+ >>> # Initializing a Phi4MultimodalVisionConfig with microsoft/Phi-4-multimodal-instruct style configuration
71
+ >>> configuration = Phi4MultimodalVisionConfig()
72
+ ```"""
73
+
74
+ model_type = "phi4_multimodal_vision"
75
+ base_config_key = "vision_config"
76
+
77
+ def __init__(
78
+ self,
79
+ hidden_size=1152,
80
+ intermediate_size=4304,
81
+ num_hidden_layers=27,
82
+ num_attention_heads=16,
83
+ num_channels=3,
84
+ image_size=448,
85
+ patch_size=14,
86
+ hidden_act="gelu_pytorch_tanh",
87
+ layer_norm_eps=1e-6,
88
+ attention_dropout=0.0,
89
+ crop_size: int = 448,
90
+ image_token_id: int = 200010,
91
+ feature_layer: int = -2,
92
+ **kwargs,
93
+ ):
94
+ super().__init__(**kwargs)
95
+
96
+ self.hidden_size = hidden_size
97
+ self.intermediate_size = intermediate_size
98
+ self.num_hidden_layers = num_hidden_layers
99
+ self.num_attention_heads = num_attention_heads
100
+ self.num_channels = num_channels
101
+ self.patch_size = patch_size
102
+ self.image_size = image_size
103
+ self.attention_dropout = attention_dropout
104
+ self.layer_norm_eps = layer_norm_eps
105
+ self.hidden_act = hidden_act
106
+ self.crop_size = crop_size
107
+ self.image_token_id = image_token_id
108
+ self.feature_layer = feature_layer
109
+
110
+
111
+ class Phi4MultimodalAudioConfig(PretrainedConfig):
112
+ r"""
113
+ This is the configuration class to store the configuration of a [`Phi4MultimodalAudioModel`]. It is used to instantiate a
114
+ Phi4Multimodal audio encoder according to the specified arguments, defining the model architecture. Instantiating a
115
+ configuration with the defaults will yield a similar configuration to that of the audio encoder of
116
+ [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) architecture.
117
+
118
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
119
+ documentation from [`PretrainedConfig`] for more information.
120
+
121
+ Args:
122
+ hidden_size (`int`, *optional*, defaults to 1024):
123
+ Dimensionality of the encoder layers.
124
+ intermediate_size (`int`, *optional*, defaults to 1536):
125
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
126
+ num_blocks (`int`, *optional*, defaults to 24):
127
+ Number of hidden layers in the Transformer encoder.
128
+ num_attention_heads (`int`, *optional*, defaults to 16):
129
+ Number of attention heads for each attention layer in the Transformer encoder.
130
+ activation (`str`, *optional*, defaults to `"swish"`):
131
+ The non-linear activation function in the MLPs.
132
+ chunk_size (`int`, *optional*, defaults to -1):
133
+ The chunk size to create the masks.
134
+ left_chunk (`int`, *optional*, defaults to 18):
135
+ The left chunk to create the masks.
136
+ dropout_rate (`float`, *optional*, defaults to 0.0):
137
+ The dropout ratio.
138
+ ext_pw_out_channel (`int`, *optional*, defaults to 1024):
139
+ Number of out channels in the point-wise conv modules.
140
+ depthwise_seperable_out_channel (`int`, *optional*, defaults to 1024):
141
+ Number of out channels in the depth-wise separable conv modules.
142
+ depthwise_multiplier (`int`, *optional*, defaults to 1):
143
+ Input size multiplier for the depth-wise separable conv modules.
144
+ kernel_size (`int`, *optional*, defaults to 3):
145
+ Kernel size for the depth-wise separable conv modules.
146
+ conv_activation (`str`, *optional*, defaults to `"swish"`):
147
+ The non-linear activation function in the conv modules.
148
+ input_size (`int`, *optional*, defaults to 80):
149
+ Input size for the audio model.
150
+ conv_glu_type (`str`, *optional*, defaults to `"swish"`):
151
+ The non-linear activation function in the point-wise conv modules.
152
+ time_reduction (`int`, *optional*, defaults to 8):
153
+ Time reduction (subsampling factor).
154
+ bias_max_distance (`int`, *optional*, defaults to 1000):
155
+ Max distance for the relative attention bias module.
156
+ bias_symmetric (`bool`, *optional*, defaults to `False`):
157
+ Whether the relative attention bias should be symmetric or not.
158
+ nemo_activation (`str`, *optional*, defaults to `"relu"`):
159
+ The non-linear activation function in the nemo conv modules.
160
+ nemo_conv_channels (`int`, *optional*, defaults to 1024):
161
+ Number of channels in the nemo conv modules.
162
+ downsample_rate (`int`, *optional*, defaults to 1):
163
+ Downsample rate for the audio feature extractor.
164
+ initializer_range (`float`, *optional*, defaults to 0.02):
165
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
166
+ audio_token_id (`int`, *optional*, defaults to 200011):
167
+ The audio token id.
168
+ feature_layer (`int`, *optional*, defaults to -2):
169
+ The index of the layer of the encoder from which to extract audio features.
170
+
171
+ Example:
172
+
173
+ ```python
174
+ >>> from transformers import Phi4MultimodalAudioConfig
175
+
176
+ >>> # Initializing a Phi4MultimodalAudioConfig with microsoft/Phi-4-multimodal-instruct style configuration
177
+ >>> configuration = Phi4MultimodalAudioConfig()
178
+ ```"""
179
+
180
+ model_type = "phi4_multimodal_audio"
181
+
182
+ def __init__(
183
+ self,
184
+ hidden_size: int = 1024,
185
+ intermediate_size: int = 1536,
186
+ num_blocks: int = 24,
187
+ num_attention_heads: int = 16,
188
+ activation: str = "swish",
189
+ chunk_size: int = -1,
190
+ left_chunk: int = 18,
191
+ dropout_rate: float = 0.0,
192
+ ext_pw_out_channel: int = 1024,
193
+ depthwise_seperable_out_channel: int = 1024,
194
+ depthwise_multiplier: int = 1,
195
+ kernel_size: int = 3,
196
+ conv_activation: str = "swish",
197
+ input_size: int = 80,
198
+ conv_glu_type: str = "swish",
199
+ time_reduction: int = 8,
200
+ bias_max_distance: int = 1000,
201
+ bias_symmetric: bool = False,
202
+ nemo_activation: str = "relu",
203
+ nemo_conv_channels: int = 1024,
204
+ downsample_rate: int = 1,
205
+ initializer_range: float = 0.02,
206
+ audio_token_id: int = 200011,
207
+ feature_layer: int = -2,
208
+ **kwargs,
209
+ ):
210
+ super().__init__(**kwargs)
211
+ self.hidden_size = hidden_size
212
+ self.num_attention_heads = num_attention_heads
213
+ self.intermediate_size = intermediate_size
214
+ self.activation = activation
215
+ self.chunk_size = chunk_size
216
+ self.left_chunk = left_chunk
217
+ self.num_blocks = num_blocks
218
+ self.dropout_rate = dropout_rate
219
+ self.ext_pw_out_channel = ext_pw_out_channel
220
+ self.depthwise_seperable_out_channel = depthwise_seperable_out_channel
221
+ self.depthwise_multiplier = depthwise_multiplier
222
+ self.kernel_size = kernel_size
223
+ self.conv_activation = conv_activation
224
+ self.input_size = input_size
225
+ self.conv_glu_type = conv_glu_type
226
+ self.time_reduction = time_reduction
227
+ self.bias_max_distance = bias_max_distance
228
+ self.bias_symmetric = bias_symmetric
229
+ self.nemo_activation = nemo_activation
230
+ self.nemo_conv_channels = nemo_conv_channels
231
+ self.downsample_rate = downsample_rate
232
+ self.audio_token_id = audio_token_id
233
+ self.initializer_range = initializer_range
234
+ self.feature_layer = feature_layer
235
+
236
+ if time_reduction % 2 != 0:
237
+ raise ValueError("`time_reduction` should be a multiple of 2!")
238
+ length = input_size
239
+ for _ in range(int(math.log(time_reduction, 2))):
240
+ length = math.floor((length - 1) / 2 + 1)
241
+ self.nemo_final_size = length
242
+
243
+
244
+ class Phi4MultimodalConfig(PretrainedConfig):
245
+ r"""
246
+ This is the configuration class to store the configuration of a [`Phi4MultimodalModel`]. It is used to instantiate a
247
+ Phi4Multimodal model according to the specified arguments, defining the model architecture. Instantiating a configuration
248
+ with the defaults will yield a similar configuration to that of the
249
+ [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) architecture.
250
+
251
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
252
+ documentation from [`PretrainedConfig`] for more information.
253
+
254
+ Args:
255
+ vocab_size (`int`, *optional*, defaults to 200064):
256
+ Vocabulary size of the Phi4Multimodal model. Defines the number of different tokens that can be represented by the
257
+ `input_ids` passed when calling [`Phi4MultimodalModel`].
258
+ hidden_size (`int`, *optional*, defaults to 3072):
259
+ Dimension of the hidden representations.
260
+ intermediate_size (`int`, *optional*, defaults to 8192):
261
+ Dimension of the MLP representations.
262
+ num_hidden_layers (`int`, *optional*, defaults to 32):
263
+ Number of hidden layers in the Transformer decoder.
264
+ num_attention_heads (`int`, *optional*, defaults to 32):
265
+ Number of attention heads for each attention layer in the Transformer decoder.
266
+ num_key_value_heads (`int`, *optional*, defaults to 8):
267
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
268
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
269
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
270
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
271
+ by meanpooling all the original heads within that group. For more details checkout [this
272
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
273
+ `num_attention_heads`.
274
+ resid_pdrop (`float`, *optional*, defaults to 0.0):
275
+ Dropout probability for mlp outputs.
276
+ embd_pdrop (`int`, *optional*, defaults to 0.0):
277
+ The dropout ratio for the embeddings.
278
+ attention_dropout (`float`, *optional*, defaults to 0.0):
279
+ The dropout ratio after computing the attention scores.
280
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
281
+ The non-linear activation function (function or string) in the decoder.
282
+ max_position_embeddings (`int`, *optional*, defaults to 131072):
283
+ The maximum sequence length that this model might ever be used with.
284
+ initializer_range (`float`, *optional*, defaults to 0.02):
285
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
286
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
287
+ The epsilon value used for the RMSNorm.
288
+ use_cache (`bool`, *optional*, defaults to `True`):
289
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
290
+ relevant if `config.is_decoder=True`.
291
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
292
+ Whether to tie weight embeddings
293
+ rope_theta (`float`, *optional*, defaults to 10000.0):
294
+ The base period of the RoPE embeddings.
295
+ rope_scaling (`dict`, *optional*):
296
+ The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
297
+ contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and
298
+ the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
299
+ divided by the number of attention heads divided by 2.
300
+ partial_rotary_factor (`float`, *optional*, defaults to `1.0`):
301
+ Percentage of the query and keys which will have rotary embedding. Must be between 0.0 and 1.0.
302
+ bos_token_id (`int`, *optional*, defaults to 199999):
303
+ The id of the "beginning-of-sequence" token.
304
+ eos_token_id (`int` or `list[int]`, *optional*, defaults to `[199999, 200020]`):
305
+ The id of the "end-of-sequence" token.
306
+ pad_token_id (`int`, *optional*, defaults to 199999):
307
+ The id of the padding token.
308
+ original_max_position_embeddings (`int`, *optional*, defaults to 4096):
309
+ The maximum sequence length that this model was trained with. This is used to determine the size of the
310
+ original RoPE embeddings when using long scaling.
311
+ sliding_window (`int`, *optional*):
312
+ Sliding window attention window size. If `None`, no sliding window is applied.
313
+ vision_config (`Phi4MultimodalVisionConfig` or `dict`, *optional*):
314
+ The vision config for the underlying image embedding model. If not provided, will default to the configuration
315
+ used to instantiate a model similar in architecture as
316
+ [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct).
317
+ audio_config (`Phi4MultimodalAudioConfig` or `dict`, *optional*):
318
+ The audio config for the underlying audio embedding model. If not provided, will default to the configuration
319
+ used to instantiate a model similar in architecture as
320
+ [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct).
321
+
322
+ Example:
323
+
324
+ ```python
325
+ >>> from transformers import Phi4MultimodalModel, Phi4MultimodalConfig
326
+
327
+ >>> # Initializing a Phi4Multimodal style configuration
328
+ >>> configuration = Phi4MultimodalConfig.from_pretrained("microsoft/Phi-4-multimodal-instruct")
329
+
330
+ >>> # Initializing a model from the configuration
331
+ >>> model = Phi4MultimodalModel(configuration)
332
+
333
+ >>> # Accessing the model configuration
334
+ >>> configuration = model.config
335
+ ```"""
336
+
337
+ model_type = "phi4_multimodal"
338
+ keys_to_ignore_at_inference = ["past_key_values"]
339
+ base_model_tp_plan = {
340
+ "layers.*.self_attn.qkv_proj": "colwise_rep", # we need to replicate here due to the slicing of qkv
341
+ "layers.*.self_attn.o_proj": "rowwise_rep", # we need to replicate here due to the slicing of qkv
342
+ "layers.*.mlp.gate_up_proj": "colwise_rep", # we need to replicate here due to the `chunk` operation
343
+ "layers.*.mlp.down_proj": "rowwise_rep", # we need to replicate here due to the `chunk` operation
344
+ }
345
+ base_model_pp_plan = {
346
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
347
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
348
+ "norm": (["hidden_states"], ["hidden_states"]),
349
+ }
350
+
351
+ sub_configs = {"audio_config": Phi4MultimodalAudioConfig, "vision_config": Phi4MultimodalVisionConfig}
352
+
353
+ def __init__(
354
+ self,
355
+ vocab_size=200064,
356
+ hidden_size=3072,
357
+ intermediate_size=8192,
358
+ num_hidden_layers=32,
359
+ num_attention_heads=32,
360
+ num_key_value_heads=8,
361
+ resid_pdrop=0.0,
362
+ embd_pdrop=0.0,
363
+ attention_dropout=0.0,
364
+ hidden_act="silu",
365
+ max_position_embeddings=131072,
366
+ initializer_range=0.02,
367
+ rms_norm_eps=1e-5,
368
+ use_cache=True,
369
+ tie_word_embeddings=False,
370
+ rope_theta=10000.0,
371
+ rope_scaling=None,
372
+ partial_rotary_factor=1,
373
+ bos_token_id=199999,
374
+ eos_token_id=[199999, 200020],
375
+ pad_token_id=199999,
376
+ original_max_position_embeddings=4096,
377
+ sliding_window=None,
378
+ vision_config=None,
379
+ audio_config=None,
380
+ **kwargs,
381
+ ):
382
+ super().__init__(
383
+ bos_token_id=bos_token_id,
384
+ eos_token_id=eos_token_id,
385
+ pad_token_id=pad_token_id,
386
+ tie_word_embeddings=tie_word_embeddings,
387
+ **kwargs,
388
+ )
389
+ self.vocab_size = vocab_size
390
+ self.hidden_size = hidden_size
391
+ self.intermediate_size = intermediate_size
392
+ self.num_hidden_layers = num_hidden_layers
393
+ self.num_attention_heads = num_attention_heads
394
+
395
+ if num_key_value_heads is None:
396
+ num_key_value_heads = num_attention_heads
397
+
398
+ self.num_key_value_heads = num_key_value_heads
399
+ self.resid_pdrop = resid_pdrop
400
+ self.embd_pdrop = embd_pdrop
401
+ self.attention_dropout = attention_dropout
402
+ self.hidden_act = hidden_act
403
+ self.max_position_embeddings = max_position_embeddings
404
+ self.original_max_position_embeddings = original_max_position_embeddings
405
+ self.initializer_range = initializer_range
406
+ self.rms_norm_eps = rms_norm_eps
407
+ self.use_cache = use_cache
408
+ self.rope_theta = rope_theta
409
+ self.rope_scaling = rope_scaling
410
+ self.partial_rotary_factor = partial_rotary_factor
411
+ self._rope_scaling_adjustment()
412
+ self._rope_scaling_validation()
413
+ self.sliding_window = sliding_window
414
+
415
+ if isinstance(vision_config, dict):
416
+ vision_config = Phi4MultimodalVisionConfig(**vision_config)
417
+ elif vision_config is None:
418
+ vision_config = Phi4MultimodalVisionConfig()
419
+ self.vision_config = vision_config
420
+
421
+ if isinstance(audio_config, dict):
422
+ audio_config = Phi4MultimodalAudioConfig(**audio_config)
423
+ elif audio_config is None:
424
+ audio_config = Phi4MultimodalAudioConfig()
425
+ self.audio_config = audio_config
426
+
427
+ def _rope_scaling_adjustment(self):
428
+ """
429
+ Adjust the `type` of the `rope_scaling` configuration for backward compatibility.
430
+ """
431
+ if self.rope_scaling is None:
432
+ return
433
+
434
+ rope_scaling_type = self.rope_scaling.get("type", None)
435
+
436
+ # For backward compatibility if previous version used "su" or "yarn"
437
+ if rope_scaling_type is not None and rope_scaling_type in ["su", "yarn"]:
438
+ self.rope_scaling["type"] = "longrope"
439
+
440
+ def _rope_scaling_validation(self):
441
+ """
442
+ Validate the `rope_scaling` configuration.
443
+ """
444
+ if self.rope_scaling is None:
445
+ return
446
+
447
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
448
+ raise ValueError(
449
+ "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
450
+ f"got {self.rope_scaling}"
451
+ )
452
+ rope_scaling_type = self.rope_scaling.get("type", None)
453
+ rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
454
+ rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
455
+ if rope_scaling_type is None or rope_scaling_type not in ["longrope"]:
456
+ raise ValueError(f"`rope_scaling`'s type field must be one of ['longrope'], got {rope_scaling_type}")
457
+ if not (
458
+ isinstance(rope_scaling_short_factor, list)
459
+ and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
460
+ ):
461
+ raise ValueError(
462
+ f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
463
+ )
464
+ rotary_ndims = int(self.hidden_size // self.num_attention_heads * self.partial_rotary_factor)
465
+ if not len(rope_scaling_short_factor) == rotary_ndims // 2:
466
+ raise ValueError(
467
+ f"`rope_scaling`'s short_factor field must have length {rotary_ndims // 2}, got {len(rope_scaling_short_factor)}"
468
+ )
469
+ if not (
470
+ isinstance(rope_scaling_long_factor, list)
471
+ and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
472
+ ):
473
+ raise ValueError(
474
+ f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
475
+ )
476
+ if not len(rope_scaling_long_factor) == rotary_ndims // 2:
477
+ raise ValueError(
478
+ f"`rope_scaling`'s long_factor field must have length {rotary_ndims // 2}, got {len(rope_scaling_long_factor)}"
479
+ )
480
+
481
+
482
+ __all__ = ["Phi4MultimodalVisionConfig", "Phi4MultimodalAudioConfig", "Phi4MultimodalConfig"]
483
+
484
+ Phi4MultimodalConfig.register_for_auto_class()
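One detail worth spelling out: `Phi4MultimodalAudioConfig.__init__` derives `nemo_final_size` from `input_size` and `time_reduction`. A small illustration (not part of the commit) using the defaults that also appear in the config.json above:

```python
# Each of the log2(time_reduction) subsampling stages maps length -> floor((length - 1) / 2 + 1).
# With input_size=80 and time_reduction=8 (three stages): 80 -> 40 -> 20 -> 10,
# matching "nemo_final_size": 10 in the config.json earlier in this commit.
import math

length, time_reduction = 80, 8
for _ in range(int(math.log(time_reduction, 2))):
    length = math.floor((length - 1) / 2 + 1)
print(length)  # 10
```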
feature_extraction_phi4_multimodal.py ADDED
@@ -0,0 +1,353 @@
1
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Processor class for Phi4Multimodal
17
+ """
18
+
19
+ from typing import Optional, Union, List, Tuple
20
+
21
+ import numpy as np
22
+
23
+ from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
24
+ from transformers.image_processing_utils import BatchFeature
25
+ from transformers.utils import TensorType, is_torch_available, logging
26
+
27
+
28
+ if is_torch_available():
29
+ import torch
30
+
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+ AudioInput = Union[
35
+ np.ndarray, "torch.Tensor", List[np.ndarray], Tuple[np.ndarray], List["torch.Tensor"], Tuple["torch.Tensor"] # noqa: F821
36
+ ]
37
+
38
+
39
+ # TODO: @eustlb, remove this once #36603 is merged.
40
+ def speechlib_mel(sample_rate, n_fft, n_mels, fmin=None, fmax=None):
41
+ """Create a Mel filter-bank the same as SpeechLib FbankFC.
42
+
43
+ Args:
44
+ sample_rate (int): Sample rate in Hz. number > 0 [scalar]
45
+ n_fft (int): FFT size. int > 0 [scalar]
46
+ n_mels (int): Mel filter size. int > 0 [scalar]
47
+ fmin (float): lowest frequency (in Hz). If None use 0.0.
48
+ float >= 0 [scalar]
49
+ fmax: highest frequency (in Hz). If None use sample_rate / 2.
50
+ float >= 0 [scalar]
51
+
52
+ Returns
53
+ out (numpy.ndarray): Mel transform matrix
54
+ [shape=(n_mels, 1 + n_fft/2)]
55
+ """
56
+
57
+ bank_width = int(n_fft // 2 + 1)
58
+ if fmax is None:
59
+ fmax = sample_rate / 2
60
+ if fmin is None:
61
+ fmin = 0
62
+ assert fmin >= 0, "fmin cannot be negative"
63
+ assert fmin < fmax <= sample_rate / 2, "fmax must be between (fmin, samplerate / 2]"
64
+
65
+ def mel(f):
66
+ return 1127.0 * np.log(1.0 + f / 700.0)
67
+
68
+ def bin2mel(fft_bin):
69
+ return 1127.0 * np.log(1.0 + fft_bin * sample_rate / (n_fft * 700.0))
70
+
71
+ def f2bin(f):
72
+ return int((f * n_fft / sample_rate) + 0.5)
73
+
74
+ # Spec 1: FFT bin range [f2bin(fmin) + 1, f2bin(fmax) - 1]
75
+ klo = f2bin(fmin) + 1
76
+ khi = f2bin(fmax)
77
+
78
+ khi = max(khi, klo)
79
+
80
+ # Spec 2: SpeechLib uses triangles in Mel space
81
+ mlo = mel(fmin)
82
+ mhi = mel(fmax)
83
+ m_centers = np.linspace(mlo, mhi, n_mels + 2)
84
+ ms = (mhi - mlo) / (n_mels + 1)
85
+
86
+ matrix = np.zeros((n_mels, bank_width), dtype=np.float32)
87
+ for m in range(0, n_mels):
88
+ left = m_centers[m]
89
+ center = m_centers[m + 1]
90
+ right = m_centers[m + 2]
91
+ for fft_bin in range(klo, khi):
92
+ mbin = bin2mel(fft_bin)
93
+ if left < mbin < right:
94
+ matrix[m, fft_bin] = 1.0 - abs(center - mbin) / ms
95
+
96
+ return matrix
97
+
98
+
99
+ class Phi4MultimodalFeatureExtractor(SequenceFeatureExtractor):
100
+ model_input_names = ["audio_input_features", "audio_embed_sizes", "audio_attention_mask"]
101
+
102
+ def __init__(
103
+ self,
104
+ feature_size: int = 80,
105
+ sampling_rate: int = 16000,
106
+ hop_length: int = 160,
107
+ n_fft: int = 512,
108
+ win_length: int = 400,
109
+ preemphasis: float = 0.97,
110
+ padding_value: float = 0.0,
111
+ audio_compression_rate: int = 8,
112
+ audio_downsample_rate: int = 1,
113
+ audio_feat_stride: int = 1,
114
+ mel_min_frequency: float = 0,
115
+ mel_max_frequency: float = 7690,
116
+ **kwargs,
117
+ ):
118
+ super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
119
+
120
+ self.hop_length = hop_length
121
+ self.n_fft = n_fft
122
+ self.win_length = win_length
123
+ self.preemphasis = preemphasis
124
+ self.padding_value = padding_value
125
+ self.audio_compression_rate = audio_compression_rate
126
+ self.audio_downsample_rate = audio_downsample_rate
127
+ self.audio_feat_stride = audio_feat_stride
128
+
129
+ # TODO: @eustlb, uncomment and remove speechlib_mel once #36603 is merged.
130
+ # self.mel_filters = mel_filter_bank(
131
+ # num_frequency_bins=self.n_fft // 2 + 1,
132
+ # num_mel_filters=self.feature_size,
133
+ # min_frequency=mel_min_frequency,
134
+ # max_frequency=mel_max_frequency,
135
+ # sampling_rate=self.sampling_rate,
136
+ # triangularize_in_mel_space=True,
137
+ # mel_scale="kaldi",
138
+ # )
139
+ self.mel_filters = speechlib_mel(
140
+ self.sampling_rate, self.n_fft, self.feature_size, mel_min_frequency, mel_max_frequency
141
+ ).T
142
+
143
+ def __call__(
144
+ self,
145
+ raw_speech: AudioInput,
146
+ sampling_rate: Optional[int] = None,
147
+ pad_to_multiple_of: Optional[int] = None,
148
+ padding: Optional[str] = "longest",
149
+ max_length: Optional[int] = None,
150
+ truncation: bool = False,
151
+ return_tensors: Optional[Union[str, TensorType]] = None,
152
+ return_attention_mask: Optional[bool] = True,
153
+ device: Optional[str] = "cpu",
154
+ **kwargs,
155
+ ) -> BatchFeature:
156
+ """
157
+ Main method to featurize and prepare for the model one or several audio sequence(s). Implementation uses PyTorch for
158
+ the STFT computation if available, otherwise a slower NumPy based one.
159
+
160
+ Args:
161
+ raw_speech (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
162
+ The sequence or batch of sequences to be processed. Each sequence can be a numpy array or PyTorch tensor.
163
+ For batched inputs, sequences can be a list of numpy arrays or PyTorch tensors, or a single numpy array or
164
+ PyTorch tensor with first dimension being the batch size.
165
+ sampling_rate (`int`, *optional*):
166
+ The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
167
+ `sampling_rate` at the forward call to prevent silent errors.
168
+ pad_to_multiple_of (`int`, *optional*, defaults to None):
169
+ If set will pad the sequence to a multiple of the provided value.
170
+ padding (`str`, *optional*, defaults to "longest"):
171
+ Padding strategy. Can be "longest" to pad to the longest sequence in the batch, or a specific length.
172
+ max_length (`int`, *optional*):
173
+ Maximum length of the returned list and optionally padding length.
174
+ truncation (`bool`, *optional*, defaults to False):
175
+ Activates truncation to cut input sequences longer than *max_length* to *max_length*.
176
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
177
+ If set, will return tensors instead of numpy arrays. Acceptable values are:
178
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
179
+ - `'np'`: Return Numpy `np.ndarray` objects.
180
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
181
+ return_attention_mask (`bool`, *optional*, defaults to `True`):
182
+ Whether to return the extracted audio input features' attention mask.
183
+ device (`str`, *optional*, defaults to "cpu"):
184
+ Specifies the device for computation of the audio features. (e.g., "cpu", "cuda")
185
+
186
+ Returns:
187
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
188
+ - **audio_input_features** -- Audio features extracted from the raw audio input, shape (batch_size, max_feature_length, feature_size).
189
+ - **audio_lengths** -- Length of each audio sample in the batch, shape (batch_size,).
190
+ - **audio_attention_mask** -- Attention mask for the audio input, shape (batch_size, max_feature_length).
191
+ If `return_tensors` is not specified, the fields will be PyTorch tensors if PyTorch is available, otherwise NumPy arrays.
192
+ """
193
+ if sampling_rate is not None:
194
+ if sampling_rate != self.sampling_rate:
195
+ raise ValueError(
196
+ f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
197
+ f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
198
+ f" was sampled with {self.sampling_rate} and not {sampling_rate}."
199
+ )
200
+ else:
201
+ logger.warning(
202
+ f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
203
+ "Failing to do so can result in silent errors that might be hard to debug."
204
+ )
205
+
206
+ # Convert to torch tensor
207
+ if isinstance(raw_speech, np.ndarray):
208
+ raw_speech = torch.tensor(raw_speech)
209
+ elif isinstance(raw_speech, (list, tuple)) and isinstance(raw_speech[0], np.ndarray):
210
+ raw_speech = [torch.tensor(speech) for speech in raw_speech]
211
+
212
+ is_batched_torch = isinstance(raw_speech, torch.Tensor) and len(raw_speech.shape) > 1
213
+ if is_batched_torch and len(raw_speech.shape) > 2:
214
+ logger.warning(
215
+ f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
216
+ "We will take the mean of the channels to convert to mono."
217
+ )
218
+ raw_speech = raw_speech.mean(-1)
219
+
220
+ is_batched_sequence = isinstance(raw_speech, (list, tuple))
221
+ if is_batched_sequence:
222
+ for speech in raw_speech:
223
+ if len(speech.shape) > 1:
224
+ logger.warning(
225
+ f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
226
+ "We will take the mean of the channels to convert to mono."
227
+ )
228
+ speech = speech.mean(-1)
229
+
230
+ if is_batched_torch or is_batched_sequence:
231
+ raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]
232
+ else:
233
+ raw_speech = [raw_speech[:, None].to(torch.float32)]
234
+
235
+ audio_lengths = [len(speech) for speech in raw_speech]
236
+
237
+ # convert into correct format for padding
238
+ batched_speech = BatchFeature(data={"audio_input_features": raw_speech, "audio_lengths": audio_lengths})
239
+ padded_inputs = self.pad(
240
+ batched_speech,
241
+ padding=padding,
242
+ max_length=max_length,
243
+ truncation=truncation,
244
+ pad_to_multiple_of=pad_to_multiple_of,
245
+ return_tensors="pt",
246
+ )
247
+ input_features = padded_inputs.audio_input_features.squeeze(-1)
248
+ audio_lengths = padded_inputs.audio_lengths
249
+
250
+ input_features = self._torch_extract_fbank_features(input_features, audio_lengths, device)
251
+
252
+ feature_lengths = (audio_lengths - self.win_length) // self.hop_length + 1
253
+ feature_lengths = feature_lengths * self.audio_feat_stride
254
+ audio_embed_sizes = self._compute_audio_embed_size(feature_lengths)
255
+
256
+ feature_attention_mask = (
257
+ torch.arange(0, feature_lengths.max()) if is_torch_available() else np.arange(0, feature_lengths.max())
258
+ )
259
+ feature_attention_mask = (
260
+ feature_attention_mask[None, :] < feature_lengths[:, None] if len(feature_lengths) > 1 else None
261
+ )
262
+
263
+ data = {
264
+ "audio_input_features": input_features,
265
+ "audio_embed_sizes": audio_embed_sizes,
266
+ }
267
+ if feature_attention_mask is not None and return_attention_mask:
268
+ data["audio_attention_mask"] = feature_attention_mask
269
+
270
+ return BatchFeature(data=data, tensor_type=return_tensors)
271
+
272
+ # TODO: @eustlb, move this to audio_utils in a general spectrogram_batch function that handles torch and numpy
273
+ def _torch_extract_fbank_features(
274
+ self, waveform: "torch.FloatTensor", audio_lengths: "torch.Tensor", device: str = "cpu"
275
+ ) -> "torch.FloatTensor":
276
+ """
277
+ Compute the log mel-scaled spectrogram of batched waveforms using PyTorch's FFT implementation.
278
+
279
+ Args:
280
+ waveform (`torch.FloatTensor` of shape `(batch_size, max_audio_length)`):
281
+ The batched waveforms.
282
+ audio_lengths (`torch.Tensor` of shape `(batch_size,)`):
283
+ The lengths of the waveforms along the max_audio_length dimension.
284
+ device (`str`, *optional*, defaults to "cpu"):
285
+ The device to run the computation on. (e.g., "cpu", "cuda")
286
+
287
+ Returns:
288
+ `torch.FloatTensor` of shape `(batch_size, max_feature_length, feature_size)`:
289
+ The log mel-scaled spectrogram of the batched waveforms.
290
+ """
291
+ fft_window = torch.hamming_window(self.win_length, periodic=False, device=device, dtype=torch.float64)
292
+
293
+ # batched implementation
294
+ batch_size = waveform.shape[0]
295
+ frames = waveform.unfold(-1, self.win_length, self.hop_length)
296
+
297
+ # ---
298
+ # the unbatched (and unpadded) original implementation skips the last few audio values that can't be included in a frame
299
+ # we need to ensure that the corresponding frames for the padded input also mask these values
300
+ if batch_size > 1:
301
+ frames = frames.clone()
302
+ # concerned batch indices
303
+ to_mask_batch_idxs = torch.arange(batch_size)[audio_lengths != audio_lengths.max()]
304
+ if to_mask_batch_idxs.numel() > 0:
305
+ batch_idxs_down = (audio_lengths[to_mask_batch_idxs] - self.win_length) // self.hop_length + 1
306
+ batch_idxs_up = audio_lengths[to_mask_batch_idxs] // self.hop_length + 1
307
+ offset_idx = batch_idxs_down.min()
308
+ max_idx = batch_idxs_up.max()
309
+
310
+ mask = torch.arange(max_idx - offset_idx, device=device).expand(to_mask_batch_idxs.shape[0], -1)
311
+ mask = ((batch_idxs_down - offset_idx).unsqueeze(1) <= mask) & (
312
+ mask < (batch_idxs_up - offset_idx).unsqueeze(1)
313
+ )
314
+ mask = mask.unsqueeze(-1).expand(-1, -1, self.win_length)
315
+ masked_frames = frames[to_mask_batch_idxs, offset_idx:max_idx].masked_fill_(mask, 0)
316
+ frames[to_mask_batch_idxs, offset_idx:max_idx] = masked_frames
317
+ # ---
318
+
319
+ # apply pre-emphasis first order filter on fft windows
320
+ frames_prev = torch.roll(frames, 1, dims=-1)
321
+ frames_prev[:, :, 0] = frames_prev[:, :, 1]
322
+ frames = (frames - self.preemphasis * frames_prev) * 32768
323
+
324
+ # apply fft
325
+ S = torch.fft.rfft(fft_window * frames.view(-1, self.win_length), n=self.n_fft, dim=1)
326
+ S = S.view(frames.shape[0], -1, S.shape[-1])
327
+ S = S.to(torch.complex64)
328
+
329
+ spec = torch.abs(S)
330
+ spec_power = spec**2
331
+
332
+ # apply triangular mel filter bank
333
+ mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32)
334
+ log_spec = torch.clamp(spec_power @ mel_filters, min=1.0)
335
+ log_spec = torch.log(log_spec)
336
+
337
+ return log_spec
338
+
339
+ def _compute_audio_embed_size(self, audio_frames):
340
+ integer = audio_frames // self.audio_compression_rate
341
+ remainder = audio_frames % self.audio_compression_rate
342
+ result = integer + (remainder > 0).to(integer.dtype)
343
+
344
+ integer = result // self.audio_downsample_rate
345
+ remainder = result % self.audio_downsample_rate
346
+ result = integer + (remainder > 0).to(integer.dtype) # qformer compression
347
+
348
+ return result
349
+
350
+
351
+ __all__ = ["Phi4MultimodalFeatureExtractor"]
352
+
353
+ Phi4MultimodalFeatureExtractor.register_for_auto_class()
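A hedged usage sketch (not part of the commit) of the feature extractor defined above, on one second of dummy 16 kHz mono audio; shapes follow the docstrings in this file.

```python
# Hedged sketch: (16000 - win_length) // hop_length + 1 = 98 frames with the defaults above.
import numpy as np

feature_extractor = Phi4MultimodalFeatureExtractor()
waveform = np.random.randn(16000).astype(np.float32)   # 1 s of dummy mono audio
features = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
print(features["audio_input_features"].shape)  # (1, 98, 80) log-mel frames
print(features["audio_embed_sizes"])           # ceil(98 / 8) = 13 audio embeddings
```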
generation_config.json CHANGED
@@ -2,10 +2,9 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 199999,
4
  "eos_token_id": [
5
- 200020,
6
- 199999
7
  ],
8
  "pad_token_id": 199999,
9
- "transformers_version": "4.46.1",
10
- "use_cache": true
11
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 199999,
4
  "eos_token_id": [
5
+ 199999,
6
+ 200020
7
  ],
8
  "pad_token_id": 199999,
9
+ "transformers_version": "4.52.0.dev0"
 
10
  }
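The net effect of this diff: the generation config now pins `transformers_version` to 4.52.0.dev0, drops the `use_cache` entry (True is the default anyway), and keeps both stop ids. A trivial hedged reconstruction of the resulting values:

```python
# Hedged reconstruction of the updated generation_config.json values.
from transformers import GenerationConfig

gen_config = GenerationConfig(
    bos_token_id=199999,
    eos_token_id=[199999, 200020],  # generation stops on either id
    pad_token_id=199999,
)
print(gen_config.eos_token_id)  # [199999, 200020]
```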
image_processing_phi4_multimodal_fast.py ADDED
@@ -0,0 +1,284 @@
1
+ # Copyright 2025 Microsoft and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Processor class for Phi4Multimodal
17
+ """
18
+
19
+ import math
20
+ from typing import List, Optional, Union, TypedDict
21
+
22
+ import torch
23
+ from torchvision.transforms import functional as F
24
+
25
+ from transformers.image_processing_utils_fast import (
26
+ BaseImageProcessorFast,
27
+ BatchFeature,
28
+ Unpack,
29
+ convert_to_rgb,
30
+ ChannelDimension
31
+ )
32
+ from transformers.image_utils import ImageInput, make_flat_list_of_images, valid_images
33
+ from transformers.utils import TensorType, logging
34
+
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
+
39
+ class DefaultFastImageProcessorKwargs(TypedDict, total=False):
40
+ do_resize: Optional[bool]
41
+ size: Optional[dict[str, int]]
42
+ default_to_square: Optional[bool]
43
+ resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]]
44
+ do_center_crop: Optional[bool]
45
+ crop_size: Optional[dict[str, int]]
46
+ do_rescale: Optional[bool]
47
+ rescale_factor: Optional[Union[int, float]]
48
+ do_normalize: Optional[bool]
49
+ image_mean: Optional[Union[float, list[float]]]
50
+ image_std: Optional[Union[float, list[float]]]
51
+ do_convert_rgb: Optional[bool]
52
+ return_tensors: Optional[Union[str, TensorType]]
53
+ data_format: Optional[ChannelDimension]
54
+ input_data_format: Optional[Union[str, ChannelDimension]]
55
+ device: Optional["torch.device"]
56
+
57
+
58
+ class Phi4MultimodalFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
59
+ image_size: Optional[int]
60
+ patch_size: Optional[int]
61
+ dynamic_hd: Optional[int]
62
+
63
+
64
+ class Phi4MultimodalImageProcessorFast(BaseImageProcessorFast):
65
+ r"""
66
+ Constructs a Phi4Multimodal image processor.
67
+ """
68
+
69
+ image_size = 448
70
+ patch_size = 14
71
+ dynamic_hd = 36
72
+ image_mean = [0.5, 0.5, 0.5]
73
+ image_std = [0.5, 0.5, 0.5]
74
+ valid_init_kwargs = Phi4MultimodalFastImageProcessorKwargs
75
+ model_input_names = ["image_pixel_values", "image_sizes", "image_attention_mask"]
76
+
77
+ def __init__(self, **kwargs: Unpack[Phi4MultimodalFastImageProcessorKwargs]):
78
+ super().__init__(**kwargs)
79
+
80
+ def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height):
81
+ best_ratio_diff = float("inf")
82
+ best_ratio = (1, 1)
83
+ area = width * height
84
+ for ratio in target_ratios:
85
+ target_aspect_ratio = ratio[0] / ratio[1]
86
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
87
+ if ratio_diff < best_ratio_diff:
88
+ best_ratio_diff = ratio_diff
89
+ best_ratio = ratio
90
+ elif ratio_diff == best_ratio_diff:
91
+ if area > 0.5 * self.image_size * self.image_size * ratio[0] * ratio[1]:
92
+ best_ratio = ratio
93
+ return best_ratio
94
+
95
+ def dynamic_preprocess(self, image, max_num=36, min_num=1):
96
+ image_size = self.image_size
97
+ patch_size = self.patch_size
98
+ mask_size = image_size // patch_size
99
+ orig_width, orig_height = image.size
100
+
101
+ w_crop_num = math.ceil(orig_width / float(image_size))
102
+ h_crop_num = math.ceil(orig_height / float(image_size))
103
+ if w_crop_num * h_crop_num > max_num:
104
+ aspect_ratio = orig_width / orig_height
105
+
106
+ # calculate the existing image aspect ratio
107
+ target_ratios = {
108
+ (i, j)
109
+ for n in range(min_num, max_num + 1)
110
+ for i in range(1, n + 1)
111
+ for j in range(1, n + 1)
112
+ if i * j <= max_num and i * j >= min_num
113
+ }
114
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
115
+
116
+ # find the closest aspect ratio to the target
117
+ target_aspect_ratio = self.find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height)
118
+
119
+ # calculate the target width and height
120
+ target_width = image_size * target_aspect_ratio[0]
121
+ target_height = image_size * target_aspect_ratio[1]
122
+ else:
123
+ target_width = image_size * w_crop_num
124
+ target_height = image_size * h_crop_num
125
+ target_aspect_ratio = (w_crop_num, h_crop_num)
126
+
127
+ # Calculate the ratio
128
+ ratio_width = target_width / orig_width
129
+ ratio_height = target_height / orig_height
130
+ if ratio_width < ratio_height:
131
+ new_size = (target_width, int(orig_height * ratio_width))
132
+ padding_width = 0
133
+ padding_height = target_height - int(orig_height * ratio_width)
134
+ else:
135
+ new_size = (int(orig_width * ratio_height), target_height)
136
+ padding_width = target_width - int(orig_width * ratio_height)
137
+ padding_height = 0
138
+
139
+ attention_mask = torch.ones((int(mask_size * target_aspect_ratio[1]), int(mask_size * target_aspect_ratio[0])))
140
+ if padding_width >= patch_size:
141
+ attention_mask[:, -math.floor(padding_width / patch_size) :] = 0
142
+ if padding_height >= patch_size:
143
+ attention_mask[-math.floor(padding_height / patch_size) :, :] = 0
144
+
145
+ if min(new_size[1], target_height) < 10 or min(new_size[0], target_width) < 10:
146
+ raise ValueError(f"the aspect ratio is very extreme {new_size}")
147
+
148
+ image = F.resize(image, [new_size[1], new_size[0]])
149
+ resized_img = F.pad(image, [0, 0, padding_width, padding_height], fill=[255, 255, 255])
150
+
151
+ return resized_img, attention_mask
152
+
153
+ def pad_to_max_num_crops(self, images, max_crops=5):
154
+ """
155
+ images: B x 3 x H x W, B<=max_crops
156
+ """
157
+ B, _, H, W = images.shape
158
+ if B < max_crops:
159
+ pad = torch.zeros(max_crops - B, 3, H, W, dtype=images.dtype, device=images.device)
160
+ images = torch.cat([images, pad], dim=0)
161
+ return images
162
+
163
+ def pad_mask_to_max_num_crops(self, masks, max_crops=5):
164
+ B, H, W = masks.shape
165
+ if B < max_crops:
166
+ pad = torch.ones(max_crops - B, H, W, dtype=masks.dtype, device=masks.device)
167
+ masks = torch.cat([masks, pad], dim=0)
168
+ return masks
169
+
170
+ def preprocess(
171
+ self,
172
+ images: ImageInput,
173
+ image_mean: Optional[Union[float, List[float]]] = None,
174
+ image_std: Optional[Union[float, List[float]]] = None,
175
+ return_tensors: Optional[Union[str, TensorType]] = None,
176
+ ):
177
+ """
178
+ Args:
179
+ images (`ImageInput`):
180
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
181
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
182
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
183
+ Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
184
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
185
+ Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
186
+ return_tensors (`str` or `TensorType`, *optional*):
187
+ The type of tensors to return. Can be one of:
188
+ - Unset: Return a list of `np.ndarray`.
189
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
190
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
191
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
192
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
193
+ """
194
+ image_mean = image_mean if image_mean is not None else self.image_mean
195
+ image_std = image_std if image_std is not None else self.image_std
196
+
197
+ images = make_flat_list_of_images(images)
198
+ if not valid_images(images):
199
+ raise ValueError(
200
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
201
+ "torch.Tensor, tf.Tensor or jax.ndarray."
202
+ )
203
+ images = [convert_to_rgb(image) for image in images]
204
+
205
+ image_size = self.image_size
206
+ patch_size = self.patch_size
207
+ mask_size = image_size // patch_size
208
+ imgs_and_masks = [self.dynamic_preprocess(image, max_num=self.dynamic_hd) for image in images]
209
+ images, image_attention_masks = [x[0] for x in imgs_and_masks], [x[1] for x in imgs_and_masks]
210
+
211
+ images = [F.to_tensor(image) for image in images]
212
+ hd_images = [F.normalize(image, image_mean, image_std) for image in images]
213
+ global_image = [
214
+ torch.nn.functional.interpolate(
215
+ image.unsqueeze(0).float(),
216
+ size=(image_size, image_size),
217
+ mode="bicubic",
218
+ ).to(image.dtype)
219
+ for image in hd_images
220
+ ]
221
+
222
+ shapes = [[image.size(1), image.size(2)] for image in hd_images]
223
+ mask_shapes = [[mask.size(0), mask.size(1)] for mask in image_attention_masks]
224
+ global_attention_mask = [torch.ones((1, mask_size, mask_size)) for _ in hd_images]
225
+
226
+ hd_images_reshape = []
227
+ for im, (h, w) in zip(hd_images, shapes):
228
+ im = im.reshape(1, 3, h // image_size, image_size, w // image_size, image_size)
229
+ im = im.permute(0, 2, 4, 1, 3, 5)
230
+ im = im.reshape(-1, 3, image_size, image_size)
231
+ hd_images_reshape.append(im.contiguous())
232
+
233
+ attention_masks_reshape = []
234
+ for mask, (h, w) in zip(image_attention_masks, mask_shapes):
235
+ mask = mask.reshape(h // mask_size, mask_size, w // mask_size, mask_size)
236
+ mask = mask.transpose(1, 2)
237
+ mask = mask.reshape(-1, mask_size, mask_size)
238
+ attention_masks_reshape.append(mask.contiguous())
239
+
240
+ downsample_attention_masks = []
241
+ for mask, (h, w) in zip(attention_masks_reshape, mask_shapes):
242
+ mask = mask[:, 0::2, 0::2]
243
+ mask = mask.reshape(
244
+ h // mask_size, w // mask_size, mask_size // 2 + mask_size % 2, mask_size // 2 + mask_size % 2
245
+ )
246
+ mask = mask.transpose(1, 2)
247
+ mask = mask.reshape(mask.size(0) * mask.size(1), mask.size(2) * mask.size(3))
248
+ downsample_attention_masks.append(mask)
249
+
250
+ num_img_tokens = [
251
+ 256 + 1 + int(mask.sum().item()) + int(mask[:, 0].sum().item()) + 16 for mask in downsample_attention_masks
252
+ ]
253
+
254
+ hd_images_reshape = [
255
+ torch.cat([_global_image] + [_im], dim=0) for _global_image, _im in zip(global_image, hd_images_reshape)
256
+ ]
257
+ hd_masks_reshape = [
258
+ torch.cat([_global_mask] + [_mask], dim=0)
259
+ for _global_mask, _mask in zip(global_attention_mask, attention_masks_reshape)
260
+ ]
261
+ max_crops = max([img.size(0) for img in hd_images_reshape])
262
+ image_transformed = [self.pad_to_max_num_crops(im, max_crops) for im in hd_images_reshape]
263
+ image_transformed = torch.stack(image_transformed, dim=0)
264
+ mask_transformed = [self.pad_mask_to_max_num_crops(mask, max_crops) for mask in hd_masks_reshape]
265
+ mask_transformed = torch.stack(mask_transformed, dim=0)
266
+
267
+ returned_input_image_embeds = image_transformed
268
+ returned_image_sizes = torch.tensor(shapes, dtype=torch.long)
269
+ returned_image_attention_mask = mask_transformed
270
+ returned_num_img_tokens = num_img_tokens
271
+
272
+ data = {
273
+ "image_pixel_values": returned_input_image_embeds,
274
+ "image_sizes": returned_image_sizes,
275
+ "image_attention_mask": returned_image_attention_mask,
276
+ "num_img_tokens": returned_num_img_tokens,
277
+ }
278
+
279
+ return BatchFeature(data=data, tensor_type=return_tensors)
280
+
281
+
282
+ __all__ = ["Phi4MultimodalImageProcessorFast"]
283
+
284
+ Phi4MultimodalImageProcessorFast.register_for_auto_class()
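The fast image processor defined above tiles each image into at most `dynamic_hd` crops plus one global view, normalizes them, and reports a per-image token count. Because it registers itself for auto-class loading, it should be reachable through `AutoImageProcessor` with `trust_remote_code=True`. A minimal sketch, assuming the folder is published under a repo id such as `microsoft/Phi-4-multimodal-instruct` and that the class follows the usual `__call__` → `preprocess` convention (both assumptions, not taken from this diff):

# Minimal sketch: run the custom fast image processor on a dummy image.
# REPO_ID is an assumption; adjust it to wherever this folder lives on the Hub.
import numpy as np
from PIL import Image
from transformers import AutoImageProcessor

REPO_ID = "microsoft/Phi-4-multimodal-instruct"  # assumption
image_processor = AutoImageProcessor.from_pretrained(REPO_ID, trust_remote_code=True)

# Dummy RGB image; real inputs can be PIL images, numpy arrays, or torch tensors.
image = Image.fromarray(np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8))

outputs = image_processor(images=image, return_tensors="pt")
# Keys come from the `data` dict above: image_pixel_values, image_sizes,
# image_attention_mask, num_img_tokens.
print(outputs["image_pixel_values"].shape)  # (num_images, max_crops, 3, image_size, image_size)
print(outputs["num_img_tokens"])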
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57b93f5d0c9422c0b76b68119660187989bd8bb47848994376be3ac53eb61a95
3
+ size 4903637712
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd6f60df08041b5c48afe7d7624d4de6e9d7d86162dec7a7e908a71d595e2967
3
+ size 4584575136
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
 
modeling_phi4_multimodal.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json CHANGED
@@ -1,14 +1,21 @@
1
  {
2
  "auto_map": {
3
- "AutoProcessor": "processing_phi4mm.Phi4MMProcessor",
4
- "AutoImageProcessor": "processing_phi4mm.Phi4MMImageProcessor",
5
- "AutoFeatureExtractor": "processing_phi4mm.Phi4MMAudioFeatureExtractor"
6
  },
7
- "image_processor_type": "Phi4MMImageProcessor",
8
- "processor_class": "Phi4MMProcessor",
9
- "feature_extractor_type": "Phi4MMAudioFeatureExtractor",
10
  "audio_compression_rate": 8,
11
  "audio_downsample_rate": 1,
12
  "audio_feat_stride": 1,
13
- "dynamic_hd": 36
 
 
 
 
 
 
 
 
 
 
14
  }
 
1
  {
2
  "auto_map": {
3
+ "AutoProcessor": "processing_phi4_multimodal.Phi4MultimodalProcessor",
4
+ "AutoImageProcessor": "image_processing_phi4_multimodal_fast.Phi4MultimodalImageProcessorFast",
5
+ "AutoFeatureExtractor": "feature_extraction_phi4_multimodal.Phi4MultimodalFeatureExtractor"
6
  },
 
 
 
7
  "audio_compression_rate": 8,
8
  "audio_downsample_rate": 1,
9
  "audio_feat_stride": 1,
10
+ "feature_extractor_type": "Phi4MultimodalFeatureExtractor",
11
+ "feature_size": 80,
12
+ "hop_length": 160,
13
+ "n_fft": 512,
14
+ "padding_side": "right",
15
+ "padding_value": 0.0,
16
+ "preemphasis": 0.97,
17
+ "processor_class": "Phi4MultimodalProcessor",
18
+ "return_attention_mask": true,
19
+ "sampling_rate": 16000,
20
+ "win_length": 400
21
  }
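The rewritten preprocessor_config.json points the auto classes at the new remote-code modules and makes the audio front-end explicit: 80-dimensional features at a 16000 Hz sampling rate, a 400-sample window, a 160-sample hop, and a 512-point FFT. A minimal sketch of exercising the feature extractor it declares, assuming the same hypothetical repo id as above and a waveform already at 16 kHz:

# Minimal sketch: load the declared feature extractor and featurize one second of silence.
# REPO_ID is an assumption; the output key names are not asserted here.
import numpy as np
from transformers import AutoFeatureExtractor

REPO_ID = "microsoft/Phi-4-multimodal-instruct"  # assumption
feature_extractor = AutoFeatureExtractor.from_pretrained(REPO_ID, trust_remote_code=True)

waveform = np.zeros(16000, dtype=np.float32)  # 1 s of silence at the declared 16 kHz rate
features = feature_extractor([waveform], return_tensors="pt")
print({k: tuple(v.shape) if hasattr(v, "shape") else v for k, v in features.items()})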
processing_phi4_multimodal.py ADDED
@@ -0,0 +1,541 @@
1
+ # Copyright 2025 Microsoft and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Processor class for Phi4Multimodal
17
+ """
18
+
19
+ import re
20
+ import os
21
+ import requests
22
+ import base64
23
+ from io import BytesIO
24
+ from typing import List, Optional, Union, TypedDict
25
+
26
+ import librosa
27
+ import numpy as np
28
+ import PIL.Image
+ import PIL.ImageOps  # needed for exif_transpose() in load_image below
29
+
30
+ from transformers.image_processing_utils import BatchFeature
31
+ from transformers.image_utils import ImageInput
32
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, CommonKwargs, ProcessorChatTemplateKwargs
33
+ from transformers.tokenization_utils_base import TextInput
34
+ from transformers.utils import logging
35
+
36
+
37
+ from .feature_extraction_phi4_multimodal import AudioInput
38
+
39
+
40
+ logger = logging.get_logger(__name__)
41
+
42
+
43
+ class ChatTemplateLoadKwargs(TypedDict, total=False):
44
+ """
45
+ Keyword arguments used to load multimodal data in processor chat templates.
46
+
47
+ num_frames (`int`, *optional*):
48
+ Number of frames to sample uniformly. If not passed, the whole video is loaded.
49
+ video_load_backend (`str`, *optional*, defaults to `"pyav"`):
50
+ The backend to use when loading the video which will be used only when there are videos in the conversation.
51
+ Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "pyav" because it is the only backend
52
+ that supports all types of sources to load from.
53
+ video_fps (`int`, *optional*):
54
+ Number of frames to sample per second. Should be passed only when `num_frames=None`.
55
+ If not specified and `num_frames==None`, all frames are sampled.
56
+ sample_indices_fn (`Callable`, *optional*):
57
+ A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
58
+ a different sampling technique than provided by the `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
59
+ If not provided, simple uniform sampling with fps is performed; otherwise `sample_indices_fn` has priority over the other args.
60
+ The function expects at input the all args along with all kwargs passed to `load_video` and should output valid
61
+ indices at which the video should be sampled. For example:
62
+
63
+ def sample_indices_fn(num_frames, fps, metadata, **kwargs):
64
+ # add your sampling logic here ...
65
+ return np.linspace(start_idx, end_idx, num_frames, dtype=int)
66
+ """
67
+
68
+ num_frames: Optional[int] = None
69
+ video_load_backend: Optional[str] = "pyav"
70
+ video_fps: Optional[int] = None
71
+ sampling_rate: Optional[int] = 16_000
72
+ load_audio_from_video: Optional[bool] = False
73
+
74
+
75
+ class AllKwargsForChatTemplate(
76
+ TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, CommonKwargs, ProcessorChatTemplateKwargs
77
+ ):
78
+ processor_kwargs: ProcessingKwargs = {
79
+ **ProcessingKwargs.__annotations__,
80
+ }
81
+ mm_load_kwargs: ChatTemplateLoadKwargs = {
82
+ **TextKwargs.__annotations__,
83
+ }
84
+ template_kwargs: ProcessorChatTemplateKwargs = {
85
+ **ProcessorChatTemplateKwargs.__annotations__,
86
+ }
87
+
88
+
89
+ class Phi4MultimodalProcessorKwargs(ProcessingKwargs, total=False):
90
+ _defaults = {
91
+ "audio_kwargs": {
92
+ "device": "cpu",
93
+ },
94
+ }
95
+
96
+
97
+ def load_audio(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None) -> np.ndarray:
98
+ """
99
+ Loads `audio` to an np.ndarray object.
100
+
101
+ Args:
102
+ audio (`str` or `np.ndarray`):
103
+ The audio to be loaded into the numpy array format.
104
+ sampling_rate (`int`, *optional*, defaults to 16000):
105
+ The sampling rate to be used when loading the audio. It should be the same as the
106
+ sampling rate that the model you will be using was trained with.
107
+ timeout (`float`, *optional*):
108
+ The timeout value in seconds for the URL request.
109
+
110
+ Returns:
111
+ `np.ndarray`: A numpy array representing the audio.
112
+ """
113
+
114
+ if isinstance(audio, str):
115
+ # Load audio from URL (e.g https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav)
116
+ if audio.startswith("http://") or audio.startswith("https://"):
117
+ audio = librosa.load(BytesIO(requests.get(audio, timeout=timeout).content), sr=sampling_rate)[0]
118
+ elif os.path.isfile(audio):
119
+ audio = librosa.load(audio, sr=sampling_rate)[0]
120
+ elif isinstance(audio, np.ndarray):
121
+ audio = audio
122
+ else:
123
+ raise TypeError(
124
+ "Incorrect format used for `audio`. Should be an url linking to an audio, a local path, or numpy array."
125
+ )
126
+ return audio
127
+
128
+
129
+ def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] = None) -> "PIL.Image.Image":
130
+ """
131
+ Loads `image` to a PIL Image.
132
+
133
+ Args:
134
+ image (`str` or `PIL.Image.Image`):
135
+ The image to convert to the PIL Image format.
136
+ timeout (`float`, *optional*):
137
+ The timeout value in seconds for the URL request.
138
+
139
+ Returns:
140
+ `PIL.Image.Image`: A PIL Image.
141
+ """
142
+ if isinstance(image, str):
143
+ if image.startswith("http://") or image.startswith("https://"):
144
+ # We need to actually check for a real protocol, otherwise it's impossible to use a local file
145
+ # like http_huggingface_co.png
146
+ image = PIL.Image.open(BytesIO(requests.get(image, timeout=timeout).content))
147
+ elif os.path.isfile(image):
148
+ image = PIL.Image.open(image)
149
+ else:
150
+ if image.startswith("data:image/"):
151
+ image = image.split(",")[1]
152
+
153
+ # Try to load as base64
154
+ try:
155
+ b64 = base64.decodebytes(image.encode())
156
+ image = PIL.Image.open(BytesIO(b64))
157
+ except Exception as e:
158
+ raise ValueError(
159
+ f"Incorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got {image}. Failed with {e}"
160
+ )
161
+ elif isinstance(image, PIL.Image.Image):
162
+ image = image
163
+ else:
164
+ raise TypeError(
165
+ "Incorrect format used for image. Should be an url linking to an image, a base64 string, a local path, or a PIL image."
166
+ )
167
+ image = PIL.ImageOps.exif_transpose(image)
168
+ image = image.convert("RGB")
169
+ return image
170
+
171
+
172
+ class Phi4MultimodalProcessor(ProcessorMixin):
173
+ r"""
174
+ Constructs a Phi4Multimodal processor which wraps an image processor, an audio processor, and a GPT tokenizer into a single processor.
175
+
176
+ [`Phi4MultimodalProcessor`] offers all the functionalities of [`Phi4MultimodalImageProcessorFast`] and [`GPT2Tokenizer`]. See the
177
+ [`~Phi4MultimodalProcessor.__call__`] and [`~Phi4MultimodalProcessor.decode`] for more information.
178
+
179
+ Args:
180
+ image_processor (`Phi4MultimodalImageProcessorFast`):
181
+ The image processor to use for images.
182
+ audio_processor (`Phi4MultimodalFeatureExtractor`):
183
+ The audio processor to use for audio inputs.
184
+ tokenizer (`GPT2TokenizerFast`):
185
+ The tokenizer to use for text.
186
+ fake_image_token_pattern (`str`, *optional*, defaults to `r"<\|image_\d+\|>"`):
187
+ The fake image token pattern.
188
+ fake_audio_token_pattern (`str`, *optional*, defaults to `r"<\|audio_\d+\|>"`):
189
+ The fake audio token pattern.
190
+ """
191
+
192
+ attributes = ["image_processor", "audio_processor", "tokenizer"]
193
+ tokenizer_class = "GPT2TokenizerFast"
194
+ image_processor_class = "AutoImageProcessor"
195
+ audio_processor_class = "AutoFeatureExtractor"
196
+ valid_kwargs = ["chat_template"]
197
+
198
+ def __init__(
199
+ self,
200
+ image_processor,
201
+ audio_processor,
202
+ tokenizer,
203
+ **kwargs,
204
+ ):
205
+ self.image_token = tokenizer.image_token
206
+ self.image_token_id = tokenizer.image_token_id
207
+ self.audio_token = tokenizer.audio_token
208
+ self.audio_token_id = tokenizer.audio_token_id
209
+ super().__init__(image_processor, audio_processor, tokenizer, **kwargs)
210
+
211
+ def __call__(
212
+ self,
213
+ text: Union[TextInput, List[TextInput]],
214
+ images: Optional[ImageInput] = None,
215
+ audio: Optional[AudioInput] = None,
216
+ **kwargs: Unpack[ProcessingKwargs],
217
+ ) -> BatchFeature:
218
+ """
219
+ Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
220
+ and `kwargs` arguments to GPT2Tokenizer's [`~GPT2Tokenizer.__call__`] if `text` is not `None` to encode
221
+ the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
222
+ Phi4MultimodalImageProcessorFast's [`~Phi4MultimodalImageProcessorFast.__call__`] if `images` is not `None`. Please refer to the docstring
223
+ of the above two methods for more information.
224
+
225
+ Args:
226
+ text (`str`, `List[str]`, `List[List[str]]`):
227
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
228
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
229
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
230
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
231
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
232
+ tensor. Both channels-first and channels-last formats are supported.
233
+ audio (`List[Union[np.ndarray, torch.Tensor]]`):
234
+ List of the audios to be prepared.
235
+
236
+ Returns:
237
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
238
+
239
+ - **input_ids** -- List of token ids to be fed to a model.
240
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
241
+ - **image_pixel_values** -- Pixel values to be fed to a model.
242
+ - **image_sizes** -- List of tuples specifying the size of each image in `image_pixel_values`.
243
+ - **image_attention_mask** -- List of attention masks for each image in `image_pixel_values`.
244
+ - **input_audio_embeds** -- Audio embeddings to be fed to a model.
245
+ - **audio_embed_sizes** -- List of integers specifying the size of each audio in `input_audio_embeds`.
246
+ """
247
+
248
+ output_kwargs = self._merge_kwargs(Phi4MultimodalProcessorKwargs, self.tokenizer.init_kwargs, **kwargs)
249
+ image_kwargs = output_kwargs["images_kwargs"]
250
+ audio_kwargs = output_kwargs["audio_kwargs"]
251
+
252
+ image_inputs = self.image_processor(images, **image_kwargs) if images is not None else {}
253
+ audio_inputs = self.audio_processor(audio, **audio_kwargs) if audio is not None else {}
254
+
255
+ # We pop here for images as we don't need it later
256
+ num_img_tokens = image_inputs.pop("num_img_tokens", [])
257
+ audio_embed_sizes = audio_inputs.get("audio_embed_sizes", [])
258
+
259
+ # Replace certain special tokens for compatibility
260
+ if isinstance(text, str):
261
+ text = [text]
262
+ elif not isinstance(text, list) and not isinstance(text[0], str):
263
+ raise ValueError("Invalid input text. Please provide a string, or a list of strings")
264
+
265
+ image_token = self.tokenizer.image_token
266
+ audio_token = self.tokenizer.audio_token
267
+
268
+ # Check that the number of special tokens is sound
269
+ concatenated_prompt = "".join(text)
270
+ if concatenated_prompt.count(image_token) != len(num_img_tokens):
271
+ raise ValueError(
272
+ "You should add as much image tokens `<|image|>` in your prompt as you pass `images` to the processor. ",
273
+ f"Input contains {concatenated_prompt.count(image_token)} tokens != {len(num_img_tokens)} images",
274
+ )
275
+ if concatenated_prompt.count(audio_token) != len(audio_embed_sizes):
276
+ raise ValueError(
277
+ "You should add as much audio tokens `<|audio|>` in your prompt as you pass `audios` to the processor. "
278
+ f"Input contains {concatenated_prompt.count(audio_token)} tokens != {len(audio_embed_sizes)} audios"
279
+ )
280
+
281
+ # Add appropriate number of image/audio tokens (note that the count of replacement is dynamic)
282
+ image_count_iter = iter(num_img_tokens)
283
+ audio_count_iter = iter(audio_embed_sizes)
284
+ processed_text = [
285
+ re.sub(re.escape(image_token), lambda _: image_token * next(image_count_iter), t) for t in text
286
+ ]
287
+ processed_text = [
288
+ re.sub(re.escape(audio_token), lambda _: audio_token * next(audio_count_iter), t) for t in processed_text
289
+ ]
290
+
291
+ return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
292
+ text_inputs = self.tokenizer(processed_text, **output_kwargs["text_kwargs"])
293
+ self._check_special_mm_tokens(processed_text, text_inputs, modalities=["image"])
294
+
295
+ # prepare batch feature
296
+ data = {
297
+ **text_inputs,
298
+ **image_inputs,
299
+ **audio_inputs,
300
+ }
301
+
302
+ return BatchFeature(data=data, tensor_type=return_tensors)
303
+
304
+ def batch_decode(self, *args, **kwargs):
305
+ """
306
+ This method forwards all its arguments to GPT2Tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
307
+ refer to the docstring of this method for more information.
308
+ """
309
+ return self.tokenizer.batch_decode(*args, **kwargs)
310
+
311
+ def decode(self, *args, **kwargs):
312
+ """
313
+ This method forwards all its arguments to GPT2Tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
314
+ the docstring of this method for more information.
315
+ """
316
+ return self.tokenizer.decode(*args, **kwargs)
317
+
318
+ @property
319
+ def model_input_names(self):
320
+ tokenizer_input_names = self.tokenizer.model_input_names
321
+ image_processor_input_names = self.image_processor.model_input_names
322
+ audio_processor_input_names = self.audio_processor.model_input_names
323
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names + audio_processor_input_names))
324
+
325
+ def _check_special_mm_tokens(self, text: list[str], text_inputs: "BatchFeature", modalities: list[str]):
326
+ """
327
+ Checks that number of special tokens in text and processed text is same. The count can be different
328
+ if tokenized text was truncated, leading to issues in model code.
329
+ """
330
+ for modality in modalities:
331
+ token_str = getattr(self, f"{modality}_token")
332
+ token_id = getattr(self, f"{modality}_token_id")
333
+ ids_count = [list(ids).count(token_id) for ids in text_inputs["input_ids"]]
334
+ text_count = [sample.count(token_str) for sample in text]
335
+
336
+ if ids_count != text_count:
337
+ raise ValueError(
338
+ f"Mismatch in `{modality}` token count between text and `input_ids`. Got ids={ids_count} and text={text_count}. "
339
+ "Likely due to `truncation='max_length'`. Please disable truncation or increase `max_length`."
340
+ )
341
+
342
+ def apply_chat_template(
343
+ self,
344
+ conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
345
+ chat_template: Optional[str] = None,
346
+ **kwargs: Unpack[AllKwargsForChatTemplate],
347
+ ) -> str:
348
+ """
349
+ Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input
350
+ conversations to turn them into a single tokenizable string.
351
+
352
+ The input is expected to be in the following format, where each message content is a list consisting of text and
353
+ optionally image or video inputs. One can also provide an image, video, URL or local path which will be used to form
354
+ `pixel_values` when `return_dict=True`. If not provided, one will get only the formatted text, optionally tokenized text.
355
+
356
+ conversation = [
357
+ {
358
+ "role": "user",
359
+ "content": [
360
+ {"type": "image", "image": "https://www.ilankelman.org/stopsigns/australia.jpg"},
361
+ {"type": "text", "text": "Please describe this image in detail."},
362
+ ],
363
+ },
364
+ ]
365
+
366
+ Args:
367
+ conversation (`Union[List[Dict, [str, str]], List[List[Dict[str, str]]]]`):
368
+ The conversation to format.
369
+ chat_template (`Optional[str]`, *optional*):
370
+ The Jinja template to use for formatting the conversation. If not provided, the tokenizer's
371
+ chat template is used.
372
+ """
373
+
374
+ if chat_template is None:
375
+ if isinstance(self.chat_template, dict) and "default" in self.chat_template:
376
+ chat_template = self.chat_template["default"]
377
+ elif isinstance(self.chat_template, dict):
378
+ raise ValueError(
379
+ 'The processor has multiple chat templates but none of them are named "default". You need to specify'
380
+ " which one to use by passing the `chat_template` argument. Available templates are: "
381
+ f"{', '.join(self.chat_template.keys())}"
382
+ )
383
+ elif self.chat_template is not None:
384
+ chat_template = self.chat_template
385
+ else:
386
+ raise ValueError(
387
+ "Cannot use apply_chat_template because this processor does not have a chat template."
388
+ )
389
+ else:
390
+ if isinstance(self.chat_template, dict) and chat_template in self.chat_template:
391
+ # It's the name of a template, not a full template string
392
+ chat_template = self.chat_template[chat_template]
393
+ else:
394
+ # It's a template string, render it directly
395
+ chat_template = chat_template
396
+
397
+ # Fill sets of kwargs that should be used by different parts of template
398
+ processed_kwargs = {
399
+ "mm_load_kwargs": {},
400
+ "template_kwargs": {},
401
+ }
402
+
403
+ for kwarg_type in processed_kwargs:
404
+ for key in AllKwargsForChatTemplate.__annotations__[kwarg_type].__annotations__.keys():
405
+ kwarg_type_defaults = AllKwargsForChatTemplate.__annotations__[kwarg_type]
406
+ default_value = getattr(kwarg_type_defaults, key, None)
407
+ value = kwargs.pop(key, default_value)
408
+ if value is not None and not isinstance(value, dict):
409
+ processed_kwargs[kwarg_type][key] = value
410
+
411
+ if isinstance(conversation, (list, tuple)) and (
412
+ isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
413
+ ):
414
+ is_batched = True
415
+ conversations = conversation
416
+ else:
417
+ is_batched = False
418
+ conversations = [conversation]
419
+
420
+ tokenize = processed_kwargs["template_kwargs"].pop("tokenize", False)
421
+ return_dict = processed_kwargs["template_kwargs"].pop("return_dict", False)
422
+ mm_load_kwargs = processed_kwargs["mm_load_kwargs"]
423
+
424
+ if tokenize:
425
+ batch_images, batch_videos = [], []
426
+ batch_audios = []
427
+ batch_video_metadata = []
428
+ for conversation in conversations:
429
+ images, videos = [], []
430
+ video_metadata = []
431
+ for message in conversation:
432
+ visuals = [content for content in message["content"] if content["type"] in ["image", "video"]]
433
+ audio_fnames = [
434
+ content[key]
435
+ for content in message["content"]
436
+ for key in ["audio", "url", "path"]
437
+ if key in content and content["type"] == "audio"
438
+ ]
439
+ image_fnames = [
440
+ vision_info[key]
441
+ for vision_info in visuals
442
+ for key in ["image", "url", "path", "base64"]
443
+ if key in vision_info and vision_info["type"] == "image"
444
+ ]
445
+ video_fnames = [
446
+ vision_info[key]
447
+ for vision_info in visuals
448
+ for key in ["video", "url", "path"]
449
+ if key in vision_info and vision_info["type"] == "video"
450
+ ]
451
+
452
+ for fname in image_fnames:
453
+ images.append(load_image(fname))
454
+
455
+ # Audio models do not accept nested lists of audios (yet!), so we construct a flat input audio list
456
+ if not mm_load_kwargs["load_audio_from_video"]:
457
+ for fname in audio_fnames:
458
+ batch_audios.append(load_audio(fname, sampling_rate=mm_load_kwargs["sampling_rate"]))
459
+ else:
460
+ for fname in video_fnames:
461
+ batch_audios.append(load_audio(fname, sampling_rate=mm_load_kwargs["sampling_rate"]))
462
+
463
+ for fname in video_fnames:
464
+ if isinstance(fname, (list, tuple)) and isinstance(fname[0], str):
465
+ video = [np.array(load_image(image_fname)) for image_fname in fname]
466
+ # create a 4D video because `load_video` always returns a 4D array
467
+ video = np.stack(video)
468
+ metadata = None
469
+ logger.warning(
470
+ "When loading the video from list of images, we cannot infer metadata such as `fps` or `duration`. "
471
+ "If your model uses this metadata during processing, please load the whole video and let the model sample frames instead."
472
+ )
473
+ else:
474
+ # TODO: raushan, should be `self.video_processor.load_video_for_model` when API is added
475
+ video, metadata = self._load_video_for_model(
476
+ fname,
477
+ num_frames=mm_load_kwargs.get("num_frames", None),
478
+ fps=mm_load_kwargs.get("video_fps", None),
479
+ backend=mm_load_kwargs["video_load_backend"],
480
+ **kwargs,
481
+ )
482
+ videos.append(video)
483
+ video_metadata.append(metadata)
484
+
485
+ # Currently all processors can accept nested list of batches, but not flat list of visuals
486
+ # So we'll make a batched list of images and let the processor handle it
487
+ if images:
488
+ batch_images.append(images)
489
+ if videos:
490
+ batch_videos.append(videos)
491
+ batch_video_metadata.append(video_metadata)
492
+
493
+ # Process conversation with video/image information if needed. Then convert into a prompt using Jinja template
494
+ conversations = self._process_messages_for_chat_template(
495
+ conversations,
496
+ batch_images=batch_images,
497
+ batch_videos=batch_videos,
498
+ batch_video_metadata=batch_video_metadata,
499
+ **processed_kwargs["mm_load_kwargs"],
500
+ )
501
+
502
+ prompt = self.tokenizer.apply_chat_template(
503
+ conversations,
504
+ chat_template=chat_template,
505
+ tokenize=False,
506
+ return_dict=False,
507
+ **processed_kwargs["template_kwargs"],
508
+ )
509
+
510
+ if not is_batched:
511
+ prompt = prompt[0]
512
+
513
+ if tokenize:
514
+ # Tokenizer's `apply_chat_template` never adds special tokens when tokenizing
515
+ # But processor's `apply_chat_template` didn't have an option to tokenize, so users had to format the prompt
516
+ # and pass it to the processor. Users thus never worried about special tokens relying on processor handling
517
+ # everything internally. The line below keeps BC for that and makes it possible to work with models that have
518
+ # special tokens in the template (consistent with tokenizers). We don't want to raise a warning, as it would flood the command line
519
+ # without an actionable solution for users.
520
+ single_prompt = prompt[0] if is_batched else prompt
521
+ if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token):
522
+ kwargs["add_special_tokens"] = False
523
+
524
+ out = self(
525
+ text=prompt,
526
+ images=batch_images if batch_images else None,
527
+ videos=batch_videos if batch_videos else None,
528
+ audio=batch_audios if batch_audios else None,
529
+ **kwargs,
530
+ )
531
+ if return_dict:
532
+ return out
533
+ else:
534
+ return out["input_ids"]
535
+ return prompt
536
+
537
+
538
+ __all__ = ["Phi4MultimodalProcessor"]
539
+
540
+
541
+ Phi4MultimodalProcessor.register_for_auto_class()
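Taken together, the processor above turns each `<|image|>` / `<|audio|>` placeholder into the right number of repeated tokens (using `num_img_tokens` and `audio_embed_sizes`) and merges the tokenizer, image, and audio outputs into a single `BatchFeature`. A minimal end-to-end sketch, assuming the hypothetical repo id used earlier and reusing the example image URL already present in this file's docstrings:

# Minimal sketch: format a conversation with the chat template, then call the processor.
# REPO_ID is an assumption; the image URL is the example used in the docstrings above.
import requests
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor

REPO_ID = "microsoft/Phi-4-multimodal-instruct"  # assumption
processor = AutoProcessor.from_pretrained(REPO_ID, trust_remote_code=True)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Please describe this image in detail."},
        ],
    },
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image = Image.open(BytesIO(requests.get(url, timeout=10).content)).convert("RGB")

inputs = processor(text=prompt, images=[image], return_tensors="pt")
print(inputs["input_ids"].shape, inputs["image_pixel_values"].shape)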
special_tokens_map.json CHANGED
@@ -13,7 +13,13 @@
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
- "pad_token": "<|endoftext|>",
 
 
 
 
 
 
17
  "unk_token": {
18
  "content": "<|endoftext|>",
19
  "lstrip": false,
 
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
  "unk_token": {
24
  "content": "<|endoftext|>",
25
  "lstrip": false,
speech-lora/adapter_config.json CHANGED
@@ -1,23 +1,31 @@
1
  {
 
2
  "auto_mapping": null,
3
- "base_model_name_or_path": "TBA",
4
  "bias": "none",
 
 
 
5
  "fan_in_fan_out": false,
6
- "inference_mode": true,
7
  "init_lora_weights": true,
 
8
  "layers_pattern": null,
9
  "layers_to_transform": null,
 
10
  "lora_alpha": 640,
 
11
  "lora_dropout": 0.01,
12
- "modules_to_save": [],
 
 
13
  "peft_type": "LORA",
14
  "r": 320,
 
15
  "revision": null,
16
- "target_modules": [
17
- "qkv_proj",
18
- "o_proj",
19
- "gate_up_proj",
20
- "down_proj"
21
- ],
22
- "task_type": "CAUSAL_LM"
23
  }
 
1
  {
2
+ "alpha_pattern": {},
3
  "auto_mapping": null,
4
+ "base_model_name_or_path": null,
5
  "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
  "fan_in_fan_out": false,
10
+ "inference_mode": false,
11
  "init_lora_weights": true,
12
+ "layer_replication": null,
13
  "layers_pattern": null,
14
  "layers_to_transform": null,
15
+ "loftq_config": {},
16
  "lora_alpha": 640,
17
+ "lora_bias": false,
18
  "lora_dropout": 0.01,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
  "peft_type": "LORA",
23
  "r": 320,
24
+ "rank_pattern": {},
25
  "revision": null,
26
+ "target_modules": "model.layers.\\d+.((self_attn.(qkv|o)_proj)|(mlp.(gate_up|down)_proj))",
27
+ "task_type": "CAUSAL_LM",
28
+ "trainable_token_indices": null,
29
+ "use_dora": false,
30
+ "use_rslora": false
 
 
31
  }
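Compared with the old config, the speech adapter now carries the full, current PEFT schema and replaces the bare module-name list with a regular expression over the decoder layers, so only `qkv_proj`, `o_proj`, `gate_up_proj`, and `down_proj` inside `model.layers.N` are adapted. PEFT full-matches module names against a string-valued `target_modules`; a small sketch with hypothetical layer names (the names are illustrative, not read from the checkpoint):

# Minimal sketch: which module names the new speech-lora `target_modules` regex selects.
# The candidate names below are hypothetical examples in the usual decoder naming scheme.
import re

target_modules = r"model.layers.\d+.((self_attn.(qkv|o)_proj)|(mlp.(gate_up|down)_proj))"

candidates = [
    "model.layers.0.self_attn.qkv_proj",   # matches
    "model.layers.0.self_attn.o_proj",     # matches
    "model.layers.7.mlp.gate_up_proj",     # matches
    "model.layers.7.mlp.down_proj",        # matches
    "model.embed_tokens",                  # no match
    "model.layers.2.self_attn.k_proj",     # no match
]

for name in candidates:
    # PEFT uses re.fullmatch when `target_modules` is a single string.
    print(f"{name}: {bool(re.fullmatch(target_modules, name))}")

The vision-lora/adapter_config.json further down gets the same treatment with its own rank and alpha.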
speech-lora/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c2237461a4d1f9292cd128147bd3f0f70326a48d5d79c8e0f7583b26c095b30
3
- size 922782296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16f70b0aba566f6c30e67a11e90033453e9375d102e031cec40956a2a0e9771e
3
+ size 922777944
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c1b9f641d4f8b7247b8d5007dd3b6a9f6a87cb5123134fe0d326f14d10c0585
3
- size 15524479
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57589a5827b578065aecc0a91cc1e4e9a0bac0a17fb02539bea63bb9beb889a2
3
+ size 13303259
tokenizer_config.json CHANGED
@@ -1,125 +1 @@
1
- {
2
- "add_prefix_space": false,
3
- "added_tokens_decoder": {
4
- "200010": {
5
- "content": "<|endoftext10|>",
6
- "lstrip": false,
7
- "normalized": false,
8
- "rstrip": false,
9
- "single_word": false,
10
- "special": true
11
- },
12
- "200011": {
13
- "content": "<|endoftext11|>",
14
- "lstrip": false,
15
- "normalized": false,
16
- "rstrip": false,
17
- "single_word": false,
18
- "special": true
19
- },
20
- "199999": {
21
- "content": "<|endoftext|>",
22
- "lstrip": false,
23
- "normalized": false,
24
- "rstrip": false,
25
- "single_word": false,
26
- "special": true
27
- },
28
- "200018": {
29
- "content": "<|endofprompt|>",
30
- "lstrip": false,
31
- "normalized": false,
32
- "rstrip": false,
33
- "single_word": false,
34
- "special": true
35
- },
36
- "200019": {
37
- "content": "<|assistant|>",
38
- "lstrip": false,
39
- "normalized": false,
40
- "rstrip": true,
41
- "single_word": false,
42
- "special": true
43
- },
44
- "200020": {
45
- "content": "<|end|>",
46
- "lstrip": false,
47
- "normalized": false,
48
- "rstrip": true,
49
- "single_word": false,
50
- "special": true
51
- },
52
- "200021": {
53
- "content": "<|user|>",
54
- "lstrip": false,
55
- "normalized": false,
56
- "rstrip": true,
57
- "single_word": false,
58
- "special": true
59
- },
60
- "200022": {
61
- "content": "<|system|>",
62
- "lstrip": false,
63
- "normalized": false,
64
- "rstrip": true,
65
- "single_word": false,
66
- "special": true
67
- },
68
- "200023": {
69
- "content": "<|tool|>",
70
- "lstrip": false,
71
- "normalized": false,
72
- "rstrip": true,
73
- "single_word": false,
74
- "special": false
75
- },
76
- "200024": {
77
- "content": "<|/tool|>",
78
- "lstrip": false,
79
- "normalized": false,
80
- "rstrip": true,
81
- "single_word": false,
82
- "special": false
83
- },
84
- "200025": {
85
- "content": "<|tool_call|>",
86
- "lstrip": false,
87
- "normalized": false,
88
- "rstrip": true,
89
- "single_word": false,
90
- "special": false
91
- },
92
- "200026": {
93
- "content": "<|/tool_call|>",
94
- "lstrip": false,
95
- "normalized": false,
96
- "rstrip": true,
97
- "single_word": false,
98
- "special": false
99
- },
100
- "200027": {
101
- "content": "<|tool_response|>",
102
- "lstrip": false,
103
- "normalized": false,
104
- "rstrip": true,
105
- "single_word": false,
106
- "special": false
107
- },
108
- "200028": {
109
- "content": "<|tag|>",
110
- "lstrip": false,
111
- "normalized": false,
112
- "rstrip": true,
113
- "single_word": false,
114
- "special": true
115
- }
116
- },
117
- "bos_token": "<|endoftext|>",
118
- "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}",
119
- "clean_up_tokenization_spaces": false,
120
- "eos_token": "<|endoftext|>",
121
- "model_max_length": 131072,
122
- "pad_token": "<|endoftext|>",
123
- "tokenizer_class": "GPT2TokenizerFast",
124
- "unk_token": "<|endoftext|>"
125
- }
 
1
+ {"add_prefix_space": false, "added_tokens_decoder": {"199999": {"content": "<|endoftext|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "200010": {"content": "<|image|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "200011": {"content": "<|audio|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "200018": {"content": "<|endofprompt|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "200019": {"content": "<|assistant|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}, "200020": {"content": "<|end|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}, "200021": {"content": "<|user|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}, "200022": {"content": "<|system|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}, "200023": {"content": "<|tool|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200024": {"content": "<|/tool|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200025": {"content": "<|tool_call|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200026": {"content": "<|/tool_call|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200027": {"content": "<|tool_response|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200028": {"content": "<|tag|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}}, "audio_token": "<|audio|>", "bos_token": "<|endoftext|>", "clean_up_tokenization_spaces": false, "eos_token": "<|endoftext|>", "extra_special_tokens": {"audio_token": "<|audio|>", "image_token": "<|image|>"}, "image_token": "<|image|>", "model_max_length": 131072, "pad_token": "<|endoftext|>", "processor_class": "Phi4MultimodalProcessor", "tokenizer_class": "GPT2Tokenizer", "unk_token": "<|endoftext|>"}
 
vision-lora/adapter_config.json CHANGED
@@ -1,23 +1,31 @@
1
  {
 
2
  "auto_mapping": null,
3
- "base_model_name_or_path": "TBA",
4
  "bias": "none",
 
 
 
5
  "fan_in_fan_out": false,
6
- "inference_mode": true,
7
  "init_lora_weights": true,
 
8
  "layers_pattern": null,
9
  "layers_to_transform": null,
 
10
  "lora_alpha": 512,
 
11
  "lora_dropout": 0.0,
12
- "modules_to_save": [],
 
 
13
  "peft_type": "LORA",
14
  "r": 256,
 
15
  "revision": null,
16
- "target_modules": [
17
- "qkv_proj",
18
- "o_proj",
19
- "gate_up_proj",
20
- "down_proj"
21
- ],
22
- "task_type": "CAUSAL_LM"
23
  }
 
1
  {
2
+ "alpha_pattern": {},
3
  "auto_mapping": null,
4
+ "base_model_name_or_path": null,
5
  "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
  "fan_in_fan_out": false,
10
+ "inference_mode": false,
11
  "init_lora_weights": true,
12
+ "layer_replication": null,
13
  "layers_pattern": null,
14
  "layers_to_transform": null,
15
+ "loftq_config": {},
16
  "lora_alpha": 512,
17
+ "lora_bias": false,
18
  "lora_dropout": 0.0,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
  "peft_type": "LORA",
23
  "r": 256,
24
+ "rank_pattern": {},
25
  "revision": null,
26
+ "target_modules": "model.layers.\\d+.((self_attn.(qkv|o)_proj)|(mlp.(gate_up|down)_proj))",
27
+ "task_type": "CAUSAL_LM",
28
+ "trainable_token_indices": null,
29
+ "use_dora": false,
30
+ "use_rslora": false
 
 
31
  }
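The vision adapter config mirrors the speech one, with rank 256, alpha 512, and the same regex over the decoder layers. If the adapters are consumed through PEFT directly (the modelling code may instead wire them up internally), the hyper-parameters can be read straight from the subfolder; a sketch under that assumption, requiring a recent peft:

# Minimal sketch: inspect the vision LoRA hyper-parameters from the `vision-lora` subfolder.
# REPO_ID is an assumption; this only reads the config, it does not attach the adapter.
from peft import PeftConfig

REPO_ID = "microsoft/Phi-4-multimodal-instruct"  # assumption
vision_cfg = PeftConfig.from_pretrained(REPO_ID, subfolder="vision-lora")

print(vision_cfg.r, vision_cfg.lora_alpha)  # expect 256 and 512, as in the config above
print(vision_cfg.target_modules)            # the decoder-layer regex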
vision-lora/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1620b16722edf701038bf66e3cd46412c7cc5458e58df89e9f92cedb71fcbde8
3
- size 738232904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76facf464ca0246e9f5dc409520e83764e0b73fa66fdb561526e064133728f8a
3
+ size 738228552
vocab.json CHANGED
The diff for this file is too large to render. See raw diff