Upload folder using huggingface_hub
- chat_template.jinja +1 -0
- config.json +55 -87
- configuration_phi4_multimodal.py +484 -0
- feature_extraction_phi4_multimodal.py +353 -0
- generation_config.json +3 -4
- image_processing_phi4_multimodal_fast.py +284 -0
- model-00001-of-00002.safetensors +3 -0
- model-00002-of-00002.safetensors +3 -0
- model.safetensors.index.json +0 -0
- modeling_phi4_multimodal.py +0 -0
- preprocessor_config.json +14 -7
- processing_phi4_multimodal.py +541 -0
- special_tokens_map.json +7 -1
- speech-lora/adapter_config.json +18 -10
- speech-lora/adapter_model.safetensors +2 -2
- tokenizer.json +2 -2
- tokenizer_config.json +1 -125
- vision-lora/adapter_config.json +18 -10
- vision-lora/adapter_model.safetensors +2 -2
- vocab.json +0 -0
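The commit was produced with the `huggingface_hub` upload API. As a minimal sketch, the same set of files can be pulled back down with `snapshot_download` (the repo id below is an assumption for illustration):

```python
from huggingface_hub import snapshot_download

# Download a local snapshot of the repository containing the files listed above.
# Replace the repo id with the actual repository this commit belongs to.
local_dir = snapshot_download(repo_id="microsoft/Phi-4-multimodal-instruct")
print(local_dir)
```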
chat_template.jinja
ADDED
@@ -0,0 +1 @@
{% for message in messages %}{{ '<|' + message['role'] + '|>' }}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<|image|>' }}{% elif content['type'] == 'audio' %}{{ '<|audio|>' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% endif %}{{ '<|end|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}
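For reference, a minimal sketch of how this template renders through `apply_chat_template`. The checkpoint id is an assumption, and it relies on recent `transformers` versions picking up `chat_template.jinja` from the repo root; the rendered string shown is what the template above produces for this message list:

```python
from transformers import AutoTokenizer

# Assumes the repo's tokenizer loads chat_template.jinja (recent transformers versions do).
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-multimodal-instruct")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    },
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# -> "<|system|>You are a helpful assistant.<|end|><|user|><|image|>Describe this image.<|end|><|assistant|>"
```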
config.json
CHANGED
@@ -1,82 +1,47 @@
 {
-  "
+  "auto_map": {
+    "AutoConfig": "configuration_phi4_multimodal.Phi4MultimodalConfig",
+    "AutoModelForCausalLM": "modeling_phi4_multimodal.Phi4MultimodalForCausalLM"
+  },
   "architectures": [
-    "
+    "Phi4MultimodalForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
-  "
-  "
-  … (old lines 10-34 not captured in this extraction)
-      "kernel_size": 3,
-      "left_chunk": 18,
-      "linear_units": 1536,
-      "nemo_conv_settings": {
-        "conv_channels": 1024
-      },
-      "num_blocks": 24,
-      "relative_attention_bias_args": {
-        "t5_bias_max_distance": 500,
-        "type": "t5"
-      },
-      "time_reduction": 8
-    },
-    "name": "cascades"
-  },
-  "auto_map": {
-    "AutoConfig": "configuration_phi4mm.Phi4MMConfig",
-    "AutoModelForCausalLM": "modeling_phi4mm.Phi4MMForCausalLM",
-    "AutoTokenizer": "Xenova/gpt-4o"
+  "audio_config": {
+    "activation": "swish",
+    "audio_token_id": 200011,
+    "bias_max_distance": 500,
+    "bias_symmetric": false,
+    "chunk_size": -1,
+    "conv_activation": "swish",
+    "conv_glu_type": "swish",
+    "depthwise_multiplier": 1,
+    "depthwise_seperable_out_channel": 1024,
+    "downsample_rate": 1,
+    "dropout_rate": 0.0,
+    "ext_pw_out_channel": 1024,
+    "feature_layer": -2,
+    "hidden_size": 1024,
+    "initializer_range": 0.02,
+    "input_size": 80,
+    "intermediate_size": 1536,
+    "kernel_size": 3,
+    "left_chunk": 18,
+    "model_type": "phi4_multimodal_audio",
+    "nemo_activation": "relu",
+    "nemo_conv_channels": 1024,
+    "nemo_final_size": 10,
+    "num_attention_heads": 16,
+    "num_blocks": 24,
+    "time_reduction": 8
   },
   "bos_token_id": 199999,
-  "embd_layer": {
-    "audio_embd_layer": {
-      "compression_rate": 8,
-      "downsample_rate": 1,
-      "embedding_cls": "audio",
-      "enable_gradient_checkpointing": true,
-      "projection_cls": "mlp",
-      "use_conv_downsample": false,
-      "use_qformer": false
-    },
-    "embedding_cls": "image_audio",
-    "image_embd_layer": {
-      "crop_size": 448,
-      "embedding_cls": "tune_image",
-      "enable_gradient_checkpointing": true,
-      "hd_transform_order": "sub_glb",
-      "image_token_compression_cls": "avg_pool_2d",
-      "projection_cls": "mlp",
-      "use_hd_transform": true,
-      "with_learnable_separator": true
-    }
-  },
   "embd_pdrop": 0.0,
-  "eos_token_id":
+  "eos_token_id": [
+    199999,
+    200020
+  ],
   "full_attn_mod": 1,
   "hidden_act": "silu",
   "hidden_size": 3072,
@@ -84,21 +49,9 @@
   "intermediate_size": 8192,
   "interpolate_factor": 1,
   "lm_head_bias": false,
-  "vision_lora": {
-    "dp": 0.0,
-    "layer": "layers.*((self_attn\\.(qkv_proj|o_proj))|(mlp\\.(gate_up|down)_proj))",
-    "lora_alpha": 512,
-    "r": 256
-  },
-  "speech_lora": {
-    "dp": 0.01,
-    "layer": "((layers.*self_attn\\.(qkv|o)_proj)|(layers.*mlp\\.(gate_up|down)_proj))",
-    "lora_alpha": 640,
-    "r": 320
-  },
   "max_position_embeddings": 131072,
   "mlp_bias": false,
-  "model_type": "
+  "model_type": "phi4_multimodal",
   "num_attention_heads": 24,
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,
@@ -214,8 +167,23 @@
   "sliding_window": 262144,
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.
+  "transformers_version": "4.52.0.dev0",
   "use_cache": true,
-  "
-
+  "vision_config": {
+    "attention_dropout": 0.0,
+    "crop_size": 448,
+    "feature_layer": -2,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_size": 448,
+    "image_token_id": 200010,
+    "intermediate_size": 4304,
+    "layer_norm_eps": 1e-06,
+    "model_type": "phi4_multimodal_vision",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "patch_size": 14
+  },
+  "vocab_size": 200064
 }
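Since `auto_map` now points at the custom `configuration_phi4_multimodal` module shipped in the repo, loading the config goes through `trust_remote_code`. A small sketch (repo id assumed; printed values follow from the new config.json above):

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("microsoft/Phi-4-multimodal-instruct", trust_remote_code=True)
print(config.model_type)                # "phi4_multimodal"
print(config.vision_config.image_size)  # 448
print(config.audio_config.hidden_size)  # 1024
print(config.eos_token_id)              # [199999, 200020]
```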
configuration_phi4_multimodal.py
ADDED
@@ -0,0 +1,484 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_phi4_multimodal.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2025 Microsoft and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

from transformers.configuration_utils import PretrainedConfig


class Phi4MultimodalVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Phi4MultimodalVisionModel`]. It is used to instantiate a
    Phi4Multimodal vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of
    [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1152):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 4304):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 27):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        image_size (`int`, *optional*, defaults to 448):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        crop_size (`int`, *optional*, defaults to 448):
            Crop size for the input images.
        image_token_id (`int`, *optional*, defaults to 200010):
            The image token id.
        feature_layer (`int`, *optional*, defaults to -2):
            The index of the layer of the encoder from which to extract image features.

    Example:

    ```python
    >>> from transformers import Phi4MultimodalVisionConfig

    >>> # Initializing a Phi4MultimodalVisionConfig with microsoft/Phi-4-multimodal-instruct style configuration
    >>> configuration = Phi4MultimodalVisionConfig()
    ```"""

    model_type = "phi4_multimodal_vision"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=1152,
        intermediate_size=4304,
        num_hidden_layers=27,
        num_attention_heads=16,
        num_channels=3,
        image_size=448,
        patch_size=14,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        crop_size: int = 448,
        image_token_id: int = 200010,
        feature_layer: int = -2,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.crop_size = crop_size
        self.image_token_id = image_token_id
        self.feature_layer = feature_layer


class Phi4MultimodalAudioConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Phi4MultimodalAudioModel`]. It is used to instantiate a
    Phi4Multimodal audio encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the audio encoder of
    [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers.
        intermediate_size (`int`, *optional*, defaults to 1536):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_blocks (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        activation (`str`, *optional*, defaults to `"swish"`):
            The non-linear activation function in the MLPs.
        chunk_size (`int`, *optional*, defaults to -1):
            The chunk size to create the masks.
        left_chunk (`int`, *optional*, defaults to 18):
            The left chunk to create the masks.
        dropout_rate (`float`, *optional*, defaults to 0.0):
            The dropout ratio.
        ext_pw_out_channel (`int`, *optional*, defaults to 1024):
            Number of out channels in the point-wise conv modules.
        depthwise_seperable_out_channel (`int`, *optional*, defaults to 1024):
            Number of out channels in the depth-wise separable conv modules.
        depthwise_multiplier (`int`, *optional*, defaults to 1):
            Input size multiplier for the depth-wise separable conv modules.
        kernel_size (`int`, *optional*, defaults to 3):
            Kernel size for the depth-wise separable conv modules.
        conv_activation (`str`, *optional*, defaults to `"swish"`):
            The non-linear activation function in the conv modules.
        input_size (`int`, *optional*, defaults to 80):
            Input size for the audio model.
        conv_glu_type (`str`, *optional*, defaults to `"swish"`):
            The non-linear activation function in the point-wise conv modules.
        time_reduction (`int`, *optional*, defaults to 8):
            Time reduction (subsampling factor).
        bias_max_distance (`int`, *optional*, defaults to 1000):
            Max distance for the relative attention bias module.
        bias_symmetric (`bool`, *optional*, defaults to `False`):
            Whether the relative attention bias should be symmetric or not.
        nemo_activation (`str`, *optional*, defaults to `"relu"`):
            The non-linear activation function in the nemo conv modules.
        nemo_conv_channels (`int`, *optional*, defaults to 1024):
            Number of channels in the nemo conv modules.
        downsample_rate (`int`, *optional*, defaults to 1):
            Downsample rate for the audio feature extractor.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        audio_token_id (`int`, *optional*, defaults to 200011):
            The audio token id.
        feature_layer (`int`, *optional*, defaults to -2):
            The index of the layer of the encoder from which to extract audio features.

    Example:

    ```python
    >>> from transformers import Phi4MultimodalAudioConfig

    >>> # Initializing a Phi4MultimodalAudioConfig with microsoft/Phi-4-multimodal-instruct style configuration
    >>> configuration = Phi4MultimodalAudioConfig()
    ```"""

    model_type = "phi4_multimodal_audio"

    def __init__(
        self,
        hidden_size: int = 1024,
        intermediate_size: int = 1536,
        num_blocks: int = 24,
        num_attention_heads: int = 16,
        activation: str = "swish",
        chunk_size: int = -1,
        left_chunk: int = 18,
        dropout_rate: float = 0.0,
        ext_pw_out_channel: int = 1024,
        depthwise_seperable_out_channel: int = 1024,
        depthwise_multiplier: int = 1,
        kernel_size: int = 3,
        conv_activation: str = "swish",
        input_size: int = 80,
        conv_glu_type: str = "swish",
        time_reduction: int = 8,
        bias_max_distance: int = 1000,
        bias_symmetric: bool = False,
        nemo_activation: str = "relu",
        nemo_conv_channels: int = 1024,
        downsample_rate: int = 1,
        initializer_range: float = 0.02,
        audio_token_id: int = 200011,
        feature_layer: int = -2,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.activation = activation
        self.chunk_size = chunk_size
        self.left_chunk = left_chunk
        self.num_blocks = num_blocks
        self.dropout_rate = dropout_rate
        self.ext_pw_out_channel = ext_pw_out_channel
        self.depthwise_seperable_out_channel = depthwise_seperable_out_channel
        self.depthwise_multiplier = depthwise_multiplier
        self.kernel_size = kernel_size
        self.conv_activation = conv_activation
        self.input_size = input_size
        self.conv_glu_type = conv_glu_type
        self.time_reduction = time_reduction
        self.bias_max_distance = bias_max_distance
        self.bias_symmetric = bias_symmetric
        self.nemo_activation = nemo_activation
        self.nemo_conv_channels = nemo_conv_channels
        self.downsample_rate = downsample_rate
        self.audio_token_id = audio_token_id
        self.initializer_range = initializer_range
        self.feature_layer = feature_layer

        if time_reduction % 2 != 0:
            raise ValueError("`time_reduction` should be a multiple of 2!")
        length = input_size
        for _ in range(int(math.log(time_reduction, 2))):
            length = math.floor((length - 1) / 2 + 1)
        self.nemo_final_size = length


class Phi4MultimodalConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Phi4MultimodalModel`]. It is used to instantiate a
    Phi4Multimodal model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the
    [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 200064):
            Vocabulary size of the Phi-3 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Phi3Model`].
        hidden_size (`int`, *optional*, defaults to 3072):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            Dropout probability for mlp outputs.
        embd_pdrop (`int`, *optional*, defaults to 0.0):
            The dropout ratio for the embeddings.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio after computing the attention scores.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon value used for the RMSNorm.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`dict`, *optional*):
            The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
            contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and
            the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
            divided by the number of attention heads divided by 2.
        partial_rotary_factor (`float`, *optional*, defaults to `1.0`):
            Percentage of the query and keys which will have rotary embedding. Must be between 0.0 and 1.0.
        bos_token_id (`int`, *optional*, defaults to 199999):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int` or `list[int]`, *optional*, defaults to `[199999, 200020]`):
            The id of the "end-of-sequence" token.
        pad_token_id (`int`, *optional*, defaults to 199999):
            The id of the padding token.
        original_max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model was trained with. This is used to determine the size of the
            original RoPE embeddings when using long scaling.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If `None`, no sliding window is applied.
        vision_config (`Phi4MultimodalVisionConfig` or `dict`, *optional*):
            The vision config for the underlying image embedding model. If not provided, will default to the configuration
            used to instantiate a model similar in architecture as
            [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct).
        audio_config (`Phi4MultimodalAudioConfig` or `dict`, *optional*):
            The audio config for the underlying audio embedding model. If not provided, will default to the configuration
            used to instantiate a model similar in architecture as
            [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct).

    Example:

    ```python
    >>> from transformers import Phi4MultimodalModel, Phi4MultimodalConfig

    >>> # Initializing a Phi4Multimodal style configuration
    >>> configuration = Phi4MultimodalConfig.from_pretrained("microsoft/Phi-4-multimodal-instruct")

    >>> # Initializing a model from the configuration
    >>> model = Phi4MultimodalModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "phi4_multimodal"
    keys_to_ignore_at_inference = ["past_key_values"]
    base_model_tp_plan = {
        "layers.*.self_attn.qkv_proj": "colwise_rep",  # we need to replicate here due to the slicing of qkv
        "layers.*.self_attn.o_proj": "rowwise_rep",  # we need to replicate here due to the slicing of qkv
        "layers.*.mlp.gate_up_proj": "colwise_rep",  # we need to replicate here due to the `chunk` operation
        "layers.*.mlp.down_proj": "rowwise_rep",  # we need to replicate here due to the `chunk` operation
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    sub_configs = {"audio_config": Phi4MultimodalAudioConfig, "vision_config": Phi4MultimodalVisionConfig}

    def __init__(
        self,
        vocab_size=200064,
        hidden_size=3072,
        intermediate_size=8192,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=8,
        resid_pdrop=0.0,
        embd_pdrop=0.0,
        attention_dropout=0.0,
        hidden_act="silu",
        max_position_embeddings=131072,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=1,
        bos_token_id=199999,
        eos_token_id=[199999, 200020],
        pad_token_id=199999,
        original_max_position_embeddings=4096,
        sliding_window=None,
        vision_config=None,
        audio_config=None,
        **kwargs,
    ):
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attention_dropout = attention_dropout
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.original_max_position_embeddings = original_max_position_embeddings
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.partial_rotary_factor = partial_rotary_factor
        self._rope_scaling_adjustment()
        self._rope_scaling_validation()
        self.sliding_window = sliding_window

        if isinstance(vision_config, dict):
            vision_config = Phi4MultimodalVisionConfig(**vision_config)
        elif vision_config is None:
            vision_config = Phi4MultimodalVisionConfig()
        self.vision_config = vision_config

        if isinstance(audio_config, dict):
            audio_config = Phi4MultimodalAudioConfig(**audio_config)
        elif audio_config is None:
            audio_config = Phi4MultimodalAudioConfig()
        self.audio_config = audio_config

    def _rope_scaling_adjustment(self):
        """
        Adjust the `type` of the `rope_scaling` configuration for backward compatibility.
        """
        if self.rope_scaling is None:
            return

        rope_scaling_type = self.rope_scaling.get("type", None)

        # For backward compatibility if previous version used "su" or "yarn"
        if rope_scaling_type is not None and rope_scaling_type in ["su", "yarn"]:
            self.rope_scaling["type"] = "longrope"

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
            raise ValueError(
                "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
                f"got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
        rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
        if rope_scaling_type is None or rope_scaling_type not in ["longrope"]:
            raise ValueError(f"`rope_scaling`'s type field must be one of ['longrope'], got {rope_scaling_type}")
        if not (
            isinstance(rope_scaling_short_factor, list)
            and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
        ):
            raise ValueError(
                f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
            )
        rotary_ndims = int(self.hidden_size // self.num_attention_heads * self.partial_rotary_factor)
        if not len(rope_scaling_short_factor) == rotary_ndims // 2:
            raise ValueError(
                f"`rope_scaling`'s short_factor field must have length {rotary_ndims // 2}, got {len(rope_scaling_short_factor)}"
            )
        if not (
            isinstance(rope_scaling_long_factor, list)
            and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
        ):
            raise ValueError(
                f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
            )
        if not len(rope_scaling_long_factor) == rotary_ndims // 2:
            raise ValueError(
                f"`rope_scaling`'s long_factor field must have length {rotary_ndims // 2}, got {len(rope_scaling_long_factor)}"
            )


__all__ = ["Phi4MultimodalVisionConfig", "Phi4MultimodalAudioConfig", "Phi4MultimodalConfig"]

Phi4MultimodalConfig.register_for_auto_class()
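The `nemo_final_size` value stored in config.json (10) follows directly from the loop at the end of `Phi4MultimodalAudioConfig.__init__`; a worked check with the defaults `input_size=80` and `time_reduction=8`:

```python
import math

# Mirrors the nemo_final_size computation in Phi4MultimodalAudioConfig.__init__ above.
input_size, time_reduction = 80, 8
length = input_size
for _ in range(int(math.log(time_reduction, 2))):  # 3 halvings for time_reduction=8
    length = math.floor((length - 1) / 2 + 1)      # 80 -> 40 -> 20 -> 10
print(length)  # 10, matching "nemo_final_size": 10 in config.json
```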
feature_extraction_phi4_multimodal.py
ADDED
@@ -0,0 +1,353 @@
# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Processor class for Phi4Multimodal
"""

from typing import Optional, Union, List, Tuple

import numpy as np

from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
from transformers.image_processing_utils import BatchFeature
from transformers.utils import TensorType, is_torch_available, logging


if is_torch_available():
    import torch


logger = logging.get_logger(__name__)

AudioInput = Union[
    np.ndarray, "torch.Tensor", List[np.ndarray], Tuple[np.ndarray], List["torch.Tensor"], Tuple["torch.Tensor"]  # noqa: F821
]


# TODO: @eustlb, remove this once #36603 is merged.
def speechlib_mel(sample_rate, n_fft, n_mels, fmin=None, fmax=None):
    """Create a Mel filter-bank the same as SpeechLib FbankFC.

    Args:
        sample_rate (int): Sample rate in Hz. number > 0 [scalar]
        n_fft (int): FFT size. int > 0 [scalar]
        n_mel (int): Mel filter size. int > 0 [scalar]
        fmin (float): lowest frequency (in Hz). If None use 0.0.
            float >= 0 [scalar]
        fmax: highest frequency (in Hz). If None use sample_rate / 2.
            float >= 0 [scalar]

    Returns
        out (numpy.ndarray): Mel transform matrix
            [shape=(n_mels, 1 + n_fft/2)]
    """

    bank_width = int(n_fft // 2 + 1)
    if fmax is None:
        fmax = sample_rate / 2
    if fmin is None:
        fmin = 0
    assert fmin >= 0, "fmin cannot be negative"
    assert fmin < fmax <= sample_rate / 2, "fmax must be between (fmin, samplerate / 2]"

    def mel(f):
        return 1127.0 * np.log(1.0 + f / 700.0)

    def bin2mel(fft_bin):
        return 1127.0 * np.log(1.0 + fft_bin * sample_rate / (n_fft * 700.0))

    def f2bin(f):
        return int((f * n_fft / sample_rate) + 0.5)

    # Spec 1: FFT bin range [f2bin(fmin) + 1, f2bin(fmax) - 1]
    klo = f2bin(fmin) + 1
    khi = f2bin(fmax)

    khi = max(khi, klo)

    # Spec 2: SpeechLib uses triangles in Mel space
    mlo = mel(fmin)
    mhi = mel(fmax)
    m_centers = np.linspace(mlo, mhi, n_mels + 2)
    ms = (mhi - mlo) / (n_mels + 1)

    matrix = np.zeros((n_mels, bank_width), dtype=np.float32)
    for m in range(0, n_mels):
        left = m_centers[m]
        center = m_centers[m + 1]
        right = m_centers[m + 2]
        for fft_bin in range(klo, khi):
            mbin = bin2mel(fft_bin)
            if left < mbin < right:
                matrix[m, fft_bin] = 1.0 - abs(center - mbin) / ms

    return matrix


class Phi4MultimodalFeatureExtractor(SequenceFeatureExtractor):
    model_input_names = ["audio_input_features", "audio_embed_sizes", "audio_attention_mask"]

    def __init__(
        self,
        feature_size: int = 80,
        sampling_rate: int = 16000,
        hop_length: int = 160,
        n_fft: int = 512,
        win_length: int = 400,
        preemphasis: float = 0.97,
        padding_value: float = 0.0,
        audio_compression_rate: int = 8,
        audio_downsample_rate: int = 1,
        audio_feat_stride: int = 1,
        mel_min_frequency: float = 0,
        mel_max_frequency: float = 7690,
        **kwargs,
    ):
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)

        self.hop_length = hop_length
        self.n_fft = n_fft
        self.win_length = win_length
        self.preemphasis = preemphasis
        self.padding_value = padding_value
        self.audio_compression_rate = audio_compression_rate
        self.audio_downsample_rate = audio_downsample_rate
        self.audio_feat_stride = audio_feat_stride

        # TODO: @eustlb, uncomment and remove speechlib_mel once #36603 is merged.
        # self.mel_filters = mel_filter_bank(
        #     num_frequency_bins=self.n_fft // 2 + 1,
        #     num_mel_filters=self.feature_size,
        #     min_frequency=mel_min_frequency,
        #     max_frequency=mel_max_frequency,
        #     sampling_rate=self.sampling_rate,
        #     triangularize_in_mel_space=True,
        #     mel_scale="kaldi",
        # )
        self.mel_filters = speechlib_mel(
            self.sampling_rate, self.n_fft, self.feature_size, mel_min_frequency, mel_max_frequency
        ).T

    def __call__(
        self,
        raw_speech: AudioInput,
        sampling_rate: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        padding: Optional[str] = "longest",
        max_length: Optional[int] = None,
        truncation: bool = False,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_attention_mask: Optional[bool] = True,
        device: Optional[str] = "cpu",
        **kwargs,
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model one or several audio sequence(s). Implementation uses PyTorch for
        the STFT computation if available, otherwise a slower NumPy based one.

        Args:
            raw_speech (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The sequence or batch of sequences to be processed. Each sequence can be a numpy array or PyTorch tensor.
                For batched inputs, sequences can be a list of numpy arrays or PyTorch tensors, or a single numpy array or
                PyTorch tensor with first dimension being the batch size.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors.
            pad_to_multiple_of (`int`, *optional*, defaults to None):
                If set will pad the sequence to a multiple of the provided value.
            padding (`str`, *optional*, defaults to "longest"):
                Padding strategy. Can be "longest" to pad to the longest sequence in the batch, or a specific length.
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length.
            truncation (`bool`, *optional*, defaults to False):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of numpy arrays. Acceptable values are:
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
                - `'tf'`: Return TensorFlow `tf.constant` objects.
            return_attention_mask (`bool`, *optional*, defaults to `True`):
                Whether to return the extracted audio input features' attention mask.
            device (`str`, *optional*, defaults to "cpu"):
                Specifies the device for computation of the audio features. (e.g., "cpu", "cuda")

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
            - **audio_input_features** -- Audio features extracted from the raw audio input, shape (batch_size, max_feature_length, feature_size).
            - **audio_lengths** -- Length of each audio sample in the batch, shape (batch_size,).
            - **audio_attention_mask** -- Attention mask for the audio input, shape (batch_size, max_feature_length).
            If `return_tensors` is not specified, the fields will be PyTorch tensors if PyTorch is available, otherwise NumPy arrays.
        """
        if sampling_rate is not None:
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
                    f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
                    f" was sampled with {self.sampling_rate} and not {sampling_rate}."
                )
        else:
            logger.warning(
                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )

        # Convert to torch tensor
        if isinstance(raw_speech, np.ndarray):
            raw_speech = torch.tensor(raw_speech)
        elif isinstance(raw_speech, (list, tuple)) and isinstance(raw_speech[0], np.ndarray):
            raw_speech = [torch.tensor(speech) for speech in raw_speech]

        is_batched_torch = isinstance(raw_speech, torch.Tensor) and len(raw_speech.shape) > 1
        if is_batched_torch and len(raw_speech.shape) > 2:
            logger.warning(
                f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                "We will take the mean of the channels to convert to mono."
            )
            raw_speech = raw_speech.mean(-1)

        is_batched_sequence = isinstance(raw_speech, (list, tuple))
        if is_batched_sequence:
            for speech in raw_speech:
                if len(speech.shape) > 1:
                    logger.warning(
                        f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                        "We will take the mean of the channels to convert to mono."
                    )
                    speech = speech.mean(-1)

        if is_batched_torch or is_batched_sequence:
            raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]
        else:
            raw_speech = [raw_speech[:, None].to(torch.float32)]

        audio_lengths = [len(speech) for speech in raw_speech]

        # convert into correct format for padding
        batched_speech = BatchFeature(data={"audio_input_features": raw_speech, "audio_lengths": audio_lengths})
        padded_inputs = self.pad(
            batched_speech,
            padding=padding,
            max_length=max_length,
            truncation=truncation,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors="pt",
        )
        input_features = padded_inputs.audio_input_features.squeeze(-1)
        audio_lengths = padded_inputs.audio_lengths

        input_features = self._torch_extract_fbank_features(input_features, audio_lengths, device)

        feature_lengths = (audio_lengths - self.win_length) // self.hop_length + 1
        feature_lengths = feature_lengths * self.audio_feat_stride
        audio_embed_sizes = self._compute_audio_embed_size(feature_lengths)

        feature_attention_mask = (
            torch.arange(0, feature_lengths.max()) if is_torch_available() else np.arange(0, feature_lengths.max())
        )
        feature_attention_mask = (
            feature_attention_mask[None, :] < feature_lengths[:, None] if len(feature_lengths) > 1 else None
        )

        data = {
            "audio_input_features": input_features,
            "audio_embed_sizes": audio_embed_sizes,
        }
        if feature_attention_mask is not None and return_attention_mask:
            data["audio_attention_mask"] = feature_attention_mask

        return BatchFeature(data=data, tensor_type=return_tensors)

    # TODO: @eustlb, move this to audio_utils in a general spectrogram_batch function that handles torch and numpy
    def _torch_extract_fbank_features(
        self, waveform: "torch.FloatTensor", audio_lengths: "torch.Tensor", device: str = "cpu"
    ) -> "torch.FloatTensor":
        """
        Compute the log mel-scaled spectrogram of batched waveforms using PyTorch's FFT implementation.

        Args:
            waveform (`torch.FloatTensor` of shape `(batch_size, max_audio_length)`):
                The batched waveforms.
            audio_lengths (`torch.Tensor` of shape `(batch_size,)`):
                The lengths of the waveforms along the max_audio_length dimension.
            device (`str`, *optional*, defaults to "cpu"):
                The device to run the computation on. (e.g., "cpu", "cuda")

        Returns:
            `torch.FloatTensor` of shape `(batch_size, max_feature_length, feature_size)`:
                The log mel-scaled spectrogram of the batched waveforms.
        """
        fft_window = torch.hamming_window(self.win_length, periodic=False, device=device, dtype=torch.float64)

        # batched implementation
        batch_size = waveform.shape[0]
        frames = waveform.unfold(-1, self.win_length, self.hop_length)

        # ---
        # the unbatched (and unpadded) original implementation skips last few audio values that can't be included in a frame
        # we need to ensure that the corresponding frames for the padded input also mask these values
        if batch_size > 1:
            frames = frames.clone()
            # concerned batch indices
            to_mask_batch_idxs = torch.arange(batch_size)[audio_lengths != audio_lengths.max()]
            if to_mask_batch_idxs.numel() > 0:
                batch_idxs_down = (audio_lengths[to_mask_batch_idxs] - self.win_length) // self.hop_length + 1
                batch_idxs_up = audio_lengths[to_mask_batch_idxs] // self.hop_length + 1
                offset_idx = batch_idxs_down.min()
                max_idx = batch_idxs_up.max()

                mask = torch.arange(max_idx - offset_idx, device=device).expand(to_mask_batch_idxs.shape[0], -1)
                mask = ((batch_idxs_down - offset_idx).unsqueeze(1) <= mask) & (
                    mask < (batch_idxs_up - offset_idx).unsqueeze(1)
                )
                mask = mask.unsqueeze(-1).expand(-1, -1, self.win_length)
                masked_frames = frames[to_mask_batch_idxs, offset_idx:max_idx].masked_fill_(mask, 0)
                frames[to_mask_batch_idxs, offset_idx:max_idx] = masked_frames
        # ---

        # apply pre-emphasis first order filter on fft windows
        frames_prev = torch.roll(frames, 1, dims=-1)
        frames_prev[:, :, 0] = frames_prev[:, :, 1]
        frames = (frames - self.preemphasis * frames_prev) * 32768

        # apply fft
        S = torch.fft.rfft(fft_window * frames.view(-1, self.win_length), n=self.n_fft, dim=1)
        S = S.view(frames.shape[0], -1, S.shape[-1])
        S = S.to(torch.complex64)

        spec = torch.abs(S)
        spec_power = spec**2

        # apply triangular mel filter bank
        mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32)
        log_spec = torch.clamp(spec_power @ mel_filters, min=1.0)
        log_spec = torch.log(log_spec)

        return log_spec

    def _compute_audio_embed_size(self, audio_frames):
        integer = audio_frames // self.audio_compression_rate
        remainder = audio_frames % self.audio_compression_rate
        result = integer + (remainder > 0).to(integer.dtype)

        integer = result // self.audio_downsample_rate
        remainder = result % self.audio_downsample_rate
        result = integer + (remainder > 0).to(integer.dtype)  # qformer compression

        return result


__all__ = ["Phi4MultimodalFeatureExtractor"]

Phi4MultimodalFeatureExtractor.register_for_auto_class()
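A minimal usage sketch for the feature extractor above. The repo id and `trust_remote_code` wiring are assumptions; the shapes follow from `win_length=400`, `hop_length=160`, `feature_size=80` and the division by `audio_compression_rate=8` in `_compute_audio_embed_size`:

```python
import numpy as np
from transformers import AutoFeatureExtractor

# Assumes the repo's preprocessor_config.json maps to the Phi4MultimodalFeatureExtractor above.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
)

waveform = np.zeros(16000, dtype=np.float32)  # 1 s of 16 kHz mono audio
features = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
print(features["audio_input_features"].shape)  # (1, 98, 80): (16000 - 400) // 160 + 1 = 98 frames
print(features["audio_embed_sizes"])           # tensor([13]): ceil(98 / 8) audio placeholder tokens
```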
generation_config.json
CHANGED
@@ -2,10 +2,9 @@
   "_from_model_config": true,
   "bos_token_id": 199999,
   "eos_token_id": [
-  … (2 deleted lines not captured in this extraction)
+    199999,
+    200020
   ],
   "pad_token_id": 199999,
-  "transformers_version": "4.
-  "use_cache": true
+  "transformers_version": "4.52.0.dev0"
 }
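With two ids in `eos_token_id`, generation stops at whichever end token is produced first. A quick way to inspect the shipped defaults (repo id assumed for illustration):

```python
from transformers import GenerationConfig

generation_config = GenerationConfig.from_pretrained("microsoft/Phi-4-multimodal-instruct")
print(generation_config.eos_token_id)  # [199999, 200020]
print(generation_config.bos_token_id)  # 199999
```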
image_processing_phi4_multimodal_fast.py
ADDED
@@ -0,0 +1,284 @@
# Copyright 2025 Microsoft and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Processor class for Phi4Multimodal
"""

import math
from typing import List, Optional, Union, TypedDict

import torch
from torchvision.transforms import functional as F

from transformers.image_processing_utils_fast import (
    BaseImageProcessorFast,
    BatchFeature,
    Unpack,
    convert_to_rgb,
    ChannelDimension,
)
from transformers.image_utils import ImageInput, make_flat_list_of_images, valid_images
from transformers.utils import TensorType, logging


logger = logging.get_logger(__name__)


class DefaultFastImageProcessorKwargs(TypedDict, total=False):
    do_resize: Optional[bool]
    size: Optional[dict[str, int]]
    default_to_square: Optional[bool]
    resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]]
    do_center_crop: Optional[bool]
    crop_size: Optional[dict[str, int]]
    do_rescale: Optional[bool]
    rescale_factor: Optional[Union[int, float]]
    do_normalize: Optional[bool]
    image_mean: Optional[Union[float, list[float]]]
    image_std: Optional[Union[float, list[float]]]
    do_convert_rgb: Optional[bool]
    return_tensors: Optional[Union[str, TensorType]]
    data_format: Optional[ChannelDimension]
    input_data_format: Optional[Union[str, ChannelDimension]]
    device: Optional["torch.device"]


class Phi4MultimodalFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    image_size: Optional[int]
    patch_size: Optional[int]
    dynamic_hd: Optional[int]


class Phi4MultimodalImageProcessorFast(BaseImageProcessorFast):
    r"""
    Constructs a Phi4Multimodal image processor.
    """

    image_size = 448
    patch_size = 14
    dynamic_hd = 36
    image_mean = [0.5, 0.5, 0.5]
    image_std = [0.5, 0.5, 0.5]
    valid_init_kwargs = Phi4MultimodalFastImageProcessorKwargs
    model_input_names = ["image_pixel_values", "image_sizes", "image_attention_mask"]

    def __init__(self, **kwargs: Unpack[Phi4MultimodalFastImageProcessorKwargs]):
        super().__init__(**kwargs)

    def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height):
        best_ratio_diff = float("inf")
        best_ratio = (1, 1)
        area = width * height
        for ratio in target_ratios:
            target_aspect_ratio = ratio[0] / ratio[1]
            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
            if ratio_diff < best_ratio_diff:
                best_ratio_diff = ratio_diff
                best_ratio = ratio
            elif ratio_diff == best_ratio_diff:
                if area > 0.5 * self.image_size * self.image_size * ratio[0] * ratio[1]:
                    best_ratio = ratio
        return best_ratio

    def dynamic_preprocess(self, image, max_num=36, min_num=1):
        image_size = self.image_size
        patch_size = self.patch_size
        mask_size = image_size // patch_size
        orig_width, orig_height = image.size

        w_crop_num = math.ceil(orig_width / float(image_size))
        h_crop_num = math.ceil(orig_height / float(image_size))
        if w_crop_num * h_crop_num > max_num:
            aspect_ratio = orig_width / orig_height

            # calculate the existing image aspect ratio
            target_ratios = {
                (i, j)
                for n in range(min_num, max_num + 1)
                for i in range(1, n + 1)
                for j in range(1, n + 1)
                if i * j <= max_num and i * j >= min_num
            }
            target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

            # find the closest aspect ratio to the target
            target_aspect_ratio = self.find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height)

            # calculate the target width and height
            target_width = image_size * target_aspect_ratio[0]
            target_height = image_size * target_aspect_ratio[1]
        else:
            target_width = image_size * w_crop_num
            target_height = image_size * h_crop_num
            target_aspect_ratio = (w_crop_num, h_crop_num)

        # Calculate the ratio
        ratio_width = target_width / orig_width
        ratio_height = target_height / orig_height
        if ratio_width < ratio_height:
            new_size = (target_width, int(orig_height * ratio_width))
            padding_width = 0
            padding_height = target_height - int(orig_height * ratio_width)
        else:
            new_size = (int(orig_width * ratio_height), target_height)
            padding_width = target_width - int(orig_width * ratio_height)
            padding_height = 0

        attention_mask = torch.ones((int(mask_size * target_aspect_ratio[1]), int(mask_size * target_aspect_ratio[0])))
        if padding_width >= patch_size:
            attention_mask[:, -math.floor(padding_width / patch_size) :] = 0
        if padding_height >= patch_size:
            attention_mask[-math.floor(padding_height / patch_size) :, :] = 0

        if min(new_size[1], target_height) < 10 or min(new_size[0], target_width) < 10:
            raise ValueError(f"the aspect ratio is very extreme {new_size}")

        image = F.resize(image, [new_size[1], new_size[0]])
        resized_img = F.pad(image, [0, 0, padding_width, padding_height], fill=[255, 255, 255])

        return resized_img, attention_mask
|
152 |
+
|
153 |
+
def pad_to_max_num_crops(self, images, max_crops=5):
|
154 |
+
"""
|
155 |
+
images: B x 3 x H x W, B<=max_crops
|
156 |
+
"""
|
157 |
+
B, _, H, W = images.shape
|
158 |
+
if B < max_crops:
|
159 |
+
pad = torch.zeros(max_crops - B, 3, H, W, dtype=images.dtype, device=images.device)
|
160 |
+
images = torch.cat([images, pad], dim=0)
|
161 |
+
return images
|
162 |
+
|
163 |
+
def pad_mask_to_max_num_crops(self, masks, max_crops=5):
|
164 |
+
B, H, W = masks.shape
|
165 |
+
if B < max_crops:
|
166 |
+
pad = torch.ones(max_crops - B, H, W, dtype=masks.dtype, device=masks.device)
|
167 |
+
masks = torch.cat([masks, pad], dim=0)
|
168 |
+
return masks
|
169 |
+
|
170 |
+
def preprocess(
|
171 |
+
self,
|
172 |
+
images: ImageInput,
|
173 |
+
image_mean: Optional[Union[float, List[float]]] = None,
|
174 |
+
image_std: Optional[Union[float, List[float]]] = None,
|
175 |
+
return_tensors: Optional[Union[str, TensorType]] = None,
|
176 |
+
):
|
177 |
+
"""
|
178 |
+
Args:
|
179 |
+
images (`ImageInput`):
|
180 |
+
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
|
181 |
+
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
|
182 |
+
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
|
183 |
+
Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
|
184 |
+
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
|
185 |
+
Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
|
186 |
+
return_tensors (`str` or `TensorType`, *optional*):
|
187 |
+
The type of tensors to return. Can be one of:
|
188 |
+
- Unset: Return a list of `np.ndarray`.
|
189 |
+
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
|
190 |
+
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
|
191 |
+
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
|
192 |
+
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
|
193 |
+
"""
|
194 |
+
image_mean = image_mean if image_mean is not None else self.image_mean
|
195 |
+
image_std = image_std if image_std is not None else self.image_std
|
196 |
+
|
197 |
+
images = make_flat_list_of_images(images)
|
198 |
+
if not valid_images(images):
|
199 |
+
raise ValueError(
|
200 |
+
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
|
201 |
+
"torch.Tensor, tf.Tensor or jax.ndarray."
|
202 |
+
)
|
203 |
+
images = [convert_to_rgb(image) for image in images]
|
204 |
+
|
205 |
+
image_size = self.image_size
|
206 |
+
patch_size = self.patch_size
|
207 |
+
mask_size = image_size // patch_size
|
208 |
+
imgs_and_masks = [self.dynamic_preprocess(image, max_num=self.dynamic_hd) for image in images]
|
209 |
+
images, image_attention_masks = [x[0] for x in imgs_and_masks], [x[1] for x in imgs_and_masks]
|
210 |
+
|
211 |
+
images = [F.to_tensor(image) for image in images]
|
212 |
+
hd_images = [F.normalize(image, image_mean, image_std) for image in images]
|
213 |
+
global_image = [
|
214 |
+
torch.nn.functional.interpolate(
|
215 |
+
image.unsqueeze(0).float(),
|
216 |
+
size=(image_size, image_size),
|
217 |
+
mode="bicubic",
|
218 |
+
).to(image.dtype)
|
219 |
+
for image in hd_images
|
220 |
+
]
|
221 |
+
|
222 |
+
shapes = [[image.size(1), image.size(2)] for image in hd_images]
|
223 |
+
mask_shapes = [[mask.size(0), mask.size(1)] for mask in image_attention_masks]
|
224 |
+
global_attention_mask = [torch.ones((1, mask_size, mask_size)) for _ in hd_images]
|
225 |
+
|
226 |
+
hd_images_reshape = []
|
227 |
+
for im, (h, w) in zip(hd_images, shapes):
|
228 |
+
im = im.reshape(1, 3, h // image_size, image_size, w // image_size, image_size)
|
229 |
+
im = im.permute(0, 2, 4, 1, 3, 5)
|
230 |
+
im = im.reshape(-1, 3, image_size, image_size)
|
231 |
+
hd_images_reshape.append(im.contiguous())
|
232 |
+
|
233 |
+
attention_masks_reshape = []
|
234 |
+
for mask, (h, w) in zip(image_attention_masks, mask_shapes):
|
235 |
+
mask = mask.reshape(h // mask_size, mask_size, w // mask_size, mask_size)
|
236 |
+
mask = mask.transpose(1, 2)
|
237 |
+
mask = mask.reshape(-1, mask_size, mask_size)
|
238 |
+
attention_masks_reshape.append(mask.contiguous())
|
239 |
+
|
240 |
+
downsample_attention_masks = []
|
241 |
+
for mask, (h, w) in zip(attention_masks_reshape, mask_shapes):
|
242 |
+
mask = mask[:, 0::2, 0::2]
|
243 |
+
mask = mask.reshape(
|
244 |
+
h // mask_size, w // mask_size, mask_size // 2 + mask_size % 2, mask_size // 2 + mask_size % 2
|
245 |
+
)
|
246 |
+
mask = mask.transpose(1, 2)
|
247 |
+
mask = mask.reshape(mask.size(0) * mask.size(1), mask.size(2) * mask.size(3))
|
248 |
+
downsample_attention_masks.append(mask)
|
249 |
+
|
250 |
+
num_img_tokens = [
|
251 |
+
256 + 1 + int(mask.sum().item()) + int(mask[:, 0].sum().item()) + 16 for mask in downsample_attention_masks
|
252 |
+
]
|
253 |
+
|
254 |
+
hd_images_reshape = [
|
255 |
+
torch.cat([_global_image] + [_im], dim=0) for _global_image, _im in zip(global_image, hd_images_reshape)
|
256 |
+
]
|
257 |
+
hd_masks_reshape = [
|
258 |
+
torch.cat([_global_mask] + [_mask], dim=0)
|
259 |
+
for _global_mask, _mask in zip(global_attention_mask, attention_masks_reshape)
|
260 |
+
]
|
261 |
+
max_crops = max([img.size(0) for img in hd_images_reshape])
|
262 |
+
image_transformed = [self.pad_to_max_num_crops(im, max_crops) for im in hd_images_reshape]
|
263 |
+
image_transformed = torch.stack(image_transformed, dim=0)
|
264 |
+
mask_transformed = [self.pad_mask_to_max_num_crops(mask, max_crops) for mask in hd_masks_reshape]
|
265 |
+
mask_transformed = torch.stack(mask_transformed, dim=0)
|
266 |
+
|
267 |
+
returned_input_image_embeds = image_transformed
|
268 |
+
returned_image_sizes = torch.tensor(shapes, dtype=torch.long)
|
269 |
+
returned_image_attention_mask = mask_transformed
|
270 |
+
returned_num_img_tokens = num_img_tokens
|
271 |
+
|
272 |
+
data = {
|
273 |
+
"image_pixel_values": returned_input_image_embeds,
|
274 |
+
"image_sizes": returned_image_sizes,
|
275 |
+
"image_attention_mask": returned_image_attention_mask,
|
276 |
+
"num_img_tokens": returned_num_img_tokens,
|
277 |
+
}
|
278 |
+
|
279 |
+
return BatchFeature(data=data, tensor_type=return_tensors)
|
280 |
+
|
281 |
+
|
282 |
+
__all__ = ["Phi4MultimodalImageProcessorFast"]
|
283 |
+
|
284 |
+
Phi4MultimodalImageProcessorFast.register_for_auto_class()
|
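A minimal usage sketch for the fast image processor added above. The repo id and image path are placeholders, and loading through `AutoImageProcessor` relies on the `auto_map` entry in preprocessor_config.json together with `trust_remote_code=True`:

# Hedged sketch: load the fast image processor and run it on a single image.
# "microsoft/Phi-4-multimodal-instruct" and "example.jpg" are placeholders.
from PIL import Image
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
)
image = Image.open("example.jpg")
features = image_processor(images=[image], return_tensors="pt")

# Keys match model_input_names plus the per-image token count:
# image_pixel_values, image_sizes, image_attention_mask, num_img_tokens
print(features["image_pixel_values"].shape)  # (1, num_crops + 1, 3, 448, 448)

For example, a 1000x600 input is tiled into a 3x2 grid of 448x448 crops plus one global 448x448 view, so that image contributes 7 crops along the second dimension.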
model-00001-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:57b93f5d0c9422c0b76b68119660187989bd8bb47848994376be3ac53eb61a95
|
3 |
+
size 4903637712
|
model-00002-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fd6f60df08041b5c48afe7d7624d4de6e9d7d86162dec7a7e908a71d595e2967
|
3 |
+
size 4584575136
|
model.safetensors.index.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
modeling_phi4_multimodal.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
preprocessor_config.json
CHANGED
@@ -1,14 +1,21 @@
|
|
1 |
{
|
2 |
"auto_map": {
|
3 |
-
"AutoProcessor": "
|
4 |
-
"AutoImageProcessor": "
|
5 |
-
"AutoFeatureExtractor": "
|
6 |
},
|
7 |
-
"image_processor_type": "Phi4MMImageProcessor",
|
8 |
-
"processor_class": "Phi4MMProcessor",
|
9 |
-
"feature_extractor_type": "Phi4MMAudioFeatureExtractor",
|
10 |
"audio_compression_rate": 8,
|
11 |
"audio_downsample_rate": 1,
|
12 |
"audio_feat_stride": 1,
|
13 |
-
"
|
14 |
}
|
|
|
1 |
{
|
2 |
"auto_map": {
|
3 |
+
"AutoProcessor": "processing_phi4_multimodal.Phi4MultimodalProcessor",
|
4 |
+
"AutoImageProcessor": "image_processing_phi4_multimodal_fast.Phi4MultimodalImageProcessorFast",
|
5 |
+
"AutoFeatureExtractor": "feature_extraction_phi4_multimodal.Phi4MultimodalFeatureExtractor"
|
6 |
},
|
|
|
|
|
|
|
7 |
"audio_compression_rate": 8,
|
8 |
"audio_downsample_rate": 1,
|
9 |
"audio_feat_stride": 1,
|
10 |
+
"feature_extractor_type": "Phi4MultimodalFeatureExtractor",
|
11 |
+
"feature_size": 80,
|
12 |
+
"hop_length": 160,
|
13 |
+
"n_fft": 512,
|
14 |
+
"padding_side": "right",
|
15 |
+
"padding_value": 0.0,
|
16 |
+
"preemphasis": 0.97,
|
17 |
+
"processor_class": "Phi4MultimodalProcessor",
|
18 |
+
"return_attention_mask": true,
|
19 |
+
"sampling_rate": 16000,
|
20 |
+
"win_length": 400
|
21 |
}
|
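The rewritten preprocessor_config.json above now carries the audio front-end parameters directly (80 mel bins, a 512-point FFT, a 10 ms hop, and a 25 ms window at 16 kHz). Below is a rough sketch of what those values imply for frame and audio-token counts, using only the numbers in this config; the bundled Phi4MultimodalFeatureExtractor may pad or frame slightly differently:

# Back-of-the-envelope sketch based only on the config values above.
sampling_rate = 16000          # Hz
hop_length = 160               # samples -> 10 ms hop, i.e. 100 frames per second
win_length = 400               # samples -> 25 ms analysis window
feature_size = 80              # log-mel bins per frame
audio_compression_rate = 8     # frames folded into one audio embedding

seconds = 30.0
num_frames = int(seconds * sampling_rate) // hop_length       # ~3000 frames
approx_audio_tokens = num_frames // audio_compression_rate    # ~375 audio embeddings
print(num_frames, approx_audio_tokens)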
processing_phi4_multimodal.py
ADDED
@@ -0,0 +1,541 @@
|
1 |
+
# Copyright 2025 Microsoft and the HuggingFace Inc. team. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
"""
|
16 |
+
Processor class for Phi4Multimodal
|
17 |
+
"""
|
18 |
+
|
19 |
+
import re
|
20 |
+
import os
|
21 |
+
import requests
|
22 |
+
import base64
|
23 |
+
from io import BytesIO
|
24 |
+
from typing import List, Optional, Union, TypedDict
|
25 |
+
|
26 |
+
import librosa
|
27 |
+
import numpy as np
|
28 |
+
import PIL.Image
|
29 |
+
|
30 |
+
from transformers.image_processing_utils import BatchFeature
|
31 |
+
from transformers.image_utils import ImageInput
|
32 |
+
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, CommonKwargs, ProcessorChatTemplateKwargs
|
33 |
+
from transformers.tokenization_utils_base import TextInput
|
34 |
+
from transformers.utils import logging
|
35 |
+
|
36 |
+
|
37 |
+
from .feature_extraction_phi4_multimodal import AudioInput
|
38 |
+
|
39 |
+
|
40 |
+
logger = logging.get_logger(__name__)
|
41 |
+
|
42 |
+
|
43 |
+
class ChatTemplateLoadKwargs(TypedDict, total=False):
|
44 |
+
"""
|
45 |
+
Keyword arguments used to load multimodal data in processor chat templates.
|
46 |
+
|
47 |
+
num_frames (`int`, *optional*):
|
48 |
+
Number of frames to sample uniformly. If not passed, the whole video is loaded.
|
49 |
+
video_load_backend (`str`, *optional*, defaults to `"pyav"`):
|
50 |
+
The backend to use when loading the video which will be used only when there are videos in the conversation.
|
51 |
+
Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "pyav" because it is the only backend
|
52 |
+
that supports all types of sources to load from.
|
53 |
+
video_fps (`int`, *optional*):
|
54 |
+
Number of frames to sample per second. Should be passed only when `num_frames=None`.
|
55 |
+
If not specified and `num_frames==None`, all frames are sampled.
|
56 |
+
sample_indices_fn (`Callable`, *optional*):
|
57 |
+
A callable function that will return indices at which the video should be sampled. If the video has to be sampled
|
58 |
+
using a different sampling technique than provided by the `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
|
59 |
+
If not provided, simple uniform sampling with fps is performed; otherwise `sample_indices_fn` has priority over the other args.
|
60 |
+
The function expects as input all args along with all kwargs passed to `load_video` and should output valid
|
61 |
+
indices at which the video should be sampled. For example:
|
62 |
+
|
63 |
+
def sample_indices_fn(num_frames, fps, metadata, **kwargs):
|
64 |
+
# add your sampling logic here ...
|
65 |
+
return np.linspace(start_idx, end_idx, num_frames, dtype=int)
|
66 |
+
"""
|
67 |
+
|
68 |
+
num_frames: Optional[int] = None
|
69 |
+
video_load_backend: Optional[str] = "pyav"
|
70 |
+
video_fps: Optional[int] = None
|
71 |
+
sampling_rate: Optional[int] = 16_000
|
72 |
+
load_audio_from_video: Optional[bool] = False
|
73 |
+
|
74 |
+
|
75 |
+
class AllKwargsForChatTemplate(
|
76 |
+
TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, CommonKwargs, ProcessorChatTemplateKwargs
|
77 |
+
):
|
78 |
+
processor_kwargs: ProcessingKwargs = {
|
79 |
+
**ProcessingKwargs.__annotations__,
|
80 |
+
}
|
81 |
+
mm_load_kwargs: ChatTemplateLoadKwargs = {
|
82 |
+
**TextKwargs.__annotations__,
|
83 |
+
}
|
84 |
+
template_kwargs: ProcessorChatTemplateKwargs = {
|
85 |
+
**ProcessorChatTemplateKwargs.__annotations__,
|
86 |
+
}
|
87 |
+
|
88 |
+
|
89 |
+
class Phi4MultimodalProcessorKwargs(ProcessingKwargs, total=False):
|
90 |
+
_defaults = {
|
91 |
+
"audio_kwargs": {
|
92 |
+
"device": "cpu",
|
93 |
+
},
|
94 |
+
}
|
95 |
+
|
96 |
+
|
97 |
+
def load_audio(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None) -> np.ndarray:
|
98 |
+
"""
|
99 |
+
Loads `audio` to an np.ndarray object.
|
100 |
+
|
101 |
+
Args:
|
102 |
+
audio (`str` or `np.ndarray`):
|
103 |
+
The audio to be loaded into the numpy array format.
|
104 |
+
sampling_rate (`int`, *optional*, defaults to 16000):
|
105 |
+
The sampling rate to be used when loading the audio. It should be the same as the
|
106 |
+
sampling rate the model you will be using further was trained with.
|
107 |
+
timeout (`float`, *optional*):
|
108 |
+
The timeout value in seconds for the URL request.
|
109 |
+
|
110 |
+
Returns:
|
111 |
+
`np.ndarray`: A numpy array representing the audio.
|
112 |
+
"""
|
113 |
+
|
114 |
+
if isinstance(audio, str):
|
115 |
+
# Load audio from URL (e.g https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav)
|
116 |
+
if audio.startswith("http://") or audio.startswith("https://"):
|
117 |
+
audio = librosa.load(BytesIO(requests.get(audio, timeout=timeout).content), sr=sampling_rate)[0]
|
118 |
+
elif os.path.isfile(audio):
|
119 |
+
audio = librosa.load(audio, sr=sampling_rate)[0]
|
120 |
+
elif isinstance(audio, np.ndarray):
|
121 |
+
audio = audio
|
122 |
+
else:
|
123 |
+
raise TypeError(
|
124 |
+
"Incorrect format used for `audio`. Should be an url linking to an audio, a local path, or numpy array."
|
125 |
+
)
|
126 |
+
return audio
|
127 |
+
|
128 |
+
|
129 |
+
def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] = None) -> "PIL.Image.Image":
|
130 |
+
"""
|
131 |
+
Loads `image` to a PIL Image.
|
132 |
+
|
133 |
+
Args:
|
134 |
+
image (`str` or `PIL.Image.Image`):
|
135 |
+
The image to convert to the PIL Image format.
|
136 |
+
timeout (`float`, *optional*):
|
137 |
+
The timeout value in seconds for the URL request.
|
138 |
+
|
139 |
+
Returns:
|
140 |
+
`PIL.Image.Image`: A PIL Image.
|
141 |
+
"""
|
142 |
+
if isinstance(image, str):
|
143 |
+
if image.startswith("http://") or image.startswith("https://"):
|
144 |
+
# We need to actually check for a real protocol, otherwise it's impossible to use a local file
|
145 |
+
# like http_huggingface_co.png
|
146 |
+
image = PIL.Image.open(BytesIO(requests.get(image, timeout=timeout).content))
|
147 |
+
elif os.path.isfile(image):
|
148 |
+
image = PIL.Image.open(image)
|
149 |
+
else:
|
150 |
+
if image.startswith("data:image/"):
|
151 |
+
image = image.split(",")[1]
|
152 |
+
|
153 |
+
# Try to load as base64
|
154 |
+
try:
|
155 |
+
b64 = base64.decodebytes(image.encode())
|
156 |
+
image = PIL.Image.open(BytesIO(b64))
|
157 |
+
except Exception as e:
|
158 |
+
raise ValueError(
|
159 |
+
f"Incorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got {image}. Failed with {e}"
|
160 |
+
)
|
161 |
+
elif isinstance(image, PIL.Image.Image):
|
162 |
+
image = image
|
163 |
+
else:
|
164 |
+
raise TypeError(
|
165 |
+
"Incorrect format used for image. Should be an url linking to an image, a base64 string, a local path, or a PIL image."
|
166 |
+
)
|
167 |
+
image = PIL.ImageOps.exif_transpose(image)
|
168 |
+
image = image.convert("RGB")
|
169 |
+
return image
|
170 |
+
|
171 |
+
|
172 |
+
class Phi4MultimodalProcessor(ProcessorMixin):
|
173 |
+
r"""
|
174 |
+
Constructs a Phi4Multimodal processor which wraps an image processor, an audio processor, and a GPT tokenizer into a single processor.
|
175 |
+
|
176 |
+
[`Phi4MultimodalProcessor`] offers all the functionalities of [`Phi4MultimodalImageProcessorFast`] and [`GPT2Tokenizer`]. See the
|
177 |
+
[`~Phi4MultimodalProcessor.__call__`] and [`~Phi4MultimodalProcessor.decode`] for more information.
|
178 |
+
|
179 |
+
Args:
|
180 |
+
image_processor (`Phi4MultimodalImageProcessorFast`):
|
181 |
+
The image processor to use for images.
|
182 |
+
audio_processor (`Phi4MultimodalFeatureExtractor`):
|
183 |
+
The audio processor to use for audio inputs.
|
184 |
+
tokenizer (`GPT2TokenizerFast`):
|
185 |
+
The tokenizer to use for text.
|
186 |
+
fake_image_token_pattern (`str`, *optional*, defaults to `r"<\|image_\d+\|>"`):
|
187 |
+
The fake image token pattern.
|
188 |
+
fake_audio_token_pattern (`str`, *optional*, defaults to `r"<\|audio_\d+\|>"`):
|
189 |
+
The fake audio token pattern.
|
190 |
+
"""
|
191 |
+
|
192 |
+
attributes = ["image_processor", "audio_processor", "tokenizer"]
|
193 |
+
tokenizer_class = "GPT2TokenizerFast"
|
194 |
+
image_processor_class = "AutoImageProcessor"
|
195 |
+
audio_processor_class = "AutoFeatureExtractor"
|
196 |
+
valid_kwargs = ["chat_template"]
|
197 |
+
|
198 |
+
def __init__(
|
199 |
+
self,
|
200 |
+
image_processor,
|
201 |
+
audio_processor,
|
202 |
+
tokenizer,
|
203 |
+
**kwargs,
|
204 |
+
):
|
205 |
+
self.image_token = tokenizer.image_token
|
206 |
+
self.image_token_id = tokenizer.image_token_id
|
207 |
+
self.audio_token = tokenizer.audio_token
|
208 |
+
self.audio_token_id = tokenizer.audio_token_id
|
209 |
+
super().__init__(image_processor, audio_processor, tokenizer, **kwargs)
|
210 |
+
|
211 |
+
def __call__(
|
212 |
+
self,
|
213 |
+
text: Union[TextInput, List[TextInput]],
|
214 |
+
images: Optional[ImageInput] = None,
|
215 |
+
audio: Optional[AudioInput] = None,
|
216 |
+
**kwargs: Unpack[ProcessingKwargs],
|
217 |
+
) -> BatchFeature:
|
218 |
+
"""
|
219 |
+
Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
|
220 |
+
and `kwargs` arguments to GPT2Tokenizer's [`~GPT2Tokenizer.__call__`] if `text` is not `None` to encode
|
221 |
+
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
222 |
+
Phi4MultimodalImageProcessorFast's [`~Phi4MultimodalImageProcessorFast.__call__`] if `images` is not `None`. Please refer to the docstring
|
223 |
+
of the above two methods for more information.
|
224 |
+
|
225 |
+
Args:
|
226 |
+
text (`str`, `List[str]`, `List[List[str]]`):
|
227 |
+
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
|
228 |
+
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
|
229 |
+
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
230 |
+
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
|
231 |
+
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
|
232 |
+
tensor. Both channels-first and channels-last formats are supported.
|
233 |
+
audio (`List[Union[np.ndarray, torch.Tensor]]`):
|
234 |
+
List of the audios to be prepared.
|
235 |
+
|
236 |
+
Returns:
|
237 |
+
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
|
238 |
+
|
239 |
+
- **input_ids** -- List of token ids to be fed to a model.
|
240 |
+
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
|
241 |
+
- **input_image_embeds** -- Pixel values to be fed to a model.
|
242 |
+
- **image_sizes** -- List of tuples specifying the size of each image in `input_image_embeds`.
|
243 |
+
- **image_attention_mask** -- List of attention masks for each image in `input_image_embeds`.
|
244 |
+
- **input_audio_embeds** -- Audio embeddings to be fed to a model.
|
245 |
+
- **audio_embed_sizes** -- List of integers specifying the size of each audio in `input_audio_embeds`.
|
246 |
+
"""
|
247 |
+
|
248 |
+
output_kwargs = self._merge_kwargs(Phi4MultimodalProcessorKwargs, self.tokenizer.init_kwargs, **kwargs)
|
249 |
+
image_kwargs = output_kwargs["images_kwargs"]
|
250 |
+
audio_kwargs = output_kwargs["audio_kwargs"]
|
251 |
+
|
252 |
+
image_inputs = self.image_processor(images, **image_kwargs) if images is not None else {}
|
253 |
+
audio_inputs = self.audio_processor(audio, **audio_kwargs) if audio is not None else {}
|
254 |
+
|
255 |
+
# We pop here for images as we don't need it later
|
256 |
+
num_img_tokens = image_inputs.pop("num_img_tokens", [])
|
257 |
+
audio_embed_sizes = audio_inputs.get("audio_embed_sizes", [])
|
258 |
+
|
259 |
+
# Replace certain special tokens for compatibility
|
260 |
+
if isinstance(text, str):
|
261 |
+
text = [text]
|
262 |
+
elif not isinstance(text, list) or not isinstance(text[0], str):
|
263 |
+
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
|
264 |
+
|
265 |
+
image_token = self.tokenizer.image_token
|
266 |
+
audio_token = self.tokenizer.audio_token
|
267 |
+
|
268 |
+
# Check that the number of special tokens is sound
|
269 |
+
concatenated_prompt = "".join(text)
|
270 |
+
if concatenated_prompt.count(image_token) != len(num_img_tokens):
|
271 |
+
raise ValueError(
|
272 |
+
"You should add as much image tokens `<|image|>` in your prompt as you pass `images` to the processor. ",
|
273 |
+
f"Input contains {concatenated_prompt.count(image_token)} tokens != {len(num_img_tokens)} images",
|
274 |
+
)
|
275 |
+
if concatenated_prompt.count(audio_token) != len(audio_embed_sizes):
|
276 |
+
raise ValueError(
|
277 |
+
"You should add as much audio tokens `<|audio|>` in your prompt as you pass `audios` to the processor. "
|
278 |
+
f"Input contains {concatenated_prompt.count(audio_token)} tokens != {len(audio_embed_sizes)} audios"
|
279 |
+
)
|
280 |
+
|
281 |
+
# Add appropriate number of image/audio tokens (note that the count of replacement is dynamic)
|
282 |
+
image_count_iter = iter(num_img_tokens)
|
283 |
+
audio_count_iter = iter(audio_embed_sizes)
|
284 |
+
processed_text = [
|
285 |
+
re.sub(re.escape(image_token), lambda _: image_token * next(image_count_iter), t) for t in text
|
286 |
+
]
|
287 |
+
processed_text = [
|
288 |
+
re.sub(re.escape(audio_token), lambda _: audio_token * next(audio_count_iter), t) for t in processed_text
|
289 |
+
]
|
290 |
+
|
291 |
+
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
292 |
+
text_inputs = self.tokenizer(processed_text, **output_kwargs["text_kwargs"])
|
293 |
+
self._check_special_mm_tokens(processed_text, text_inputs, modalities=["image"])
|
294 |
+
|
295 |
+
# prepare batch feature
|
296 |
+
data = {
|
297 |
+
**text_inputs,
|
298 |
+
**image_inputs,
|
299 |
+
**audio_inputs,
|
300 |
+
}
|
301 |
+
|
302 |
+
return BatchFeature(data=data, tensor_type=return_tensors)
|
303 |
+
|
304 |
+
def batch_decode(self, *args, **kwargs):
|
305 |
+
"""
|
306 |
+
This method forwards all its arguments to GPT2Tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
|
307 |
+
refer to the docstring of this method for more information.
|
308 |
+
"""
|
309 |
+
return self.tokenizer.batch_decode(*args, **kwargs)
|
310 |
+
|
311 |
+
def decode(self, *args, **kwargs):
|
312 |
+
"""
|
313 |
+
This method forwards all its arguments to GPT2Tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
|
314 |
+
the docstring of this method for more information.
|
315 |
+
"""
|
316 |
+
return self.tokenizer.decode(*args, **kwargs)
|
317 |
+
|
318 |
+
@property
|
319 |
+
def model_input_names(self):
|
320 |
+
tokenizer_input_names = self.tokenizer.model_input_names
|
321 |
+
image_processor_input_names = self.image_processor.model_input_names
|
322 |
+
audio_processor_input_names = self.audio_processor.model_input_names
|
323 |
+
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names + audio_processor_input_names))
|
324 |
+
|
325 |
+
def _check_special_mm_tokens(self, text: list[str], text_inputs: "BatchFeature", modalities: list[str]):
|
326 |
+
"""
|
327 |
+
Checks that the number of special tokens in the text and the processed text is the same. The count can be different
|
328 |
+
if tokenized text was truncated, leading to issues in model code.
|
329 |
+
"""
|
330 |
+
for modality in modalities:
|
331 |
+
token_str = getattr(self, f"{modality}_token")
|
332 |
+
token_id = getattr(self, f"{modality}_token_id")
|
333 |
+
ids_count = [list(ids).count(token_id) for ids in text_inputs["input_ids"]]
|
334 |
+
text_count = [sample.count(token_str) for sample in text]
|
335 |
+
|
336 |
+
if ids_count != text_count:
|
337 |
+
raise ValueError(
|
338 |
+
f"Mismatch in `{modality}` token count between text and `input_ids`. Got ids={ids_count} and text={text_count}. "
|
339 |
+
"Likely due to `truncation='max_length'`. Please disable truncation or increase `max_length`."
|
340 |
+
)
|
341 |
+
|
342 |
+
def apply_chat_template(
|
343 |
+
self,
|
344 |
+
conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
|
345 |
+
chat_template: Optional[str] = None,
|
346 |
+
**kwargs: Unpack[AllKwargsForChatTemplate],
|
347 |
+
) -> str:
|
348 |
+
"""
|
349 |
+
Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input
|
350 |
+
conversations to turn them into a single tokenizable string.
|
351 |
+
|
352 |
+
The input is expected to be in the following format, where each message content is a list consisting of text and
|
353 |
+
optionally image or video inputs. One can also provide an image, video, URL or local path which will be used to form
|
354 |
+
`pixel_values` when `return_dict=True`. If not provided, one will get only the formatted text, optionally tokenized text.
|
355 |
+
|
356 |
+
conversation = [
|
357 |
+
{
|
358 |
+
"role": "user",
|
359 |
+
"content": [
|
360 |
+
{"type": "image", "image": "https://www.ilankelman.org/stopsigns/australia.jpg"},
|
361 |
+
{"type": "text", "text": "Please describe this image in detail."},
|
362 |
+
],
|
363 |
+
},
|
364 |
+
]
|
365 |
+
|
366 |
+
Args:
|
367 |
+
conversation (`Union[List[Dict, [str, str]], List[List[Dict[str, str]]]]`):
|
368 |
+
The conversation to format.
|
369 |
+
chat_template (`Optional[str]`, *optional*):
|
370 |
+
The Jinja template to use for formatting the conversation. If not provided, the tokenizer's
|
371 |
+
chat template is used.
|
372 |
+
"""
|
373 |
+
|
374 |
+
if chat_template is None:
|
375 |
+
if isinstance(self.chat_template, dict) and "default" in self.chat_template:
|
376 |
+
chat_template = self.chat_template["default"]
|
377 |
+
elif isinstance(self.chat_template, dict):
|
378 |
+
raise ValueError(
|
379 |
+
'The processor has multiple chat templates but none of them are named "default". You need to specify'
|
380 |
+
" which one to use by passing the `chat_template` argument. Available templates are: "
|
381 |
+
f"{', '.join(self.chat_template.keys())}"
|
382 |
+
)
|
383 |
+
elif self.chat_template is not None:
|
384 |
+
chat_template = self.chat_template
|
385 |
+
else:
|
386 |
+
raise ValueError(
|
387 |
+
"Cannot use apply_chat_template because this processor does not have a chat template."
|
388 |
+
)
|
389 |
+
else:
|
390 |
+
if isinstance(self.chat_template, dict) and chat_template in self.chat_template:
|
391 |
+
# It's the name of a template, not a full template string
|
392 |
+
chat_template = self.chat_template[chat_template]
|
393 |
+
else:
|
394 |
+
# It's a template string, render it directly
|
395 |
+
chat_template = chat_template
|
396 |
+
|
397 |
+
# Fill sets of kwargs that should be used by different parts of template
|
398 |
+
processed_kwargs = {
|
399 |
+
"mm_load_kwargs": {},
|
400 |
+
"template_kwargs": {},
|
401 |
+
}
|
402 |
+
|
403 |
+
for kwarg_type in processed_kwargs:
|
404 |
+
for key in AllKwargsForChatTemplate.__annotations__[kwarg_type].__annotations__.keys():
|
405 |
+
kwarg_type_defaults = AllKwargsForChatTemplate.__annotations__[kwarg_type]
|
406 |
+
default_value = getattr(kwarg_type_defaults, key, None)
|
407 |
+
value = kwargs.pop(key, default_value)
|
408 |
+
if value is not None and not isinstance(value, dict):
|
409 |
+
processed_kwargs[kwarg_type][key] = value
|
410 |
+
|
411 |
+
if isinstance(conversation, (list, tuple)) and (
|
412 |
+
isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
|
413 |
+
):
|
414 |
+
is_batched = True
|
415 |
+
conversations = conversation
|
416 |
+
else:
|
417 |
+
is_batched = False
|
418 |
+
conversations = [conversation]
|
419 |
+
|
420 |
+
tokenize = processed_kwargs["template_kwargs"].pop("tokenize", False)
|
421 |
+
return_dict = processed_kwargs["template_kwargs"].pop("return_dict", False)
|
422 |
+
mm_load_kwargs = processed_kwargs["mm_load_kwargs"]
|
423 |
+
|
424 |
+
if tokenize:
|
425 |
+
batch_images, batch_videos = [], []
|
426 |
+
batch_audios = []
|
427 |
+
batch_video_metadata = []
|
428 |
+
for conversation in conversations:
|
429 |
+
images, videos = [], []
|
430 |
+
video_metadata = []
|
431 |
+
for message in conversation:
|
432 |
+
visuals = [content for content in message["content"] if content["type"] in ["image", "video"]]
|
433 |
+
audio_fnames = [
|
434 |
+
content[key]
|
435 |
+
for content in message["content"]
|
436 |
+
for key in ["audio", "url", "path"]
|
437 |
+
if key in content and content["type"] == "audio"
|
438 |
+
]
|
439 |
+
image_fnames = [
|
440 |
+
vision_info[key]
|
441 |
+
for vision_info in visuals
|
442 |
+
for key in ["image", "url", "path", "base64"]
|
443 |
+
if key in vision_info and vision_info["type"] == "image"
|
444 |
+
]
|
445 |
+
video_fnames = [
|
446 |
+
vision_info[key]
|
447 |
+
for vision_info in visuals
|
448 |
+
for key in ["video", "url", "path"]
|
449 |
+
if key in vision_info and vision_info["type"] == "video"
|
450 |
+
]
|
451 |
+
|
452 |
+
for fname in image_fnames:
|
453 |
+
images.append(load_image(fname))
|
454 |
+
|
455 |
+
# Audio models do not accept nested list of audios (yet!) so we construct a flat input audio list
|
456 |
+
if not mm_load_kwargs["load_audio_from_video"]:
|
457 |
+
for fname in audio_fnames:
|
458 |
+
batch_audios.append(load_audio(fname, sampling_rate=mm_load_kwargs["sampling_rate"]))
|
459 |
+
else:
|
460 |
+
for fname in video_fnames:
|
461 |
+
batch_audios.append(load_audio(fname, sampling_rate=mm_load_kwargs["sampling_rate"]))
|
462 |
+
|
463 |
+
for fname in video_fnames:
|
464 |
+
if isinstance(fname, (list, tuple)) and isinstance(fname[0], str):
|
465 |
+
video = [np.array(load_image(image_fname)) for image_fname in fname]
|
466 |
+
# create a 4D video because `load_video` always returns a 4D array
|
467 |
+
video = np.stack(video)
|
468 |
+
metadata = None
|
469 |
+
logger.warning(
|
470 |
+
"When loading the video from list of images, we cannot infer metadata such as `fps` or `duration`. "
|
471 |
+
"If your model uses this metadata during processing, please load the whole video and let the model sample frames instead."
|
472 |
+
)
|
473 |
+
else:
|
474 |
+
# TODO: raushan, should be `self.video_processor.load_video_for_model` when API is added
|
475 |
+
video, metadata = self._load_video_for_model(
|
476 |
+
fname,
|
477 |
+
num_frames=mm_load_kwargs.get("num_frames", None),
|
478 |
+
fps=mm_load_kwargs.get("video_fps", None),
|
479 |
+
backend=mm_load_kwargs["video_load_backend"],
|
480 |
+
**kwargs,
|
481 |
+
)
|
482 |
+
videos.append(video)
|
483 |
+
video_metadata.append(metadata)
|
484 |
+
|
485 |
+
# Currently all processors can accept nested list of batches, but not flat list of visuals
|
486 |
+
# So we'll make a batched list of images and let the processor handle it
|
487 |
+
if images:
|
488 |
+
batch_images.append(images)
|
489 |
+
if videos:
|
490 |
+
batch_videos.append(videos)
|
491 |
+
batch_video_metadata.append(video_metadata)
|
492 |
+
|
493 |
+
# Process conversation with video/image information if needed. Then convert into a prompt using Jinja template
|
494 |
+
conversations = self._process_messages_for_chat_template(
|
495 |
+
conversations,
|
496 |
+
batch_images=batch_images,
|
497 |
+
batch_videos=batch_videos,
|
498 |
+
batch_video_metadata=batch_video_metadata,
|
499 |
+
**processed_kwargs["mm_load_kwargs"],
|
500 |
+
)
|
501 |
+
|
502 |
+
prompt = self.tokenizer.apply_chat_template(
|
503 |
+
conversations,
|
504 |
+
chat_template=chat_template,
|
505 |
+
tokenize=False,
|
506 |
+
return_dict=False,
|
507 |
+
**processed_kwargs["template_kwargs"],
|
508 |
+
)
|
509 |
+
|
510 |
+
if not is_batched:
|
511 |
+
prompt = prompt[0]
|
512 |
+
|
513 |
+
if tokenize:
|
514 |
+
# Tokenizer's `apply_chat_template` never adds special tokens when tokenizing
|
515 |
+
# But processor's `apply_chat_template` didn't have an option to tokenize, so users had to format the prompt
|
516 |
+
# and pass it to the processor. Users thus never worried about special tokens, relying on the processor handling
|
517 |
+
# everything internally. The below line is to keep BC for that and be able to work with models that have
|
518 |
+
# special tokens in the template (consistent with tokenizers). We don't want to raise a warning, as it would flood the command line
|
519 |
+
# without an actionable solution for users
|
520 |
+
single_prompt = prompt[0] if is_batched else prompt
|
521 |
+
if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token):
|
522 |
+
kwargs["add_special_tokens"] = False
|
523 |
+
|
524 |
+
out = self(
|
525 |
+
text=prompt,
|
526 |
+
images=batch_images if batch_images else None,
|
527 |
+
videos=batch_videos if batch_videos else None,
|
528 |
+
audio=batch_audios if batch_audios else None,
|
529 |
+
**kwargs,
|
530 |
+
)
|
531 |
+
if return_dict:
|
532 |
+
return out
|
533 |
+
else:
|
534 |
+
return out["input_ids"]
|
535 |
+
return prompt
|
536 |
+
|
537 |
+
|
538 |
+
__all__ = ["Phi4MultimodalProcessor"]
|
539 |
+
|
540 |
+
|
541 |
+
Phi4MultimodalProcessor.register_for_auto_class()
|
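A hedged end-to-end sketch of the processor defined above, combining one image and one audio clip. The repo id and file path are placeholders; the single <|image|> and <|audio|> placeholders in the prompt must match the number of images/audios passed, as enforced by the checks in __call__:

# Hedged sketch: run Phi4MultimodalProcessor on one image and one audio clip.
# Repo id and "example.jpg" are placeholders; remote code must be trusted so the
# auto_map entries resolve to the modules uploaded in this commit.
import numpy as np
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
)

prompt = "<|user|><|image|><|audio|>Describe the image and transcribe the audio.<|end|><|assistant|>"
image = Image.open("example.jpg")
audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz

inputs = processor(text=prompt, images=[image], audio=[audio], return_tensors="pt")
# input_ids / attention_mask from the tokenizer, image_* from the image processor,
# and the audio features from the feature extractor, merged into one BatchFeature.
print(sorted(inputs.keys()))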
special_tokens_map.json
CHANGED
@@ -13,7 +13,13 @@
|
|
13 |
"rstrip": false,
|
14 |
"single_word": false
|
15 |
},
|
16 |
-
"pad_token":
|
17 |
"unk_token": {
|
18 |
"content": "<|endoftext|>",
|
19 |
"lstrip": false,
|
|
|
13 |
"rstrip": false,
|
14 |
"single_word": false
|
15 |
},
|
16 |
+
"pad_token": {
|
17 |
+
"content": "<|endoftext|>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
"unk_token": {
|
24 |
"content": "<|endoftext|>",
|
25 |
"lstrip": false,
|
speech-lora/adapter_config.json
CHANGED
@@ -1,23 +1,31 @@
|
|
1 |
{
|
|
|
2 |
"auto_mapping": null,
|
3 |
-
"base_model_name_or_path":
|
4 |
"bias": "none",
|
|
|
|
|
|
|
5 |
"fan_in_fan_out": false,
|
6 |
-
"inference_mode":
|
7 |
"init_lora_weights": true,
|
|
|
8 |
"layers_pattern": null,
|
9 |
"layers_to_transform": null,
|
|
|
10 |
"lora_alpha": 640,
|
|
|
11 |
"lora_dropout": 0.01,
|
12 |
-
"
|
|
|
|
|
13 |
"peft_type": "LORA",
|
14 |
"r": 320,
|
|
|
15 |
"revision": null,
|
16 |
-
"target_modules":
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
],
|
22 |
-
"task_type": "CAUSAL_LM"
|
23 |
}
|
|
|
1 |
{
|
2 |
+
"alpha_pattern": {},
|
3 |
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": null,
|
5 |
"bias": "none",
|
6 |
+
"corda_config": null,
|
7 |
+
"eva_config": null,
|
8 |
+
"exclude_modules": null,
|
9 |
"fan_in_fan_out": false,
|
10 |
+
"inference_mode": false,
|
11 |
"init_lora_weights": true,
|
12 |
+
"layer_replication": null,
|
13 |
"layers_pattern": null,
|
14 |
"layers_to_transform": null,
|
15 |
+
"loftq_config": {},
|
16 |
"lora_alpha": 640,
|
17 |
+
"lora_bias": false,
|
18 |
"lora_dropout": 0.01,
|
19 |
+
"megatron_config": null,
|
20 |
+
"megatron_core": "megatron.core",
|
21 |
+
"modules_to_save": null,
|
22 |
"peft_type": "LORA",
|
23 |
"r": 320,
|
24 |
+
"rank_pattern": {},
|
25 |
"revision": null,
|
26 |
+
"target_modules": "model.layers.\\d+.((self_attn.(qkv|o)_proj)|(mlp.(gate_up|down)_proj))",
|
27 |
+
"task_type": "CAUSAL_LM",
|
28 |
+
"trainable_token_indices": null,
|
29 |
+
"use_dora": false,
|
30 |
+
"use_rslora": false
|
|
|
|
|
31 |
}
|
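The updated speech-lora/adapter_config.json now specifies target_modules as a regex over decoder layer names (the qkv/o attention projections and the MLP gate_up/down projections) instead of an explicit module list. The Phi-4 multimodal modeling code normally attaches this adapter itself, but as a purely illustrative sketch, a PEFT checkpoint with this config could also be loaded generically (repo id and local adapter path are placeholders):

# Illustrative only: generic PEFT loading of the speech LoRA adapter (r=320, alpha=640).
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
)
# target_modules is a regex such as model.layers.\d+.self_attn.qkv_proj, resolved by PEFT.
model = PeftModel.from_pretrained(base, "speech-lora", adapter_name="speech")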
speech-lora/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:16f70b0aba566f6c30e67a11e90033453e9375d102e031cec40956a2a0e9771e
|
3 |
+
size 922777944
|
tokenizer.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:57589a5827b578065aecc0a91cc1e4e9a0bac0a17fb02539bea63bb9beb889a2
|
3 |
+
size 13303259
|
tokenizer_config.json
CHANGED
@@ -1,125 +1 @@
|
|
1 |
-
{
|
2 |
-
"add_prefix_space": false,
|
3 |
-
"added_tokens_decoder": {
|
4 |
-
"200010": {
|
5 |
-
"content": "<|endoftext10|>",
|
6 |
-
"lstrip": false,
|
7 |
-
"normalized": false,
|
8 |
-
"rstrip": false,
|
9 |
-
"single_word": false,
|
10 |
-
"special": true
|
11 |
-
},
|
12 |
-
"200011": {
|
13 |
-
"content": "<|endoftext11|>",
|
14 |
-
"lstrip": false,
|
15 |
-
"normalized": false,
|
16 |
-
"rstrip": false,
|
17 |
-
"single_word": false,
|
18 |
-
"special": true
|
19 |
-
},
|
20 |
-
"199999": {
|
21 |
-
"content": "<|endoftext|>",
|
22 |
-
"lstrip": false,
|
23 |
-
"normalized": false,
|
24 |
-
"rstrip": false,
|
25 |
-
"single_word": false,
|
26 |
-
"special": true
|
27 |
-
},
|
28 |
-
"200018": {
|
29 |
-
"content": "<|endofprompt|>",
|
30 |
-
"lstrip": false,
|
31 |
-
"normalized": false,
|
32 |
-
"rstrip": false,
|
33 |
-
"single_word": false,
|
34 |
-
"special": true
|
35 |
-
},
|
36 |
-
"200019": {
|
37 |
-
"content": "<|assistant|>",
|
38 |
-
"lstrip": false,
|
39 |
-
"normalized": false,
|
40 |
-
"rstrip": true,
|
41 |
-
"single_word": false,
|
42 |
-
"special": true
|
43 |
-
},
|
44 |
-
"200020": {
|
45 |
-
"content": "<|end|>",
|
46 |
-
"lstrip": false,
|
47 |
-
"normalized": false,
|
48 |
-
"rstrip": true,
|
49 |
-
"single_word": false,
|
50 |
-
"special": true
|
51 |
-
},
|
52 |
-
"200021": {
|
53 |
-
"content": "<|user|>",
|
54 |
-
"lstrip": false,
|
55 |
-
"normalized": false,
|
56 |
-
"rstrip": true,
|
57 |
-
"single_word": false,
|
58 |
-
"special": true
|
59 |
-
},
|
60 |
-
"200022": {
|
61 |
-
"content": "<|system|>",
|
62 |
-
"lstrip": false,
|
63 |
-
"normalized": false,
|
64 |
-
"rstrip": true,
|
65 |
-
"single_word": false,
|
66 |
-
"special": true
|
67 |
-
},
|
68 |
-
"200023": {
|
69 |
-
"content": "<|tool|>",
|
70 |
-
"lstrip": false,
|
71 |
-
"normalized": false,
|
72 |
-
"rstrip": true,
|
73 |
-
"single_word": false,
|
74 |
-
"special": false
|
75 |
-
},
|
76 |
-
"200024": {
|
77 |
-
"content": "<|/tool|>",
|
78 |
-
"lstrip": false,
|
79 |
-
"normalized": false,
|
80 |
-
"rstrip": true,
|
81 |
-
"single_word": false,
|
82 |
-
"special": false
|
83 |
-
},
|
84 |
-
"200025": {
|
85 |
-
"content": "<|tool_call|>",
|
86 |
-
"lstrip": false,
|
87 |
-
"normalized": false,
|
88 |
-
"rstrip": true,
|
89 |
-
"single_word": false,
|
90 |
-
"special": false
|
91 |
-
},
|
92 |
-
"200026": {
|
93 |
-
"content": "<|/tool_call|>",
|
94 |
-
"lstrip": false,
|
95 |
-
"normalized": false,
|
96 |
-
"rstrip": true,
|
97 |
-
"single_word": false,
|
98 |
-
"special": false
|
99 |
-
},
|
100 |
-
"200027": {
|
101 |
-
"content": "<|tool_response|>",
|
102 |
-
"lstrip": false,
|
103 |
-
"normalized": false,
|
104 |
-
"rstrip": true,
|
105 |
-
"single_word": false,
|
106 |
-
"special": false
|
107 |
-
},
|
108 |
-
"200028": {
|
109 |
-
"content": "<|tag|>",
|
110 |
-
"lstrip": false,
|
111 |
-
"normalized": false,
|
112 |
-
"rstrip": true,
|
113 |
-
"single_word": false,
|
114 |
-
"special": true
|
115 |
-
}
|
116 |
-
},
|
117 |
-
"bos_token": "<|endoftext|>",
|
118 |
-
"chat_template": "{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}",
|
119 |
-
"clean_up_tokenization_spaces": false,
|
120 |
-
"eos_token": "<|endoftext|>",
|
121 |
-
"model_max_length": 131072,
|
122 |
-
"pad_token": "<|endoftext|>",
|
123 |
-
"tokenizer_class": "GPT2TokenizerFast",
|
124 |
-
"unk_token": "<|endoftext|>"
|
125 |
-
}
|
|
|
1 |
+
{"add_prefix_space": false, "added_tokens_decoder": {"199999": {"content": "<|endoftext|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "200010": {"content": "<|image|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "200011": {"content": "<|audio|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "200018": {"content": "<|endofprompt|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "200019": {"content": "<|assistant|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}, "200020": {"content": "<|end|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}, "200021": {"content": "<|user|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}, "200022": {"content": "<|system|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}, "200023": {"content": "<|tool|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200024": {"content": "<|/tool|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200025": {"content": "<|tool_call|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200026": {"content": "<|/tool_call|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200027": {"content": "<|tool_response|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200028": {"content": "<|tag|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}}, "audio_token": "<|audio|>", "bos_token": "<|endoftext|>", "clean_up_tokenization_spaces": false, "eos_token": "<|endoftext|>", "extra_special_tokens": {"audio_token": "<|audio|>", "image_token": "<|image|>"}, "image_token": "<|image|>", "model_max_length": 131072, "pad_token": "<|endoftext|>", "processor_class": "Phi4MultimodalProcessor", "tokenizer_class": "GPT2Tokenizer", "unk_token": "<|endoftext|>"}
|
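The rewritten tokenizer_config.json above repurposes ids 200010/200011 from the generic <|endoftext10|>/<|endoftext11|> placeholders to dedicated <|image|>/<|audio|> tokens and registers them under extra_special_tokens, which is what lets the processor read tokenizer.image_token and tokenizer.audio_token. A quick sanity check (repo id is a placeholder):

# Quick check of the new multimodal special tokens exposed by the tokenizer config.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("microsoft/Phi-4-multimodal-instruct")
print(tok.image_token, tok.convert_tokens_to_ids("<|image|>"))  # <|image|> 200010
print(tok.audio_token, tok.convert_tokens_to_ids("<|audio|>"))  # <|audio|> 200011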
vision-lora/adapter_config.json
CHANGED
@@ -1,23 +1,31 @@
|
|
1 |
{
|
|
|
2 |
"auto_mapping": null,
|
3 |
-
"base_model_name_or_path":
|
4 |
"bias": "none",
|
|
|
|
|
|
|
5 |
"fan_in_fan_out": false,
|
6 |
-
"inference_mode":
|
7 |
"init_lora_weights": true,
|
|
|
8 |
"layers_pattern": null,
|
9 |
"layers_to_transform": null,
|
|
|
10 |
"lora_alpha": 512,
|
|
|
11 |
"lora_dropout": 0.0,
|
12 |
-
"
|
|
|
|
|
13 |
"peft_type": "LORA",
|
14 |
"r": 256,
|
|
|
15 |
"revision": null,
|
16 |
-
"target_modules":
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
],
|
22 |
-
"task_type": "CAUSAL_LM"
|
23 |
}
|
|
|
1 |
{
|
2 |
+
"alpha_pattern": {},
|
3 |
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": null,
|
5 |
"bias": "none",
|
6 |
+
"corda_config": null,
|
7 |
+
"eva_config": null,
|
8 |
+
"exclude_modules": null,
|
9 |
"fan_in_fan_out": false,
|
10 |
+
"inference_mode": false,
|
11 |
"init_lora_weights": true,
|
12 |
+
"layer_replication": null,
|
13 |
"layers_pattern": null,
|
14 |
"layers_to_transform": null,
|
15 |
+
"loftq_config": {},
|
16 |
"lora_alpha": 512,
|
17 |
+
"lora_bias": false,
|
18 |
"lora_dropout": 0.0,
|
19 |
+
"megatron_config": null,
|
20 |
+
"megatron_core": "megatron.core",
|
21 |
+
"modules_to_save": null,
|
22 |
"peft_type": "LORA",
|
23 |
"r": 256,
|
24 |
+
"rank_pattern": {},
|
25 |
"revision": null,
|
26 |
+
"target_modules": "model.layers.\\d+.((self_attn.(qkv|o)_proj)|(mlp.(gate_up|down)_proj))",
|
27 |
+
"task_type": "CAUSAL_LM",
|
28 |
+
"trainable_token_indices": null,
|
29 |
+
"use_dora": false,
|
30 |
+
"use_rslora": false
|
|
|
|
|
31 |
}
|
vision-lora/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:76facf464ca0246e9f5dc409520e83764e0b73fa66fdb561526e064133728f8a
|
3 |
+
size 738228552
|
vocab.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|