cyrilvallez (HF Staff) committed
Commit 698b586 · verified · 1 Parent(s): 0af439b

Upload folder using huggingface_hub

chat_template.jinja ADDED
@@ -0,0 +1 @@
1
+ {% for message in messages %}{{ '<|' + message['role'] + '|>' }}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<|image|>' }}{% elif content['type'] == 'audio' %}{{ '<|audio|>' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% endif %}{{ '<|end|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}
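For orientation, a minimal, hypothetical sketch (not part of this commit) of rendering the template above through the standard `apply_chat_template` API; the checkpoint id and `trust_remote_code=True` are assumptions.

```python
# Minimal sketch, assuming the template above is picked up as the repo's chat template
# and that the tokenizer loads with trust_remote_code=True (the checkpoint id is an assumption).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    },
]

# The template wraps each turn as <|role|>...<|end|>, inserts one <|image|> or <|audio|>
# placeholder per media item, and appends <|assistant|> when add_generation_prompt=True.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```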
config.json CHANGED
@@ -1,82 +1,47 @@
1
  {
2
- "_name_or_path": "Phi-4-multimodal-instruct",
3
  "architectures": [
4
- "Phi4MMForCausalLM"
5
  ],
6
  "attention_bias": false,
7
  "attention_dropout": 0.0,
8
- "audio_processor": {
9
- "config": {
10
- "activation": "swish",
11
- "activation_checkpointing": {
12
- "interval": 1,
13
- "module": "transformer",
14
- "offload": false
15
- },
16
- "attention_dim": 1024,
17
- "attention_heads": 16,
18
- "batch_norm": false,
19
- "bias_in_glu": true,
20
- "causal": true,
21
- "chunk_size": -1,
22
- "cnn_layer_norm": true,
23
- "conv_activation": "swish",
24
- "conv_glu_type": "swish",
25
- "depthwise_multiplier": 1,
26
- "depthwise_seperable_out_channel": 1024,
27
- "dropout_rate": 0.0,
28
- "encoder_embedding_config": {
29
- "input_size": 80
30
- },
31
- "ext_pw_kernel_size": 1,
32
- "ext_pw_out_channel": 1024,
33
- "input_layer": "nemo_conv",
34
- "input_size": 80,
35
- "kernel_size": 3,
36
- "left_chunk": 18,
37
- "linear_units": 1536,
38
- "nemo_conv_settings": {
39
- "conv_channels": 1024
40
- },
41
- "num_blocks": 24,
42
- "relative_attention_bias_args": {
43
- "t5_bias_max_distance": 500,
44
- "type": "t5"
45
- },
46
- "time_reduction": 8
47
- },
48
- "name": "cascades"
49
- },
50
- "auto_map": {
51
- "AutoConfig": "configuration_phi4mm.Phi4MMConfig",
52
- "AutoModelForCausalLM": "modeling_phi4mm.Phi4MMForCausalLM",
53
- "AutoTokenizer": "Xenova/gpt-4o"
54
  },
55
  "bos_token_id": 199999,
56
- "embd_layer": {
57
- "audio_embd_layer": {
58
- "compression_rate": 8,
59
- "downsample_rate": 1,
60
- "embedding_cls": "audio",
61
- "enable_gradient_checkpointing": true,
62
- "projection_cls": "mlp",
63
- "use_conv_downsample": false,
64
- "use_qformer": false
65
- },
66
- "embedding_cls": "image_audio",
67
- "image_embd_layer": {
68
- "crop_size": 448,
69
- "embedding_cls": "tune_image",
70
- "enable_gradient_checkpointing": true,
71
- "hd_transform_order": "sub_glb",
72
- "image_token_compression_cls": "avg_pool_2d",
73
- "projection_cls": "mlp",
74
- "use_hd_transform": true,
75
- "with_learnable_separator": true
76
- }
77
- },
78
  "embd_pdrop": 0.0,
79
- "eos_token_id": 199999,
80
  "full_attn_mod": 1,
81
  "hidden_act": "silu",
82
  "hidden_size": 3072,
@@ -84,21 +49,9 @@
84
  "intermediate_size": 8192,
85
  "interpolate_factor": 1,
86
  "lm_head_bias": false,
87
- "vision_lora": {
88
- "dp": 0.0,
89
- "layer": "layers.*((self_attn\\.(qkv_proj|o_proj))|(mlp\\.(gate_up|down)_proj))",
90
- "lora_alpha": 512,
91
- "r": 256
92
- },
93
- "speech_lora": {
94
- "dp": 0.01,
95
- "layer": "((layers.*self_attn\\.(qkv|o)_proj)|(layers.*mlp\\.(gate_up|down)_proj))",
96
- "lora_alpha": 640,
97
- "r": 320
98
- },
99
  "max_position_embeddings": 131072,
100
  "mlp_bias": false,
101
- "model_type": "phi4mm",
102
  "num_attention_heads": 24,
103
  "num_hidden_layers": 32,
104
  "num_key_value_heads": 8,
@@ -214,8 +167,23 @@
214
  "sliding_window": 262144,
215
  "tie_word_embeddings": true,
216
  "torch_dtype": "bfloat16",
217
- "transformers_version": "4.46.1",
218
  "use_cache": true,
219
- "vocab_size": 200064,
220
- "_attn_implementation": "flash_attention_2"
221
  }
 
1
  {
2
+ "auto_map": {
3
+ "AutoConfig": "configuration_phi4_multimodal.Phi4MultimodalConfig",
4
+ "AutoModelForCausalLM": "modeling_phi4_multimodal.Phi4MultimodalForCausalLM"
5
+ },
6
  "architectures": [
7
+ "Phi4MultimodalForCausalLM"
8
  ],
9
  "attention_bias": false,
10
  "attention_dropout": 0.0,
11
+ "audio_config": {
12
+ "activation": "swish",
13
+ "audio_token_id": 200011,
14
+ "bias_max_distance": 500,
15
+ "bias_symmetric": false,
16
+ "chunk_size": -1,
17
+ "conv_activation": "swish",
18
+ "conv_glu_type": "swish",
19
+ "depthwise_multiplier": 1,
20
+ "depthwise_seperable_out_channel": 1024,
21
+ "downsample_rate": 1,
22
+ "dropout_rate": 0.0,
23
+ "ext_pw_out_channel": 1024,
24
+ "feature_layer": -2,
25
+ "hidden_size": 1024,
26
+ "initializer_range": 0.02,
27
+ "input_size": 80,
28
+ "intermediate_size": 1536,
29
+ "kernel_size": 3,
30
+ "left_chunk": 18,
31
+ "model_type": "phi4_multimodal_audio",
32
+ "nemo_activation": "relu",
33
+ "nemo_conv_channels": 1024,
34
+ "nemo_final_size": 10,
35
+ "num_attention_heads": 16,
36
+ "num_blocks": 24,
37
+ "time_reduction": 8
38
  },
39
  "bos_token_id": 199999,
40
  "embd_pdrop": 0.0,
41
+ "eos_token_id": [
42
+ 199999,
43
+ 200020
44
+ ],
45
  "full_attn_mod": 1,
46
  "hidden_act": "silu",
47
  "hidden_size": 3072,
 
49
  "intermediate_size": 8192,
50
  "interpolate_factor": 1,
51
  "lm_head_bias": false,
52
  "max_position_embeddings": 131072,
53
  "mlp_bias": false,
54
+ "model_type": "phi4_multimodal",
55
  "num_attention_heads": 24,
56
  "num_hidden_layers": 32,
57
  "num_key_value_heads": 8,
 
167
  "sliding_window": 262144,
168
  "tie_word_embeddings": true,
169
  "torch_dtype": "bfloat16",
170
+ "transformers_version": "4.52.0.dev0",
171
  "use_cache": true,
172
+ "vision_config": {
173
+ "attention_dropout": 0.0,
174
+ "crop_size": 448,
175
+ "feature_layer": -2,
176
+ "hidden_act": "gelu_pytorch_tanh",
177
+ "hidden_size": 1152,
178
+ "image_size": 448,
179
+ "image_token_id": 200010,
180
+ "intermediate_size": 4304,
181
+ "layer_norm_eps": 1e-06,
182
+ "model_type": "phi4_multimodal_vision",
183
+ "num_attention_heads": 16,
184
+ "num_channels": 3,
185
+ "num_hidden_layers": 27,
186
+ "patch_size": 14
187
+ },
188
+ "vocab_size": 200064
189
  }
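As a sanity check on the new layout, a hedged sketch of loading this config through the `auto_map` entries above; the repo id and `trust_remote_code=True` are assumptions.

```python
# Hedged sketch: AutoConfig resolves configuration_phi4_multimodal.Phi4MultimodalConfig
# via the "auto_map" above when trust_remote_code=True; the repo id is an assumption.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("microsoft/Phi-4-multimodal-instruct", trust_remote_code=True)
print(type(config).__name__)             # Phi4MultimodalConfig
print(config.vision_config.hidden_size)  # 1152, from "vision_config" above
print(config.audio_config.hidden_size)   # 1024, from "audio_config" above
```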
configuration_phi4_multimodal.py ADDED
@@ -0,0 +1,484 @@
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_phi4_multimodal.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # Copyright 2025 Microsoft and the HuggingFace Inc. team. All rights reserved.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ import math
22
+
23
+ from transformers.configuration_utils import PretrainedConfig
24
+
25
+
26
+ class Phi4MultimodalVisionConfig(PretrainedConfig):
27
+ r"""
28
+ This is the configuration class to store the configuration of a [`Phi4MultimodalVisionModel`]. It is used to instantiate a
29
+ Phi4Multimodal vision encoder according to the specified arguments, defining the model architecture. Instantiating a
30
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of
31
+ [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) architecture.
32
+
33
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
34
+ documentation from [`PretrainedConfig`] for more information.
35
+
36
+ Args:
37
+ hidden_size (`int`, *optional*, defaults to 1152):
38
+ Dimensionality of the encoder layers and the pooler layer.
39
+ intermediate_size (`int`, *optional*, defaults to 4304):
40
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
41
+ num_hidden_layers (`int`, *optional*, defaults to 27):
42
+ Number of hidden layers in the Transformer encoder.
43
+ num_attention_heads (`int`, *optional*, defaults to 16):
44
+ Number of attention heads for each attention layer in the Transformer encoder.
45
+ num_channels (`int`, *optional*, defaults to 3):
46
+ Number of channels in the input images.
47
+ image_size (`int`, *optional*, defaults to 448):
48
+ The size (resolution) of each image.
49
+ patch_size (`int`, *optional*, defaults to 14):
50
+ The size (resolution) of each patch.
51
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
52
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
53
+ `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
54
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
55
+ The epsilon used by the layer normalization layers.
56
+ attention_dropout (`float`, *optional*, defaults to 0.0):
57
+ The dropout ratio for the attention probabilities.
58
+ crop_size (`int`, *optional*, defaults to 448):
59
+ Crop size for the input images.
60
+ image_token_id (`int`, *optional*, defaults to 200010):
61
+ The image token id.
62
+ feature_layer (`int`, *optional*, defaults to -2):
63
+ The index of the layer of the encoder from which to extract image features.
64
+
65
+ Example:
66
+
67
+ ```python
68
+ >>> from transformers import Phi4MultimodalVisionConfig
69
+
70
+ >>> # Initializing a Phi4MultimodalVisionConfig with microsoft/Phi-4-multimodal-instruct style configuration
71
+ >>> configuration = Phi4MultimodalVisionConfig()
72
+ ```"""
73
+
74
+ model_type = "phi4_multimodal_vision"
75
+ base_config_key = "vision_config"
76
+
77
+ def __init__(
78
+ self,
79
+ hidden_size=1152,
80
+ intermediate_size=4304,
81
+ num_hidden_layers=27,
82
+ num_attention_heads=16,
83
+ num_channels=3,
84
+ image_size=448,
85
+ patch_size=14,
86
+ hidden_act="gelu_pytorch_tanh",
87
+ layer_norm_eps=1e-6,
88
+ attention_dropout=0.0,
89
+ crop_size: int = 448,
90
+ image_token_id: int = 200010,
91
+ feature_layer: int = -2,
92
+ **kwargs,
93
+ ):
94
+ super().__init__(**kwargs)
95
+
96
+ self.hidden_size = hidden_size
97
+ self.intermediate_size = intermediate_size
98
+ self.num_hidden_layers = num_hidden_layers
99
+ self.num_attention_heads = num_attention_heads
100
+ self.num_channels = num_channels
101
+ self.patch_size = patch_size
102
+ self.image_size = image_size
103
+ self.attention_dropout = attention_dropout
104
+ self.layer_norm_eps = layer_norm_eps
105
+ self.hidden_act = hidden_act
106
+ self.crop_size = crop_size
107
+ self.image_token_id = image_token_id
108
+ self.feature_layer = feature_layer
109
+
110
+
111
+ class Phi4MultimodalAudioConfig(PretrainedConfig):
112
+ r"""
113
+ This is the configuration class to store the configuration of a [`Phi4MultimodalAudioModel`]. It is used to instantiate a
114
+ Phi4Multimodal audio encoder according to the specified arguments, defining the model architecture. Instantiating a
115
+ configuration with the defaults will yield a similar configuration to that of the audio encoder of
116
+ [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) architecture.
117
+
118
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
119
+ documentation from [`PretrainedConfig`] for more information.
120
+
121
+ Args:
122
+ hidden_size (`int`, *optional*, defaults to 1024):
123
+ Dimensionality of the encoder layers.
124
+ intermediate_size (`int`, *optional*, defaults to 1536):
125
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
126
+ num_blocks (`int`, *optional*, defaults to 24):
127
+ Number of hidden layers in the Transformer encoder.
128
+ num_attention_heads (`int`, *optional*, defaults to 16):
129
+ Number of attention heads for each attention layer in the Transformer encoder.
130
+ activation (`str`, *optional*, defaults to `"swish"`):
131
+ The non-linear activation function in the MLPs.
132
+ chunk_size (`int`, *optional*, defaults to -1):
133
+ The chunk size to create the masks.
134
+ left_chunk (`int`, *optional*, defaults to 18):
135
+ The left chunk to create the masks.
136
+ dropout_rate (`float`, *optional*, defaults to 0.0):
137
+ The dropout ratio.
138
+ ext_pw_out_channel (`int`, *optional*, defaults to 1024):
139
+ Number of out channels in the point-wise conv modules.
140
+ depthwise_seperable_out_channel (`int`, *optional*, defaults to 1024):
141
+ Number of out channels in the depth-wise separable conv modules.
142
+ depthwise_multiplier (`int`, *optional*, defaults to 1):
143
+ Input size multiplier for the depth-wise separable conv modules.
144
+ kernel_size (`int`, *optional*, defaults to 3):
145
+ Kernel size for the depth-wise separable conv modules.
146
+ conv_activation (`str`, *optional*, defaults to `"swish"`):
147
+ The non-linear activation function in the conv modules.
148
+ input_size (`int`, *optional*, defaults to 80):
149
+ Input size for the audio model.
150
+ conv_glu_type (`str`, *optional*, defaults to `"swish"`):
151
+ The non-linear activation function in the point-wise conv modules.
152
+ time_reduction (`int`, *optional*, defaults to 8):
153
+ Time reduction (subsampling factor).
154
+ bias_max_distance (`int`, *optional*, defaults to 1000):
155
+ Max distance for the relative attention bias module.
156
+ bias_symmetric (`bool`, *optional*, defaults to `False`):
157
+ Whether the relative attention bias should be symmetric or not.
158
+ nemo_activation (`str`, *optional*, defaults to `"relu"`):
159
+ The non-linear activation function in the nemo conv modules.
160
+ nemo_conv_channels (`int`, *optional*, defaults to 1024):
161
+ Number of channels in the nemo conv modules.
162
+ downsample_rate (`int`, *optional*, defaults to 1):
163
+ Downsample rate for the audio feature extractor.
164
+ initializer_range (`float`, *optional*, defaults to 0.02):
165
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
166
+ audio_token_id (`int`, *optional*, defaults to 200011):
167
+ The audio token id.
168
+ feature_layer (`int`, *optional*, defaults to -2):
169
+ The index of the layer of the encoder from which to extract audio features.
170
+
171
+ Example:
172
+
173
+ ```python
174
+ >>> from transformers import Phi4MultimodalAudioConfig
175
+
176
+ >>> # Initializing a Phi4MultimodalAudioConfig with microsoft/Phi-4-multimodal-instruct style configuration
177
+ >>> configuration = Phi4MultimodalAudioConfig()
178
+ ```"""
179
+
180
+ model_type = "phi4_multimodal_audio"
181
+
182
+ def __init__(
183
+ self,
184
+ hidden_size: int = 1024,
185
+ intermediate_size: int = 1536,
186
+ num_blocks: int = 24,
187
+ num_attention_heads: int = 16,
188
+ activation: str = "swish",
189
+ chunk_size: int = -1,
190
+ left_chunk: int = 18,
191
+ dropout_rate: float = 0.0,
192
+ ext_pw_out_channel: int = 1024,
193
+ depthwise_seperable_out_channel: int = 1024,
194
+ depthwise_multiplier: int = 1,
195
+ kernel_size: int = 3,
196
+ conv_activation: str = "swish",
197
+ input_size: int = 80,
198
+ conv_glu_type: str = "swish",
199
+ time_reduction: int = 8,
200
+ bias_max_distance: int = 1000,
201
+ bias_symmetric: bool = False,
202
+ nemo_activation: str = "relu",
203
+ nemo_conv_channels: int = 1024,
204
+ downsample_rate: int = 1,
205
+ initializer_range: float = 0.02,
206
+ audio_token_id: int = 200011,
207
+ feature_layer: int = -2,
208
+ **kwargs,
209
+ ):
210
+ super().__init__(**kwargs)
211
+ self.hidden_size = hidden_size
212
+ self.num_attention_heads = num_attention_heads
213
+ self.intermediate_size = intermediate_size
214
+ self.activation = activation
215
+ self.chunk_size = chunk_size
216
+ self.left_chunk = left_chunk
217
+ self.num_blocks = num_blocks
218
+ self.dropout_rate = dropout_rate
219
+ self.ext_pw_out_channel = ext_pw_out_channel
220
+ self.depthwise_seperable_out_channel = depthwise_seperable_out_channel
221
+ self.depthwise_multiplier = depthwise_multiplier
222
+ self.kernel_size = kernel_size
223
+ self.conv_activation = conv_activation
224
+ self.input_size = input_size
225
+ self.conv_glu_type = conv_glu_type
226
+ self.time_reduction = time_reduction
227
+ self.bias_max_distance = bias_max_distance
228
+ self.bias_symmetric = bias_symmetric
229
+ self.nemo_activation = nemo_activation
230
+ self.nemo_conv_channels = nemo_conv_channels
231
+ self.downsample_rate = downsample_rate
232
+ self.audio_token_id = audio_token_id
233
+ self.initializer_range = initializer_range
234
+ self.feature_layer = feature_layer
235
+
236
+ if time_reduction % 2 != 0:
237
+ raise ValueError("`time_reduction` should be a multiple of 2!")
238
+ length = input_size
239
+ for _ in range(int(math.log(time_reduction, 2))):
240
+ length = math.floor((length - 1) / 2 + 1)
241
+ self.nemo_final_size = length
242
+
243
+
244
+ class Phi4MultimodalConfig(PretrainedConfig):
245
+ r"""
246
+ This is the configuration class to store the configuration of a [`Phi4MultimodalModel`]. It is used to instantiate a
247
+ Phi4Multimodal model according to the specified arguments, defining the model architecture. Instantiating a configuration
248
+ with the defaults will yield a similar configuration to that of the
249
+ [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) architecture.
250
+
251
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
252
+ documentation from [`PretrainedConfig`] for more information.
253
+
254
+ Args:
255
+ vocab_size (`int`, *optional*, defaults to 200064):
256
+ Vocabulary size of the Phi4Multimodal model. Defines the number of different tokens that can be represented by the
257
+ `input_ids` passed when calling [`Phi4MultimodalModel`].
258
+ hidden_size (`int`, *optional*, defaults to 3072):
259
+ Dimension of the hidden representations.
260
+ intermediate_size (`int`, *optional*, defaults to 8192):
261
+ Dimension of the MLP representations.
262
+ num_hidden_layers (`int`, *optional*, defaults to 32):
263
+ Number of hidden layers in the Transformer decoder.
264
+ num_attention_heads (`int`, *optional*, defaults to 32):
265
+ Number of attention heads for each attention layer in the Transformer decoder.
266
+ num_key_value_heads (`int`, *optional*, defaults to 8):
267
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
268
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
269
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
270
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
271
+ by meanpooling all the original heads within that group. For more details checkout [this
272
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
273
+ `num_attention_heads`.
274
+ resid_pdrop (`float`, *optional*, defaults to 0.0):
275
+ Dropout probability for mlp outputs.
276
+ embd_pdrop (`int`, *optional*, defaults to 0.0):
277
+ The dropout ratio for the embeddings.
278
+ attention_dropout (`float`, *optional*, defaults to 0.0):
279
+ The dropout ratio after computing the attention scores.
280
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
281
+ The non-linear activation function (function or string) in the decoder.
282
+ max_position_embeddings (`int`, *optional*, defaults to 131072):
283
+ The maximum sequence length that this model might ever be used with.
284
+ initializer_range (`float`, *optional*, defaults to 0.02):
285
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
286
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
287
+ The epsilon value used for the RMSNorm.
288
+ use_cache (`bool`, *optional*, defaults to `True`):
289
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
290
+ relevant if `config.is_decoder=True`.
291
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
292
+ Whether to tie weight embeddings
293
+ rope_theta (`float`, *optional*, defaults to 10000.0):
294
+ The base period of the RoPE embeddings.
295
+ rope_scaling (`dict`, *optional*):
296
+ The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
297
+ contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and
298
+ the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
299
+ divided by the number of attention heads divided by 2.
300
+ partial_rotary_factor (`float`, *optional*, defaults to `1.0`):
301
+ Percentage of the query and keys which will have rotary embedding. Must be between 0.0 and 1.0.
302
+ bos_token_id (`int`, *optional*, defaults to 199999):
303
+ The id of the "beginning-of-sequence" token.
304
+ eos_token_id (`int` or `list[int]`, *optional*, defaults to `[199999, 200020]`):
305
+ The id of the "end-of-sequence" token.
306
+ pad_token_id (`int`, *optional*, defaults to 199999):
307
+ The id of the padding token.
308
+ original_max_position_embeddings (`int`, *optional*, defaults to 4096):
309
+ The maximum sequence length that this model was trained with. This is used to determine the size of the
310
+ original RoPE embeddings when using long scaling.
311
+ sliding_window (`int`, *optional*):
312
+ Sliding window attention window size. If `None`, no sliding window is applied.
313
+ vision_config (`Phi4MultimodalVisionConfig` or `dict`, *optional*):
314
+ The vision config for the underlying image embedding model. If not provided, will default to the configuration
315
+ used to instantiate a model similar in architecture as
316
+ [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct).
317
+ audio_config (`Phi4MultimodalAudioConfig` or `dict`, *optional*):
318
+ The audio config for the underlying audio embedding model. If not provided, will default to the configuration
319
+ used to instantiate a model similar in architecture as
320
+ [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct).
321
+
322
+ Example:
323
+
324
+ ```python
325
+ >>> from transformers import Phi4MultimodalModel, Phi4MultimodalConfig
326
+
327
+ >>> # Initializing a Phi4Multimodal style configuration
328
+ >>> configuration = Phi4MultimodalConfig.from_pretrained("microsoft/Phi-4-multimodal-instruct")
329
+
330
+ >>> # Initializing a model from the configuration
331
+ >>> model = Phi4MultimodalModel(configuration)
332
+
333
+ >>> # Accessing the model configuration
334
+ >>> configuration = model.config
335
+ ```"""
336
+
337
+ model_type = "phi4_multimodal"
338
+ keys_to_ignore_at_inference = ["past_key_values"]
339
+ base_model_tp_plan = {
340
+ "layers.*.self_attn.qkv_proj": "colwise_rep", # we need to replicate here due to the slicing of qkv
341
+ "layers.*.self_attn.o_proj": "rowwise_rep", # we need to replicate here due to the slicing of qkv
342
+ "layers.*.mlp.gate_up_proj": "colwise_rep", # we need to replicate here due to the `chunk` operation
343
+ "layers.*.mlp.down_proj": "rowwise_rep", # we need to replicate here due to the `chunk` operation
344
+ }
345
+ base_model_pp_plan = {
346
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
347
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
348
+ "norm": (["hidden_states"], ["hidden_states"]),
349
+ }
350
+
351
+ sub_configs = {"audio_config": Phi4MultimodalAudioConfig, "vision_config": Phi4MultimodalVisionConfig}
352
+
353
+ def __init__(
354
+ self,
355
+ vocab_size=200064,
356
+ hidden_size=3072,
357
+ intermediate_size=8192,
358
+ num_hidden_layers=32,
359
+ num_attention_heads=32,
360
+ num_key_value_heads=8,
361
+ resid_pdrop=0.0,
362
+ embd_pdrop=0.0,
363
+ attention_dropout=0.0,
364
+ hidden_act="silu",
365
+ max_position_embeddings=131072,
366
+ initializer_range=0.02,
367
+ rms_norm_eps=1e-5,
368
+ use_cache=True,
369
+ tie_word_embeddings=False,
370
+ rope_theta=10000.0,
371
+ rope_scaling=None,
372
+ partial_rotary_factor=1,
373
+ bos_token_id=199999,
374
+ eos_token_id=[199999, 200020],
375
+ pad_token_id=199999,
376
+ original_max_position_embeddings=4096,
377
+ sliding_window=None,
378
+ vision_config=None,
379
+ audio_config=None,
380
+ **kwargs,
381
+ ):
382
+ super().__init__(
383
+ bos_token_id=bos_token_id,
384
+ eos_token_id=eos_token_id,
385
+ pad_token_id=pad_token_id,
386
+ tie_word_embeddings=tie_word_embeddings,
387
+ **kwargs,
388
+ )
389
+ self.vocab_size = vocab_size
390
+ self.hidden_size = hidden_size
391
+ self.intermediate_size = intermediate_size
392
+ self.num_hidden_layers = num_hidden_layers
393
+ self.num_attention_heads = num_attention_heads
394
+
395
+ if num_key_value_heads is None:
396
+ num_key_value_heads = num_attention_heads
397
+
398
+ self.num_key_value_heads = num_key_value_heads
399
+ self.resid_pdrop = resid_pdrop
400
+ self.embd_pdrop = embd_pdrop
401
+ self.attention_dropout = attention_dropout
402
+ self.hidden_act = hidden_act
403
+ self.max_position_embeddings = max_position_embeddings
404
+ self.original_max_position_embeddings = original_max_position_embeddings
405
+ self.initializer_range = initializer_range
406
+ self.rms_norm_eps = rms_norm_eps
407
+ self.use_cache = use_cache
408
+ self.rope_theta = rope_theta
409
+ self.rope_scaling = rope_scaling
410
+ self.partial_rotary_factor = partial_rotary_factor
411
+ self._rope_scaling_adjustment()
412
+ self._rope_scaling_validation()
413
+ self.sliding_window = sliding_window
414
+
415
+ if isinstance(vision_config, dict):
416
+ vision_config = Phi4MultimodalVisionConfig(**vision_config)
417
+ elif vision_config is None:
418
+ vision_config = Phi4MultimodalVisionConfig()
419
+ self.vision_config = vision_config
420
+
421
+ if isinstance(audio_config, dict):
422
+ audio_config = Phi4MultimodalAudioConfig(**audio_config)
423
+ elif audio_config is None:
424
+ audio_config = Phi4MultimodalAudioConfig()
425
+ self.audio_config = audio_config
426
+
427
+ def _rope_scaling_adjustment(self):
428
+ """
429
+ Adjust the `type` of the `rope_scaling` configuration for backward compatibility.
430
+ """
431
+ if self.rope_scaling is None:
432
+ return
433
+
434
+ rope_scaling_type = self.rope_scaling.get("type", None)
435
+
436
+ # For backward compatibility if previous version used "su" or "yarn"
437
+ if rope_scaling_type is not None and rope_scaling_type in ["su", "yarn"]:
438
+ self.rope_scaling["type"] = "longrope"
439
+
440
+ def _rope_scaling_validation(self):
441
+ """
442
+ Validate the `rope_scaling` configuration.
443
+ """
444
+ if self.rope_scaling is None:
445
+ return
446
+
447
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
448
+ raise ValueError(
449
+ "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
450
+ f"got {self.rope_scaling}"
451
+ )
452
+ rope_scaling_type = self.rope_scaling.get("type", None)
453
+ rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
454
+ rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
455
+ if rope_scaling_type is None or rope_scaling_type not in ["longrope"]:
456
+ raise ValueError(f"`rope_scaling`'s type field must be one of ['longrope'], got {rope_scaling_type}")
457
+ if not (
458
+ isinstance(rope_scaling_short_factor, list)
459
+ and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
460
+ ):
461
+ raise ValueError(
462
+ f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
463
+ )
464
+ rotary_ndims = int(self.hidden_size // self.num_attention_heads * self.partial_rotary_factor)
465
+ if not len(rope_scaling_short_factor) == rotary_ndims // 2:
466
+ raise ValueError(
467
+ f"`rope_scaling`'s short_factor field must have length {rotary_ndims // 2}, got {len(rope_scaling_short_factor)}"
468
+ )
469
+ if not (
470
+ isinstance(rope_scaling_long_factor, list)
471
+ and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
472
+ ):
473
+ raise ValueError(
474
+ f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
475
+ )
476
+ if not len(rope_scaling_long_factor) == rotary_ndims // 2:
477
+ raise ValueError(
478
+ f"`rope_scaling`'s long_factor field must have length {rotary_ndims // 2}, got {len(rope_scaling_long_factor)}"
479
+ )
480
+
481
+
482
+ __all__ = ["Phi4MultimodalVisionConfig", "Phi4MultimodalAudioConfig", "Phi4MultimodalConfig"]
483
+
484
+ Phi4MultimodalConfig.register_for_auto_class()
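One detail worth spelling out: `Phi4MultimodalAudioConfig.__init__` derives `nemo_final_size` from `input_size` and `time_reduction`. A small illustration (not part of the commit) using the defaults that also appear in the config.json above:

```python
# Each of the log2(time_reduction) subsampling stages maps length -> floor((length - 1) / 2 + 1).
# With input_size=80 and time_reduction=8 (three stages): 80 -> 40 -> 20 -> 10,
# matching "nemo_final_size": 10 in the config.json earlier in this commit.
import math

length, time_reduction = 80, 8
for _ in range(int(math.log(time_reduction, 2))):
    length = math.floor((length - 1) / 2 + 1)
print(length)  # 10
```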
feature_extraction_phi4_multimodal.py ADDED
@@ -0,0 +1,353 @@
1
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Processor class for Phi4Multimodal
17
+ """
18
+
19
+ from typing import Optional, Union, List, Tuple
20
+
21
+ import numpy as np
22
+
23
+ from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
24
+ from transformers.image_processing_utils import BatchFeature
25
+ from transformers.utils import TensorType, is_torch_available, logging
26
+
27
+
28
+ if is_torch_available():
29
+ import torch
30
+
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+ AudioInput = Union[
35
+ np.ndarray, "torch.Tensor", List[np.ndarray], Tuple[np.ndarray], List["torch.Tensor"], Tuple["torch.Tensor"] # noqa: F821
36
+ ]
37
+
38
+
39
+ # TODO: @eustlb, remove this once #36603 is merged.
40
+ def speechlib_mel(sample_rate, n_fft, n_mels, fmin=None, fmax=None):
41
+ """Create a Mel filter-bank the same as SpeechLib FbankFC.
42
+
43
+ Args:
44
+ sample_rate (int): Sample rate in Hz. number > 0 [scalar]
45
+ n_fft (int): FFT size. int > 0 [scalar]
46
+ n_mels (int): Mel filter size. int > 0 [scalar]
47
+ fmin (float): lowest frequency (in Hz). If None use 0.0.
48
+ float >= 0 [scalar]
49
+ fmax: highest frequency (in Hz). If None use sample_rate / 2.
50
+ float >= 0 [scalar]
51
+
52
+ Returns
53
+ out (numpy.ndarray): Mel transform matrix
54
+ [shape=(n_mels, 1 + n_fft/2)]
55
+ """
56
+
57
+ bank_width = int(n_fft // 2 + 1)
58
+ if fmax is None:
59
+ fmax = sample_rate / 2
60
+ if fmin is None:
61
+ fmin = 0
62
+ assert fmin >= 0, "fmin cannot be negative"
63
+ assert fmin < fmax <= sample_rate / 2, "fmax must be between (fmin, samplerate / 2]"
64
+
65
+ def mel(f):
66
+ return 1127.0 * np.log(1.0 + f / 700.0)
67
+
68
+ def bin2mel(fft_bin):
69
+ return 1127.0 * np.log(1.0 + fft_bin * sample_rate / (n_fft * 700.0))
70
+
71
+ def f2bin(f):
72
+ return int((f * n_fft / sample_rate) + 0.5)
73
+
74
+ # Spec 1: FFT bin range [f2bin(fmin) + 1, f2bin(fmax) - 1]
75
+ klo = f2bin(fmin) + 1
76
+ khi = f2bin(fmax)
77
+
78
+ khi = max(khi, klo)
79
+
80
+ # Spec 2: SpeechLib uses triangles in Mel space
81
+ mlo = mel(fmin)
82
+ mhi = mel(fmax)
83
+ m_centers = np.linspace(mlo, mhi, n_mels + 2)
84
+ ms = (mhi - mlo) / (n_mels + 1)
85
+
86
+ matrix = np.zeros((n_mels, bank_width), dtype=np.float32)
87
+ for m in range(0, n_mels):
88
+ left = m_centers[m]
89
+ center = m_centers[m + 1]
90
+ right = m_centers[m + 2]
91
+ for fft_bin in range(klo, khi):
92
+ mbin = bin2mel(fft_bin)
93
+ if left < mbin < right:
94
+ matrix[m, fft_bin] = 1.0 - abs(center - mbin) / ms
95
+
96
+ return matrix
97
+
98
+
99
+ class Phi4MultimodalFeatureExtractor(SequenceFeatureExtractor):
100
+ model_input_names = ["audio_input_features", "audio_embed_sizes", "audio_attention_mask"]
101
+
102
+ def __init__(
103
+ self,
104
+ feature_size: int = 80,
105
+ sampling_rate: int = 16000,
106
+ hop_length: int = 160,
107
+ n_fft: int = 512,
108
+ win_length: int = 400,
109
+ preemphasis: float = 0.97,
110
+ padding_value: float = 0.0,
111
+ audio_compression_rate: int = 8,
112
+ audio_downsample_rate: int = 1,
113
+ audio_feat_stride: int = 1,
114
+ mel_min_frequency: float = 0,
115
+ mel_max_frequency: float = 7690,
116
+ **kwargs,
117
+ ):
118
+ super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
119
+
120
+ self.hop_length = hop_length
121
+ self.n_fft = n_fft
122
+ self.win_length = win_length
123
+ self.preemphasis = preemphasis
124
+ self.padding_value = padding_value
125
+ self.audio_compression_rate = audio_compression_rate
126
+ self.audio_downsample_rate = audio_downsample_rate
127
+ self.audio_feat_stride = audio_feat_stride
128
+
129
+ # TODO: @eustlb, uncomment and remove speechlib_mel once #36603 is merged.
130
+ # self.mel_filters = mel_filter_bank(
131
+ # num_frequency_bins=self.n_fft // 2 + 1,
132
+ # num_mel_filters=self.feature_size,
133
+ # min_frequency=mel_min_frequency,
134
+ # max_frequency=mel_max_frequency,
135
+ # sampling_rate=self.sampling_rate,
136
+ # triangularize_in_mel_space=True,
137
+ # mel_scale="kaldi",
138
+ # )
139
+ self.mel_filters = speechlib_mel(
140
+ self.sampling_rate, self.n_fft, self.feature_size, mel_min_frequency, mel_max_frequency
141
+ ).T
142
+
143
+ def __call__(
144
+ self,
145
+ raw_speech: AudioInput,
146
+ sampling_rate: Optional[int] = None,
147
+ pad_to_multiple_of: Optional[int] = None,
148
+ padding: Optional[str] = "longest",
149
+ max_length: Optional[int] = None,
150
+ truncation: bool = False,
151
+ return_tensors: Optional[Union[str, TensorType]] = None,
152
+ return_attention_mask: Optional[bool] = True,
153
+ device: Optional[str] = "cpu",
154
+ **kwargs,
155
+ ) -> BatchFeature:
156
+ """
157
+ Main method to featurize and prepare for the model one or several audio sequence(s). Implementation uses PyTorch for
158
+ the STFT computation if available, otherwise a slower NumPy based one.
159
+
160
+ Args:
161
+ raw_speech (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
162
+ The sequence or batch of sequences to be processed. Each sequence can be a numpy array or PyTorch tensor.
163
+ For batched inputs, sequences can be a list of numpy arrays or PyTorch tensors, or a single numpy array or
164
+ PyTorch tensor with first dimension being the batch size.
165
+ sampling_rate (`int`, *optional*):
166
+ The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
167
+ `sampling_rate` at the forward call to prevent silent errors.
168
+ pad_to_multiple_of (`int`, *optional*, defaults to None):
169
+ If set will pad the sequence to a multiple of the provided value.
170
+ padding (`str`, *optional*, defaults to "longest"):
171
+ Padding strategy. Can be "longest" to pad to the longest sequence in the batch, or a specific length.
172
+ max_length (`int`, *optional*):
173
+ Maximum length of the returned list and optionally padding length.
174
+ truncation (`bool`, *optional*, defaults to False):
175
+ Activates truncation to cut input sequences longer than *max_length* to *max_length*.
176
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
177
+ If set, will return tensors instead of numpy arrays. Acceptable values are:
178
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
179
+ - `'np'`: Return Numpy `np.ndarray` objects.
180
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
181
+ return_attention_mask (`bool`, *optional*, defaults to `True`):
182
+ Whether to return the extracted audio input features' attention mask.
183
+ device (`str`, *optional*, defaults to "cpu"):
184
+ Specifies the device for computation of the audio features. (e.g., "cpu", "cuda")
185
+
186
+ Returns:
187
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
188
+ - **audio_input_features** -- Audio features extracted from the raw audio input, shape (batch_size, max_feature_length, feature_size).
189
+ - **audio_lengths** -- Length of each audio sample in the batch, shape (batch_size,).
190
+ - **audio_attention_mask** -- Attention mask for the audio input, shape (batch_size, max_feature_length).
191
+ If `return_tensors` is not specified, the fields will be PyTorch tensors if PyTorch is available, otherwise NumPy arrays.
192
+ """
193
+ if sampling_rate is not None:
194
+ if sampling_rate != self.sampling_rate:
195
+ raise ValueError(
196
+ f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
197
+ f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
198
+ f" was sampled with {self.sampling_rate} and not {sampling_rate}."
199
+ )
200
+ else:
201
+ logger.warning(
202
+ f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
203
+ "Failing to do so can result in silent errors that might be hard to debug."
204
+ )
205
+
206
+ # Convert to torch tensor
207
+ if isinstance(raw_speech, np.ndarray):
208
+ raw_speech = torch.tensor(raw_speech)
209
+ elif isinstance(raw_speech, (list, tuple)) and isinstance(raw_speech[0], np.ndarray):
210
+ raw_speech = [torch.tensor(speech) for speech in raw_speech]
211
+
212
+ is_batched_torch = isinstance(raw_speech, torch.Tensor) and len(raw_speech.shape) > 1
213
+ if is_batched_torch and len(raw_speech.shape) > 2:
214
+ logger.warning(
215
+ f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
216
+ "We will take the mean of the channels to convert to mono."
217
+ )
218
+ raw_speech = raw_speech.mean(-1)
219
+
220
+ is_batched_sequence = isinstance(raw_speech, (list, tuple))
221
+ if is_batched_sequence:
222
+ for speech in raw_speech:
223
+ if len(speech.shape) > 1:
224
+ logger.warning(
225
+ f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
226
+ "We will take the mean of the channels to convert to mono."
227
+ )
228
+ speech = speech.mean(-1)
229
+
230
+ if is_batched_torch or is_batched_sequence:
231
+ raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]
232
+ else:
233
+ raw_speech = [raw_speech[:, None].to(torch.float32)]
234
+
235
+ audio_lengths = [len(speech) for speech in raw_speech]
236
+
237
+ # convert into correct format for padding
238
+ batched_speech = BatchFeature(data={"audio_input_features": raw_speech, "audio_lengths": audio_lengths})
239
+ padded_inputs = self.pad(
240
+ batched_speech,
241
+ padding=padding,
242
+ max_length=max_length,
243
+ truncation=truncation,
244
+ pad_to_multiple_of=pad_to_multiple_of,
245
+ return_tensors="pt",
246
+ )
247
+ input_features = padded_inputs.audio_input_features.squeeze(-1)
248
+ audio_lengths = padded_inputs.audio_lengths
249
+
250
+ input_features = self._torch_extract_fbank_features(input_features, audio_lengths, device)
251
+
252
+ feature_lengths = (audio_lengths - self.win_length) // self.hop_length + 1
253
+ feature_lengths = feature_lengths * self.audio_feat_stride
254
+ audio_embed_sizes = self._compute_audio_embed_size(feature_lengths)
255
+
256
+ feature_attention_mask = (
257
+ torch.arange(0, feature_lengths.max()) if is_torch_available() else np.arange(0, feature_lengths.max())
258
+ )
259
+ feature_attention_mask = (
260
+ feature_attention_mask[None, :] < feature_lengths[:, None] if len(feature_lengths) > 1 else None
261
+ )
262
+
263
+ data = {
264
+ "audio_input_features": input_features,
265
+ "audio_embed_sizes": audio_embed_sizes,
266
+ }
267
+ if feature_attention_mask is not None and return_attention_mask:
268
+ data["audio_attention_mask"] = feature_attention_mask
269
+
270
+ return BatchFeature(data=data, tensor_type=return_tensors)
271
+
272
+ # TODO: @eustlb, move this to audio_utils in a general spectrogram_batch function that handles torch and numpy
273
+ def _torch_extract_fbank_features(
274
+ self, waveform: "torch.FloatTensor", audio_lengths: "torch.Tensor", device: str = "cpu"
275
+ ) -> "torch.FloatTensor":
276
+ """
277
+ Compute the log mel-scaled spectrogram of batched waveforms using PyTorch's FFT implementation.
278
+
279
+ Args:
280
+ waveform (`torch.FloatTensor` of shape `(batch_size, max_audio_length)`):
281
+ The batched waveforms.
282
+ audio_lengths (`torch.Tensor` of shape `(batch_size,)`):
283
+ The lengths of the waveforms along the max_audio_length dimension.
284
+ device (`str`, *optional*, defaults to "cpu"):
285
+ The device to run the computation on. (e.g., "cpu", "cuda")
286
+
287
+ Returns:
288
+ `torch.FloatTensor` of shape `(batch_size, max_feature_length, feature_size)`:
289
+ The log mel-scaled spectrogram of the batched waveforms.
290
+ """
291
+ fft_window = torch.hamming_window(self.win_length, periodic=False, device=device, dtype=torch.float64)
292
+
293
+ # batched implementation
294
+ batch_size = waveform.shape[0]
295
+ frames = waveform.unfold(-1, self.win_length, self.hop_length)
296
+
297
+ # ---
298
+ # the unbatched (and unpadded) original implementation skips the last few audio values that can't be included in a frame
299
+ # we need to ensure that the corresponding frames for the padded input also mask these values
300
+ if batch_size > 1:
301
+ frames = frames.clone()
302
+ # concerned batch indices
303
+ to_mask_batch_idxs = torch.arange(batch_size)[audio_lengths != audio_lengths.max()]
304
+ if to_mask_batch_idxs.numel() > 0:
305
+ batch_idxs_down = (audio_lengths[to_mask_batch_idxs] - self.win_length) // self.hop_length + 1
306
+ batch_idxs_up = audio_lengths[to_mask_batch_idxs] // self.hop_length + 1
307
+ offset_idx = batch_idxs_down.min()
308
+ max_idx = batch_idxs_up.max()
309
+
310
+ mask = torch.arange(max_idx - offset_idx, device=device).expand(to_mask_batch_idxs.shape[0], -1)
311
+ mask = ((batch_idxs_down - offset_idx).unsqueeze(1) <= mask) & (
312
+ mask < (batch_idxs_up - offset_idx).unsqueeze(1)
313
+ )
314
+ mask = mask.unsqueeze(-1).expand(-1, -1, self.win_length)
315
+ masked_frames = frames[to_mask_batch_idxs, offset_idx:max_idx].masked_fill_(mask, 0)
316
+ frames[to_mask_batch_idxs, offset_idx:max_idx] = masked_frames
317
+ # ---
318
+
319
+ # apply pre-emphasis first order filter on fft windows
320
+ frames_prev = torch.roll(frames, 1, dims=-1)
321
+ frames_prev[:, :, 0] = frames_prev[:, :, 1]
322
+ frames = (frames - self.preemphasis * frames_prev) * 32768
323
+
324
+ # apply fft
325
+ S = torch.fft.rfft(fft_window * frames.view(-1, self.win_length), n=self.n_fft, dim=1)
326
+ S = S.view(frames.shape[0], -1, S.shape[-1])
327
+ S = S.to(torch.complex64)
328
+
329
+ spec = torch.abs(S)
330
+ spec_power = spec**2
331
+
332
+ # apply triangular mel filter bank
333
+ mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32)
334
+ log_spec = torch.clamp(spec_power @ mel_filters, min=1.0)
335
+ log_spec = torch.log(log_spec)
336
+
337
+ return log_spec
338
+
339
+ def _compute_audio_embed_size(self, audio_frames):
340
+ integer = audio_frames // self.audio_compression_rate
341
+ remainder = audio_frames % self.audio_compression_rate
342
+ result = integer + (remainder > 0).to(integer.dtype)
343
+
344
+ integer = result // self.audio_downsample_rate
345
+ remainder = result % self.audio_downsample_rate
346
+ result = integer + (remainder > 0).to(integer.dtype) # qformer compression
347
+
348
+ return result
349
+
350
+
351
+ __all__ = ["Phi4MultimodalFeatureExtractor"]
352
+
353
+ Phi4MultimodalFeatureExtractor.register_for_auto_class()
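A hedged usage sketch (not part of the commit) of the feature extractor defined above, on one second of dummy 16 kHz mono audio; shapes follow the docstrings in this file.

```python
# Hedged sketch: (16000 - win_length) // hop_length + 1 = 98 frames with the defaults above.
import numpy as np

feature_extractor = Phi4MultimodalFeatureExtractor()
waveform = np.random.randn(16000).astype(np.float32)   # 1 s of dummy mono audio
features = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
print(features["audio_input_features"].shape)  # (1, 98, 80) log-mel frames
print(features["audio_embed_sizes"])           # ceil(98 / 8) = 13 audio embeddings
```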
generation_config.json CHANGED
@@ -2,10 +2,9 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 199999,
4
  "eos_token_id": [
5
- 200020,
6
- 199999
7
  ],
8
  "pad_token_id": 199999,
9
- "transformers_version": "4.46.1",
10
- "use_cache": true
11
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 199999,
4
  "eos_token_id": [
5
+ 199999,
6
+ 200020
7
  ],
8
  "pad_token_id": 199999,
9
+ "transformers_version": "4.52.0.dev0"
 
10
  }
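The net effect of this diff: the generation config now pins `transformers_version` to 4.52.0.dev0, drops the `use_cache` entry (True is the default anyway), and keeps both stop ids. A trivial hedged reconstruction of the resulting values:

```python
# Hedged reconstruction of the updated generation_config.json values.
from transformers import GenerationConfig

gen_config = GenerationConfig(
    bos_token_id=199999,
    eos_token_id=[199999, 200020],  # generation stops on either id
    pad_token_id=199999,
)
print(gen_config.eos_token_id)  # [199999, 200020]
```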
image_processing_phi4_multimodal_fast.py ADDED
@@ -0,0 +1,284 @@
1
+ # Copyright 2025 Microsoft and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Processor class for Phi4Multimodal
17
+ """
18
+
19
+ import math
20
+ from typing import List, Optional, Union, TypedDict
21
+
22
+ import torch
23
+ from torchvision.transforms import functional as F
24
+
25
+ from transformers.image_processing_utils_fast import (
26
+ BaseImageProcessorFast,
27
+ BatchFeature,
28
+ Unpack,
29
+ convert_to_rgb,
30
+ ChannelDimension
31
+ )
32
+ from transformers.image_utils import ImageInput, make_flat_list_of_images, valid_images
33
+ from transformers.utils import TensorType, logging
34
+
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
+
39
+ class DefaultFastImageProcessorKwargs(TypedDict, total=False):
40
+ do_resize: Optional[bool]
41
+ size: Optional[dict[str, int]]
42
+ default_to_square: Optional[bool]
43
+ resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]]
44
+ do_center_crop: Optional[bool]
45
+ crop_size: Optional[dict[str, int]]
46
+ do_rescale: Optional[bool]
47
+ rescale_factor: Optional[Union[int, float]]
48
+ do_normalize: Optional[bool]
49
+ image_mean: Optional[Union[float, list[float]]]
50
+ image_std: Optional[Union[float, list[float]]]
51
+ do_convert_rgb: Optional[bool]
52
+ return_tensors: Optional[Union[str, TensorType]]
53
+ data_format: Optional[ChannelDimension]
54
+ input_data_format: Optional[Union[str, ChannelDimension]]
55
+ device: Optional["torch.device"]
56
+
57
+
58
+ class Phi4MultimodalFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
59
+ image_size: Optional[int]
60
+ patch_size: Optional[int]
61
+ dynamic_hd: Optional[int]
62
+
63
+
64
+ class Phi4MultimodalImageProcessorFast(BaseImageProcessorFast):
65
+ r"""
66
+ Constructs a Phi4Multimodal image processor.
67
+ """
68
+
69
+ image_size = 448
70
+ patch_size = 14
71
+ dynamic_hd = 36
72
+ image_mean = [0.5, 0.5, 0.5]
73
+ image_std = [0.5, 0.5, 0.5]
74
+ valid_init_kwargs = Phi4MultimodalFastImageProcessorKwargs
75
+ model_input_names = ["image_pixel_values", "image_sizes", "image_attention_mask"]
76
+
77
+ def __init__(self, **kwargs: Unpack[Phi4MultimodalFastImageProcessorKwargs]):
78
+ super().__init__(**kwargs)
79
+
80
+ def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height):
81
+ best_ratio_diff = float("inf")
82
+ best_ratio = (1, 1)
83
+ area = width * height
84
+ for ratio in target_ratios:
85
+ target_aspect_ratio = ratio[0] / ratio[1]
86
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
87
+ if ratio_diff < best_ratio_diff:
88
+ best_ratio_diff = ratio_diff
89
+ best_ratio = ratio
90
+ elif ratio_diff == best_ratio_diff:
91
+ if area > 0.5 * self.image_size * self.image_size * ratio[0] * ratio[1]:
92
+ best_ratio = ratio
93
+ return best_ratio
94
+
95
+ def dynamic_preprocess(self, image, max_num=36, min_num=1):
96
+ image_size = self.image_size
97
+ patch_size = self.patch_size
98
+ mask_size = image_size // patch_size
99
+ orig_width, orig_height = image.size
100
+
101
+ w_crop_num = math.ceil(orig_width / float(image_size))
102
+ h_crop_num = math.ceil(orig_height / float(image_size))
103
+ if w_crop_num * h_crop_num > max_num:
104
+ aspect_ratio = orig_width / orig_height
105
+
106
+ # calculate the existing image aspect ratio
107
+ target_ratios = {
108
+ (i, j)
109
+ for n in range(min_num, max_num + 1)
110
+ for i in range(1, n + 1)
111
+ for j in range(1, n + 1)
112
+ if i * j <= max_num and i * j >= min_num
113
+ }
114
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
115
+
116
+ # find the closest aspect ratio to the target
117
+ target_aspect_ratio = self.find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height)
118
+
119
+ # calculate the target width and height
120
+ target_width = image_size * target_aspect_ratio[0]
121
+ target_height = image_size * target_aspect_ratio[1]
122
+ else:
123
+ target_width = image_size * w_crop_num
124
+ target_height = image_size * h_crop_num
125
+ target_aspect_ratio = (w_crop_num, h_crop_num)
126
+
127
+ # Calculate the ratio
128
+ ratio_width = target_width / orig_width
129
+ ratio_height = target_height / orig_height
130
+ if ratio_width < ratio_height:
131
+ new_size = (target_width, int(orig_height * ratio_width))
132
+ padding_width = 0
133
+ padding_height = target_height - int(orig_height * ratio_width)
134
+ else:
135
+ new_size = (int(orig_width * ratio_height), target_height)
136
+ padding_width = target_width - int(orig_width * ratio_height)
137
+ padding_height = 0
138
+
139
+ attention_mask = torch.ones((int(mask_size * target_aspect_ratio[1]), int(mask_size * target_aspect_ratio[0])))
140
+ if padding_width >= patch_size:
141
+ attention_mask[:, -math.floor(padding_width / patch_size) :] = 0
142
+ if padding_height >= patch_size:
143
+ attention_mask[-math.floor(padding_height / patch_size) :, :] = 0
144
+
145
+ if min(new_size[1], target_height) < 10 or min(new_size[0], target_width) < 10:
146
+ raise ValueError(f"the aspect ratio is very extreme {new_size}")
147
+
148
+ image = F.resize(image, [new_size[1], new_size[0]])
149
+ resized_img = F.pad(image, [0, 0, padding_width, padding_height], fill=[255, 255, 255])
150
+
151
+ return resized_img, attention_mask
152
+
153
+ def pad_to_max_num_crops(self, images, max_crops=5):
154
+ """
155
+ images: B x 3 x H x W, B<=max_crops
156
+ """
157
+ B, _, H, W = images.shape
158
+ if B < max_crops:
159
+ pad = torch.zeros(max_crops - B, 3, H, W, dtype=images.dtype, device=images.device)
160
+ images = torch.cat([images, pad], dim=0)
161
+ return images
162
+
163
+ def pad_mask_to_max_num_crops(self, masks, max_crops=5):
164
+ B, H, W = masks.shape
165
+ if B < max_crops:
166
+ pad = torch.ones(max_crops - B, H, W, dtype=masks.dtype, device=masks.device)
167
+ masks = torch.cat([masks, pad], dim=0)
168
+ return masks
169
+
170
+ def preprocess(
171
+ self,
172
+ images: ImageInput,
173
+ image_mean: Optional[Union[float, List[float]]] = None,
174
+ image_std: Optional[Union[float, List[float]]] = None,
175
+ return_tensors: Optional[Union[str, TensorType]] = None,
176
+ ):
177
+ """
178
+ Args:
179
+ images (`ImageInput`):
180
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
181
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
182
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
183
+ Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
184
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
185
+ Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
186
+ return_tensors (`str` or `TensorType`, *optional*):
187
+ The type of tensors to return. Can be one of:
188
+ - Unset: Return a list of `np.ndarray`.
189
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
190
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
191
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
192
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
193
+ """
194
+ image_mean = image_mean if image_mean is not None else self.image_mean
195
+ image_std = image_std if image_std is not None else self.image_std
196
+
197
+ images = make_flat_list_of_images(images)
198
+ if not valid_images(images):
199
+ raise ValueError(
200
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
201
+ "torch.Tensor, tf.Tensor or jax.ndarray."
202
+ )
203
+ images = [convert_to_rgb(image) for image in images]
204
+
205
+ image_size = self.image_size
206
+ patch_size = self.patch_size
207
+ mask_size = image_size // patch_size
208
+ imgs_and_masks = [self.dynamic_preprocess(image, max_num=self.dynamic_hd) for image in images]
209
+ images, image_attention_masks = [x[0] for x in imgs_and_masks], [x[1] for x in imgs_and_masks]
210
+
211
+ images = [F.to_tensor(image) for image in images]
212
+ hd_images = [F.normalize(image, image_mean, image_std) for image in images]
213
+ global_image = [
214
+ torch.nn.functional.interpolate(
215
+ image.unsqueeze(0).float(),
216
+ size=(image_size, image_size),
217
+ mode="bicubic",
218
+ ).to(image.dtype)
219
+ for image in hd_images
220
+ ]
221
+
222
+ shapes = [[image.size(1), image.size(2)] for image in hd_images]
223
+ mask_shapes = [[mask.size(0), mask.size(1)] for mask in image_attention_masks]
224
+ global_attention_mask = [torch.ones((1, mask_size, mask_size)) for _ in hd_images]
225
+
226
+ hd_images_reshape = []
227
+ for im, (h, w) in zip(hd_images, shapes):
228
+ im = im.reshape(1, 3, h // image_size, image_size, w // image_size, image_size)
229
+ im = im.permute(0, 2, 4, 1, 3, 5)
230
+ im = im.reshape(-1, 3, image_size, image_size)
231
+ hd_images_reshape.append(im.contiguous())
232
+
233
+ attention_masks_reshape = []
234
+ for mask, (h, w) in zip(image_attention_masks, mask_shapes):
235
+ mask = mask.reshape(h // mask_size, mask_size, w // mask_size, mask_size)
236
+ mask = mask.transpose(1, 2)
237
+ mask = mask.reshape(-1, mask_size, mask_size)
238
+ attention_masks_reshape.append(mask.contiguous())
239
+
240
+ downsample_attention_masks = []
241
+ for mask, (h, w) in zip(attention_masks_reshape, mask_shapes):
242
+ mask = mask[:, 0::2, 0::2]
243
+ mask = mask.reshape(
244
+ h // mask_size, w // mask_size, mask_size // 2 + mask_size % 2, mask_size // 2 + mask_size % 2
245
+ )
246
+ mask = mask.transpose(1, 2)
247
+ mask = mask.reshape(mask.size(0) * mask.size(1), mask.size(2) * mask.size(3))
248
+ downsample_attention_masks.append(mask)
249
+
250
+ num_img_tokens = [
251
+ 256 + 1 + int(mask.sum().item()) + int(mask[:, 0].sum().item()) + 16 for mask in downsample_attention_masks
252
+ ]
253
+
254
+ hd_images_reshape = [
255
+ torch.cat([_global_image] + [_im], dim=0) for _global_image, _im in zip(global_image, hd_images_reshape)
256
+ ]
257
+ hd_masks_reshape = [
258
+ torch.cat([_global_mask] + [_mask], dim=0)
259
+ for _global_mask, _mask in zip(global_attention_mask, attention_masks_reshape)
260
+ ]
261
+ max_crops = max([img.size(0) for img in hd_images_reshape])
262
+ image_transformed = [self.pad_to_max_num_crops(im, max_crops) for im in hd_images_reshape]
263
+ image_transformed = torch.stack(image_transformed, dim=0)
264
+ mask_transformed = [self.pad_mask_to_max_num_crops(mask, max_crops) for mask in hd_masks_reshape]
265
+ mask_transformed = torch.stack(mask_transformed, dim=0)
266
+
267
+ returned_input_image_embeds = image_transformed
268
+ returned_image_sizes = torch.tensor(shapes, dtype=torch.long)
269
+ returned_image_attention_mask = mask_transformed
270
+ returned_num_img_tokens = num_img_tokens
271
+
272
+ data = {
273
+ "image_pixel_values": returned_input_image_embeds,
274
+ "image_sizes": returned_image_sizes,
275
+ "image_attention_mask": returned_image_attention_mask,
276
+ "num_img_tokens": returned_num_img_tokens,
277
+ }
278
+
279
+ return BatchFeature(data=data, tensor_type=return_tensors)
280
+
281
+
282
+ __all__ = ["Phi4MultimodalImageProcessorFast"]
283
+
284
+ Phi4MultimodalImageProcessorFast.register_for_auto_class()
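The fast image processor defined above tiles each image into at most `dynamic_hd` crops plus one global view, normalizes them, and reports a per-image token count. Because it registers itself for auto-class loading, it should be reachable through `AutoImageProcessor` with `trust_remote_code=True`. A minimal sketch, assuming the folder is published under a repo id such as `microsoft/Phi-4-multimodal-instruct` and that the class follows the usual `__call__` → `preprocess` convention (both assumptions, not taken from this diff):

# Minimal sketch: run the custom fast image processor on a dummy image.
# REPO_ID is an assumption; adjust it to wherever this folder lives on the Hub.
import numpy as np
from PIL import Image
from transformers import AutoImageProcessor

REPO_ID = "microsoft/Phi-4-multimodal-instruct"  # assumption
image_processor = AutoImageProcessor.from_pretrained(REPO_ID, trust_remote_code=True)

# Dummy RGB image; real inputs can be PIL images, numpy arrays, or torch tensors.
image = Image.fromarray(np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8))

outputs = image_processor(images=image, return_tensors="pt")
# Keys come from the `data` dict above: image_pixel_values, image_sizes,
# image_attention_mask, num_img_tokens.
print(outputs["image_pixel_values"].shape)  # (num_images, max_crops, 3, image_size, image_size)
print(outputs["num_img_tokens"])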
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57b93f5d0c9422c0b76b68119660187989bd8bb47848994376be3ac53eb61a95
3
+ size 4903637712
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd6f60df08041b5c48afe7d7624d4de6e9d7d86162dec7a7e908a71d595e2967
3
+ size 4584575136
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
 
modeling_phi4_multimodal.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json CHANGED
@@ -1,14 +1,21 @@
1
  {
2
  "auto_map": {
3
- "AutoProcessor": "processing_phi4mm.Phi4MMProcessor",
4
- "AutoImageProcessor": "processing_phi4mm.Phi4MMImageProcessor",
5
- "AutoFeatureExtractor": "processing_phi4mm.Phi4MMAudioFeatureExtractor"
6
  },
7
- "image_processor_type": "Phi4MMImageProcessor",
8
- "processor_class": "Phi4MMProcessor",
9
- "feature_extractor_type": "Phi4MMAudioFeatureExtractor",
10
  "audio_compression_rate": 8,
11
  "audio_downsample_rate": 1,
12
  "audio_feat_stride": 1,
13
- "dynamic_hd": 36
 
 
 
 
 
 
 
 
 
 
14
  }
 
1
  {
2
  "auto_map": {
3
+ "AutoProcessor": "processing_phi4_multimodal.Phi4MultimodalProcessor",
4
+ "AutoImageProcessor": "image_processing_phi4_multimodal_fast.Phi4MultimodalImageProcessorFast",
5
+ "AutoFeatureExtractor": "feature_extraction_phi4_multimodal.Phi4MultimodalFeatureExtractor"
6
  },
 
 
 
7
  "audio_compression_rate": 8,
8
  "audio_downsample_rate": 1,
9
  "audio_feat_stride": 1,
10
+ "feature_extractor_type": "Phi4MultimodalFeatureExtractor",
11
+ "feature_size": 80,
12
+ "hop_length": 160,
13
+ "n_fft": 512,
14
+ "padding_side": "right",
15
+ "padding_value": 0.0,
16
+ "preemphasis": 0.97,
17
+ "processor_class": "Phi4MultimodalProcessor",
18
+ "return_attention_mask": true,
19
+ "sampling_rate": 16000,
20
+ "win_length": 400
21
  }
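The rewritten preprocessor_config.json points the auto classes at the new remote-code modules and makes the audio front-end explicit: 80-dimensional features at a 16000 Hz sampling rate, a 400-sample window, a 160-sample hop, and a 512-point FFT. A minimal sketch of exercising the feature extractor it declares, assuming the same hypothetical repo id as above and a waveform already at 16 kHz:

# Minimal sketch: load the declared feature extractor and featurize one second of silence.
# REPO_ID is an assumption; the output key names are not asserted here.
import numpy as np
from transformers import AutoFeatureExtractor

REPO_ID = "microsoft/Phi-4-multimodal-instruct"  # assumption
feature_extractor = AutoFeatureExtractor.from_pretrained(REPO_ID, trust_remote_code=True)

waveform = np.zeros(16000, dtype=np.float32)  # 1 s of silence at the declared 16 kHz rate
features = feature_extractor([waveform], return_tensors="pt")
print({k: tuple(v.shape) if hasattr(v, "shape") else v for k, v in features.items()})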
processing_phi4_multimodal.py ADDED
@@ -0,0 +1,541 @@
1
+ # Copyright 2025 Microsoft and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Processor class for Phi4Multimodal
17
+ """
18
+
19
+ import re
20
+ import os
21
+ import requests
22
+ import base64
23
+ from io import BytesIO
24
+ from typing import List, Optional, Union, TypedDict
25
+
26
+ import librosa
27
+ import numpy as np
28
+ import PIL.Image
+ import PIL.ImageOps  # needed for exif_transpose() in load_image below
29
+
30
+ from transformers.image_processing_utils import BatchFeature
31
+ from transformers.image_utils import ImageInput
32
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, CommonKwargs, ProcessorChatTemplateKwargs
33
+ from transformers.tokenization_utils_base import TextInput
34
+ from transformers.utils import logging
35
+
36
+
37
+ from .feature_extraction_phi4_multimodal import AudioInput
38
+
39
+
40
+ logger = logging.get_logger(__name__)
41
+
42
+
43
+ class ChatTemplateLoadKwargs(TypedDict, total=False):
44
+ """
45
+ Keyword arguments used to load multimodal data in processor chat templates.
46
+
47
+ num_frames (`int`, *optional*):
48
+ Number of frames to sample uniformly. If not passed, the whole video is loaded.
49
+ video_load_backend (`str`, *optional*, defaults to `"pyav"`):
50
+ The backend to use when loading the video which will be used only when there are videos in the conversation.
51
+ Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "pyav" because it is the only backend
52
+ that supports all types of sources to load from.
53
+ video_fps (`int`, *optional*):
54
+ Number of frames to sample per second. Should be passed only when `num_frames=None`.
55
+ If not specified and `num_frames==None`, all frames are sampled.
56
+ sample_indices_fn (`Callable`, *optional*):
57
+ A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
58
+ a different sampling technique than provided by the `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
59
+ If not provided, simple uniform sampling with fps is performed; otherwise `sample_indices_fn` has priority over the other args.
60
+ The function expects at input the all args along with all kwargs passed to `load_video` and should output valid
61
+ indices at which the video should be sampled. For example:
62
+
63
+ def sample_indices_fn(num_frames, fps, metadata, **kwargs):
64
+ # add your sampling logic here ...
65
+ return np.linspace(start_idx, end_idx, num_frames, dtype=int)
66
+ """
67
+
68
+ num_frames: Optional[int] = None
69
+ video_load_backend: Optional[str] = "pyav"
70
+ video_fps: Optional[int] = None
71
+ sampling_rate: Optional[int] = 16_000
72
+ load_audio_from_video: Optional[bool] = False
73
+
74
+
75
+ class AllKwargsForChatTemplate(
76
+ TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, CommonKwargs, ProcessorChatTemplateKwargs
77
+ ):
78
+ processor_kwargs: ProcessingKwargs = {
79
+ **ProcessingKwargs.__annotations__,
80
+ }
81
+ mm_load_kwargs: ChatTemplateLoadKwargs = {
82
+ **TextKwargs.__annotations__,
83
+ }
84
+ template_kwargs: ProcessorChatTemplateKwargs = {
85
+ **ProcessorChatTemplateKwargs.__annotations__,
86
+ }
87
+
88
+
89
+ class Phi4MultimodalProcessorKwargs(ProcessingKwargs, total=False):
90
+ _defaults = {
91
+ "audio_kwargs": {
92
+ "device": "cpu",
93
+ },
94
+ }
95
+
96
+
97
+ def load_audio(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None) -> np.ndarray:
98
+ """
99
+ Loads `audio` to an np.ndarray object.
100
+
101
+ Args:
102
+ audio (`str` or `np.ndarray`):
103
+ The audio to be loaded into the numpy array format.
104
+ sampling_rate (`int`, *optional*, defaults to 16000):
105
+ The sampling rate to be used when loading the audio. It should be the same as the
106
+ sampling rate that the model you will be using was trained with.
107
+ timeout (`float`, *optional*):
108
+ The timeout value in seconds for the URL request.
109
+
110
+ Returns:
111
+ `np.ndarray`: A numpy array representing the audio.
112
+ """
113
+
114
+ if isinstance(audio, str):
115
+ # Load audio from URL (e.g https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav)
116
+ if audio.startswith("http://") or audio.startswith("https://"):
117
+ audio = librosa.load(BytesIO(requests.get(audio, timeout=timeout).content), sr=sampling_rate)[0]
118
+ elif os.path.isfile(audio):
119
+ audio = librosa.load(audio, sr=sampling_rate)[0]
120
+ elif isinstance(audio, np.ndarray):
121
+ audio = audio
122
+ else:
123
+ raise TypeError(
124
+ "Incorrect format used for `audio`. Should be an url linking to an audio, a local path, or numpy array."
125
+ )
126
+ return audio
127
+
128
+
129
+ def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] = None) -> "PIL.Image.Image":
130
+ """
131
+ Loads `image` to a PIL Image.
132
+
133
+ Args:
134
+ image (`str` or `PIL.Image.Image`):
135
+ The image to convert to the PIL Image format.
136
+ timeout (`float`, *optional*):
137
+ The timeout value in seconds for the URL request.
138
+
139
+ Returns:
140
+ `PIL.Image.Image`: A PIL Image.
141
+ """
142
+ if isinstance(image, str):
143
+ if image.startswith("http://") or image.startswith("https://"):
144
+ # We need to actually check for a real protocol, otherwise it's impossible to use a local file
145
+ # like http_huggingface_co.png
146
+ image = PIL.Image.open(BytesIO(requests.get(image, timeout=timeout).content))
147
+ elif os.path.isfile(image):
148
+ image = PIL.Image.open(image)
149
+ else:
150
+ if image.startswith("data:image/"):
151
+ image = image.split(",")[1]
152
+
153
+ # Try to load as base64
154
+ try:
155
+ b64 = base64.decodebytes(image.encode())
156
+ image = PIL.Image.open(BytesIO(b64))
157
+ except Exception as e:
158
+ raise ValueError(
159
+ f"Incorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got {image}. Failed with {e}"
160
+ )
161
+ elif isinstance(image, PIL.Image.Image):
162
+ image = image
163
+ else:
164
+ raise TypeError(
165
+ "Incorrect format used for image. Should be an url linking to an image, a base64 string, a local path, or a PIL image."
166
+ )
167
+ image = PIL.ImageOps.exif_transpose(image)
168
+ image = image.convert("RGB")
169
+ return image
170
+
171
+
172
+ class Phi4MultimodalProcessor(ProcessorMixin):
173
+ r"""
174
+ Constructs a Phi4Multimodal processor which wraps an image processor, an audio processor, and a GPT tokenizer into a single processor.
175
+
176
+ [`Phi4MultimodalProcessor`] offers all the functionalities of [`Phi4MultimodalImageProcessorFast`] and [`GPT2Tokenizer`]. See the
177
+ [`~Phi4MultimodalProcessor.__call__`] and [`~Phi4MultimodalProcessor.decode`] for more information.
178
+
179
+ Args:
180
+ image_processor (`Phi4MultimodalImageProcessorFast`):
181
+ The image processor to use for images.
182
+ audio_processor (`Phi4MultimodalFeatureExtractor`):
183
+ The audio processor to use for audio inputs.
184
+ tokenizer (`GPT2TokenizerFast`):
185
+ The tokenizer to use for text.
186
+ fake_image_token_pattern (`str`, *optional*, defaults to `r"<\|image_\d+\|>"`):
187
+ The fake image token pattern.
188
+ fake_audio_token_pattern (`str`, *optional*, defaults to `r"<\|audio_\d+\|>"`):
189
+ The fake audio token pattern.
190
+ """
191
+
192
+ attributes = ["image_processor", "audio_processor", "tokenizer"]
193
+ tokenizer_class = "GPT2TokenizerFast"
194
+ image_processor_class = "AutoImageProcessor"
195
+ audio_processor_class = "AutoFeatureExtractor"
196
+ valid_kwargs = ["chat_template"]
197
+
198
+ def __init__(
199
+ self,
200
+ image_processor,
201
+ audio_processor,
202
+ tokenizer,
203
+ **kwargs,
204
+ ):
205
+ self.image_token = tokenizer.image_token
206
+ self.image_token_id = tokenizer.image_token_id
207
+ self.audio_token = tokenizer.audio_token
208
+ self.audio_token_id = tokenizer.audio_token_id
209
+ super().__init__(image_processor, audio_processor, tokenizer, **kwargs)
210
+
211
+ def __call__(
212
+ self,
213
+ text: Union[TextInput, List[TextInput]],
214
+ images: Optional[ImageInput] = None,
215
+ audio: Optional[AudioInput] = None,
216
+ **kwargs: Unpack[ProcessingKwargs],
217
+ ) -> BatchFeature:
218
+ """
219
+ Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
220
+ and `kwargs` arguments to GPT2Tokenizer's [`~GPT2Tokenizer.__call__`] if `text` is not `None` to encode
221
+ the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
222
+ Phi4MultimodalImageProcessorFast's [`~Phi4MultimodalImageProcessorFast.__call__`] if `images` is not `None`. Please refer to the docstring
223
+ of the above two methods for more information.
224
+
225
+ Args:
226
+ text (`str`, `List[str]`, `List[List[str]]`):
227
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
228
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
229
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
230
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
231
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
232
+ tensor. Both channels-first and channels-last formats are supported.
233
+ audio (`List[Union[np.ndarray, torch.Tensor]]`):
234
+ List of the audios to be prepared.
235
+
236
+ Returns:
237
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
238
+
239
+ - **input_ids** -- List of token ids to be fed to a model.
240
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
241
+ - **image_pixel_values** -- Pixel values to be fed to a model.
242
+ - **image_sizes** -- List of tuples specifying the size of each image in `image_pixel_values`.
243
+ - **image_attention_mask** -- List of attention masks for each image in `image_pixel_values`.
244
+ - **input_audio_embeds** -- Audio embeddings to be fed to a model.
245
+ - **audio_embed_sizes** -- List of integers specifying the size of each audio in `input_audio_embeds`.
246
+ """
247
+
248
+ output_kwargs = self._merge_kwargs(Phi4MultimodalProcessorKwargs, self.tokenizer.init_kwargs, **kwargs)
249
+ image_kwargs = output_kwargs["images_kwargs"]
250
+ audio_kwargs = output_kwargs["audio_kwargs"]
251
+
252
+ image_inputs = self.image_processor(images, **image_kwargs) if images is not None else {}
253
+ audio_inputs = self.audio_processor(audio, **audio_kwargs) if audio is not None else {}
254
+
255
+ # We pop here for images as we don't need it later
256
+ num_img_tokens = image_inputs.pop("num_img_tokens", [])
257
+ audio_embed_sizes = audio_inputs.get("audio_embed_sizes", [])
258
+
259
+ # Replace certain special tokens for compatibility
260
+ if isinstance(text, str):
261
+ text = [text]
262
+ elif not isinstance(text, list) and not isinstance(text[0], str):
263
+ raise ValueError("Invalid input text. Please provide a string, or a list of strings")
264
+
265
+ image_token = self.tokenizer.image_token
266
+ audio_token = self.tokenizer.audio_token
267
+
268
+ # Check that the number of special tokens is sound
269
+ concatenated_prompt = "".join(text)
270
+ if concatenated_prompt.count(image_token) != len(num_img_tokens):
271
+ raise ValueError(
272
+ "You should add as much image tokens `<|image|>` in your prompt as you pass `images` to the processor. ",
273
+ f"Input contains {concatenated_prompt.count(image_token)} tokens != {len(num_img_tokens)} images",
274
+ )
275
+ if concatenated_prompt.count(audio_token) != len(audio_embed_sizes):
276
+ raise ValueError(
277
+ "You should add as much audio tokens `<|audio|>` in your prompt as you pass `audios` to the processor. "
278
+ f"Input contains {concatenated_prompt.count(audio_token)} tokens != {len(audio_embed_sizes)} audios"
279
+ )
280
+
281
+ # Add appropriate number of image/audio tokens (note that the count of replacement is dynamic)
282
+ image_count_iter = iter(num_img_tokens)
283
+ audio_count_iter = iter(audio_embed_sizes)
284
+ processed_text = [
285
+ re.sub(re.escape(image_token), lambda _: image_token * next(image_count_iter), t) for t in text
286
+ ]
287
+ processed_text = [
288
+ re.sub(re.escape(audio_token), lambda _: audio_token * next(audio_count_iter), t) for t in processed_text
289
+ ]
290
+
291
+ return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
292
+ text_inputs = self.tokenizer(processed_text, **output_kwargs["text_kwargs"])
293
+ self._check_special_mm_tokens(processed_text, text_inputs, modalities=["image"])
294
+
295
+ # prepare batch feature
296
+ data = {
297
+ **text_inputs,
298
+ **image_inputs,
299
+ **audio_inputs,
300
+ }
301
+
302
+ return BatchFeature(data=data, tensor_type=return_tensors)
303
+
304
+ def batch_decode(self, *args, **kwargs):
305
+ """
306
+ This method forwards all its arguments to GPT2Tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
307
+ refer to the docstring of this method for more information.
308
+ """
309
+ return self.tokenizer.batch_decode(*args, **kwargs)
310
+
311
+ def decode(self, *args, **kwargs):
312
+ """
313
+ This method forwards all its arguments to GPT2Tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
314
+ the docstring of this method for more information.
315
+ """
316
+ return self.tokenizer.decode(*args, **kwargs)
317
+
318
+ @property
319
+ def model_input_names(self):
320
+ tokenizer_input_names = self.tokenizer.model_input_names
321
+ image_processor_input_names = self.image_processor.model_input_names
322
+ audio_processor_input_names = self.audio_processor.model_input_names
323
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names + audio_processor_input_names))
324
+
325
+ def _check_special_mm_tokens(self, text: list[str], text_inputs: "BatchFeature", modalities: list[str]):
326
+ """
327
+ Checks that number of special tokens in text and processed text is same. The count can be different
328
+ if tokenized text was truncated, leading to issues in model code.
329
+ """
330
+ for modality in modalities:
331
+ token_str = getattr(self, f"{modality}_token")
332
+ token_id = getattr(self, f"{modality}_token_id")
333
+ ids_count = [list(ids).count(token_id) for ids in text_inputs["input_ids"]]
334
+ text_count = [sample.count(token_str) for sample in text]
335
+
336
+ if ids_count != text_count:
337
+ raise ValueError(
338
+ f"Mismatch in `{modality}` token count between text and `input_ids`. Got ids={ids_count} and text={text_count}. "
339
+ "Likely due to `truncation='max_length'`. Please disable truncation or increase `max_length`."
340
+ )
341
+
342
+ def apply_chat_template(
343
+ self,
344
+ conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
345
+ chat_template: Optional[str] = None,
346
+ **kwargs: Unpack[AllKwargsForChatTemplate],
347
+ ) -> str:
348
+ """
349
+ Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input
350
+ conversations to turn them into a single tokenizable string.
351
+
352
+ The input is expected to be in the following format, where each message content is a list consisting of text and
353
+ optionally image or video inputs. One can also provide an image, video, URL or local path which will be used to form
354
+ `pixel_values` when `return_dict=True`. If not provided, one will get only the formatted text, optionally tokenized text.
355
+
356
+ conversation = [
357
+ {
358
+ "role": "user",
359
+ "content": [
360
+ {"type": "image", "image": "https://www.ilankelman.org/stopsigns/australia.jpg"},
361
+ {"type": "text", "text": "Please describe this image in detail."},
362
+ ],
363
+ },
364
+ ]
365
+
366
+ Args:
367
+ conversation (`Union[List[Dict, [str, str]], List[List[Dict[str, str]]]]`):
368
+ The conversation to format.
369
+ chat_template (`Optional[str]`, *optional*):
370
+ The Jinja template to use for formatting the conversation. If not provided, the tokenizer's
371
+ chat template is used.
372
+ """
373
+
374
+ if chat_template is None:
375
+ if isinstance(self.chat_template, dict) and "default" in self.chat_template:
376
+ chat_template = self.chat_template["default"]
377
+ elif isinstance(self.chat_template, dict):
378
+ raise ValueError(
379
+ 'The processor has multiple chat templates but none of them are named "default". You need to specify'
380
+ " which one to use by passing the `chat_template` argument. Available templates are: "
381
+ f"{', '.join(self.chat_template.keys())}"
382
+ )
383
+ elif self.chat_template is not None:
384
+ chat_template = self.chat_template
385
+ else:
386
+ raise ValueError(
387
+ "Cannot use apply_chat_template because this processor does not have a chat template."
388
+ )
389
+ else:
390
+ if isinstance(self.chat_template, dict) and chat_template in self.chat_template:
391
+ # It's the name of a template, not a full template string
392
+ chat_template = self.chat_template[chat_template]
393
+ else:
394
+ # It's a template string, render it directly
395
+ chat_template = chat_template
396
+
397
+ # Fill sets of kwargs that should be used by different parts of template
398
+ processed_kwargs = {
399
+ "mm_load_kwargs": {},
400
+ "template_kwargs": {},
401
+ }
402
+
403
+ for kwarg_type in processed_kwargs:
404
+ for key in AllKwargsForChatTemplate.__annotations__[kwarg_type].__annotations__.keys():
405
+ kwarg_type_defaults = AllKwargsForChatTemplate.__annotations__[kwarg_type]
406
+ default_value = getattr(kwarg_type_defaults, key, None)
407
+ value = kwargs.pop(key, default_value)
408
+ if value is not None and not isinstance(value, dict):
409
+ processed_kwargs[kwarg_type][key] = value
410
+
411
+ if isinstance(conversation, (list, tuple)) and (
412
+ isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
413
+ ):
414
+ is_batched = True
415
+ conversations = conversation
416
+ else:
417
+ is_batched = False
418
+ conversations = [conversation]
419
+
420
+ tokenize = processed_kwargs["template_kwargs"].pop("tokenize", False)
421
+ return_dict = processed_kwargs["template_kwargs"].pop("return_dict", False)
422
+ mm_load_kwargs = processed_kwargs["mm_load_kwargs"]
423
+
424
+ if tokenize:
425
+ batch_images, batch_videos = [], []
426
+ batch_audios = []
427
+ batch_video_metadata = []
428
+ for conversation in conversations:
429
+ images, videos = [], []
430
+ video_metadata = []
431
+ for message in conversation:
432
+ visuals = [content for content in message["content"] if content["type"] in ["image", "video"]]
433
+ audio_fnames = [
434
+ content[key]
435
+ for content in message["content"]
436
+ for key in ["audio", "url", "path"]
437
+ if key in content and content["type"] == "audio"
438
+ ]
439
+ image_fnames = [
440
+ vision_info[key]
441
+ for vision_info in visuals
442
+ for key in ["image", "url", "path", "base64"]
443
+ if key in vision_info and vision_info["type"] == "image"
444
+ ]
445
+ video_fnames = [
446
+ vision_info[key]
447
+ for vision_info in visuals
448
+ for key in ["video", "url", "path"]
449
+ if key in vision_info and vision_info["type"] == "video"
450
+ ]
451
+
452
+ for fname in image_fnames:
453
+ images.append(load_image(fname))
454
+
455
+ # Audio models do not accept nested lists of audios (yet!), so we construct a flat input audio list
456
+ if not mm_load_kwargs["load_audio_from_video"]:
457
+ for fname in audio_fnames:
458
+ batch_audios.append(load_audio(fname, sampling_rate=mm_load_kwargs["sampling_rate"]))
459
+ else:
460
+ for fname in video_fnames:
461
+ batch_audios.append(load_audio(fname, sampling_rate=mm_load_kwargs["sampling_rate"]))
462
+
463
+ for fname in video_fnames:
464
+ if isinstance(fname, (list, tuple)) and isinstance(fname[0], str):
465
+ video = [np.array(load_image(image_fname)) for image_fname in fname]
466
+ # create a 4D video because `load_video` always returns a 4D array
467
+ video = np.stack(video)
468
+ metadata = None
469
+ logger.warning(
470
+ "When loading the video from list of images, we cannot infer metadata such as `fps` or `duration`. "
471
+ "If your model uses this metadata during processing, please load the whole video and let the model sample frames instead."
472
+ )
473
+ else:
474
+ # TODO: raushan, should be `self.video_processor.load_video_for_model` when API is added
475
+ video, metadata = self._load_video_for_model(
476
+ fname,
477
+ num_frames=mm_load_kwargs.get("num_frames", None),
478
+ fps=mm_load_kwargs.get("video_fps", None),
479
+ backend=mm_load_kwargs["video_load_backend"],
480
+ **kwargs,
481
+ )
482
+ videos.append(video)
483
+ video_metadata.append(metadata)
484
+
485
+ # Currently all processors can accept nested list of batches, but not flat list of visuals
486
+ # So we'll make a batched list of images and let the processor handle it
487
+ if images:
488
+ batch_images.append(images)
489
+ if videos:
490
+ batch_videos.append(videos)
491
+ batch_video_metadata.append(video_metadata)
492
+
493
+ # Process conversation with video/image information if needed. Then convert into a prompt using Jinja template
494
+ conversations = self._process_messages_for_chat_template(
495
+ conversations,
496
+ batch_images=batch_images,
497
+ batch_videos=batch_videos,
498
+ batch_video_metadata=batch_video_metadata,
499
+ **processed_kwargs["mm_load_kwargs"],
500
+ )
501
+
502
+ prompt = self.tokenizer.apply_chat_template(
503
+ conversations,
504
+ chat_template=chat_template,
505
+ tokenize=False,
506
+ return_dict=False,
507
+ **processed_kwargs["template_kwargs"],
508
+ )
509
+
510
+ if not is_batched:
511
+ prompt = prompt[0]
512
+
513
+ if tokenize:
514
+ # Tokenizer's `apply_chat_template` never adds special tokens when tokenizing
515
+ # But processor's `apply_chat_template` didn't have an option to tokenize, so users had to format the prompt
516
+ # and pass it to the processor. Users thus never worried about special tokens relying on processor handling
517
+ # everything internally. The line below keeps BC for that and makes it possible to work with models that have
518
+ # special tokens in the template (consistent with tokenizers). We don't want to raise a warning, as it would flood the command line
519
+ # without an actionable solution for users.
520
+ single_prompt = prompt[0] if is_batched else prompt
521
+ if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token):
522
+ kwargs["add_special_tokens"] = False
523
+
524
+ out = self(
525
+ text=prompt,
526
+ images=batch_images if batch_images else None,
527
+ videos=batch_videos if batch_videos else None,
528
+ audio=batch_audios if batch_audios else None,
529
+ **kwargs,
530
+ )
531
+ if return_dict:
532
+ return out
533
+ else:
534
+ return out["input_ids"]
535
+ return prompt
536
+
537
+
538
+ __all__ = ["Phi4MultimodalProcessor"]
539
+
540
+
541
+ Phi4MultimodalProcessor.register_for_auto_class()
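Taken together, the processor above turns each `<|image|>` / `<|audio|>` placeholder into the right number of repeated tokens (using `num_img_tokens` and `audio_embed_sizes`) and merges the tokenizer, image, and audio outputs into a single `BatchFeature`. A minimal end-to-end sketch, assuming the hypothetical repo id used earlier and reusing the example image URL already present in this file's docstrings:

# Minimal sketch: format a conversation with the chat template, then call the processor.
# REPO_ID is an assumption; the image URL is the example used in the docstrings above.
import requests
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor

REPO_ID = "microsoft/Phi-4-multimodal-instruct"  # assumption
processor = AutoProcessor.from_pretrained(REPO_ID, trust_remote_code=True)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Please describe this image in detail."},
        ],
    },
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image = Image.open(BytesIO(requests.get(url, timeout=10).content)).convert("RGB")

inputs = processor(text=prompt, images=[image], return_tensors="pt")
print(inputs["input_ids"].shape, inputs["image_pixel_values"].shape)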
special_tokens_map.json CHANGED
@@ -13,7 +13,13 @@
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
- "pad_token": "<|endoftext|>",
 
 
 
 
 
 
17
  "unk_token": {
18
  "content": "<|endoftext|>",
19
  "lstrip": false,
 
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
  "unk_token": {
24
  "content": "<|endoftext|>",
25
  "lstrip": false,
speech-lora/adapter_config.json CHANGED
@@ -1,23 +1,31 @@
1
  {
 
2
  "auto_mapping": null,
3
- "base_model_name_or_path": "TBA",
4
  "bias": "none",
 
 
 
5
  "fan_in_fan_out": false,
6
- "inference_mode": true,
7
  "init_lora_weights": true,
 
8
  "layers_pattern": null,
9
  "layers_to_transform": null,
 
10
  "lora_alpha": 640,
 
11
  "lora_dropout": 0.01,
12
- "modules_to_save": [],
 
 
13
  "peft_type": "LORA",
14
  "r": 320,
 
15
  "revision": null,
16
- "target_modules": [
17
- "qkv_proj",
18
- "o_proj",
19
- "gate_up_proj",
20
- "down_proj"
21
- ],
22
- "task_type": "CAUSAL_LM"
23
  }
 
1
  {
2
+ "alpha_pattern": {},
3
  "auto_mapping": null,
4
+ "base_model_name_or_path": null,
5
  "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
  "fan_in_fan_out": false,
10
+ "inference_mode": false,
11
  "init_lora_weights": true,
12
+ "layer_replication": null,
13
  "layers_pattern": null,
14
  "layers_to_transform": null,
15
+ "loftq_config": {},
16
  "lora_alpha": 640,
17
+ "lora_bias": false,
18
  "lora_dropout": 0.01,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
  "peft_type": "LORA",
23
  "r": 320,
24
+ "rank_pattern": {},
25
  "revision": null,
26
+ "target_modules": "model.layers.\\d+.((self_attn.(qkv|o)_proj)|(mlp.(gate_up|down)_proj))",
27
+ "task_type": "CAUSAL_LM",
28
+ "trainable_token_indices": null,
29
+ "use_dora": false,
30
+ "use_rslora": false
 
 
31
  }
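Compared with the old config, the speech adapter now carries the full, current PEFT schema and replaces the bare module-name list with a regular expression over the decoder layers, so only `qkv_proj`, `o_proj`, `gate_up_proj`, and `down_proj` inside `model.layers.N` are adapted. PEFT full-matches module names against a string-valued `target_modules`; a small sketch with hypothetical layer names (the names are illustrative, not read from the checkpoint):

# Minimal sketch: which module names the new speech-lora `target_modules` regex selects.
# The candidate names below are hypothetical examples in the usual decoder naming scheme.
import re

target_modules = r"model.layers.\d+.((self_attn.(qkv|o)_proj)|(mlp.(gate_up|down)_proj))"

candidates = [
    "model.layers.0.self_attn.qkv_proj",   # matches
    "model.layers.0.self_attn.o_proj",     # matches
    "model.layers.7.mlp.gate_up_proj",     # matches
    "model.layers.7.mlp.down_proj",        # matches
    "model.embed_tokens",                  # no match
    "model.layers.2.self_attn.k_proj",     # no match
]

for name in candidates:
    # PEFT uses re.fullmatch when `target_modules` is a single string.
    print(f"{name}: {bool(re.fullmatch(target_modules, name))}")

The vision-lora/adapter_config.json further down gets the same treatment with its own rank and alpha.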
speech-lora/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c2237461a4d1f9292cd128147bd3f0f70326a48d5d79c8e0f7583b26c095b30
3
- size 922782296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16f70b0aba566f6c30e67a11e90033453e9375d102e031cec40956a2a0e9771e
3
+ size 922777944
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c1b9f641d4f8b7247b8d5007dd3b6a9f6a87cb5123134fe0d326f14d10c0585
3
- size 15524479
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57589a5827b578065aecc0a91cc1e4e9a0bac0a17fb02539bea63bb9beb889a2
3
+ size 13303259
tokenizer_config.json CHANGED
@@ -1,125 +1 @@
1
- {
2
- "add_prefix_space": false,
3
- "added_tokens_decoder": {
4
- "200010": {
5
- "content": "<|endoftext10|>",
6
- "lstrip": false,
7
- "normalized": false,
8
- "rstrip": false,
9
- "single_word": false,
10
- "special": true
11
- },
12
- "200011": {
13
- "content": "<|endoftext11|>",
14
- "lstrip": false,
15
- "normalized": false,
16
- "rstrip": false,
17
- "single_word": false,
18
- "special": true
19
- },
20
- "199999": {
21
- "content": "<|endoftext|>",
22
- "lstrip": false,
23
- "normalized": false,
24
- "rstrip": false,
25
- "single_word": false,
26
- "special": true
27
- },
28
- "200018": {
29
- "content": "<|endofprompt|>",
30
- "lstrip": false,
31
- "normalized": false,
32
- "rstrip": false,
33
- "single_word": false,
34
- "special": true
35
- },
36
- "200019": {
37
- "content": "<|assistant|>",
38
- "lstrip": false,
39
- "normalized": false,
40
- "rstrip": true,
41
- "single_word": false,
42
- "special": true
43
- },
44
- "200020": {
45
- "content": "<|end|>",
46
- "lstrip": false,
47
- "normalized": false,
48
- "rstrip": true,
49
- "single_word": false,
50
- "special": true
51
- },
52
- "200021": {
53
- "content": "<|user|>",
54
- "lstrip": false,
55
- "normalized": false,
56
- "rstrip": true,
57
- "single_word": false,
58
- "special": true
59
- },
60
- "200022": {
61
- "content": "<|system|>",
62
- "lstrip": false,
63
- "normalized": false,
64
- "rstrip": true,
65
- "single_word": false,
66
- "special": true
67
- },
68
- "200023": {
69
- "content": "<|tool|>",
70
- "lstrip": false,
71
- "normalized": false,
72
- "rstrip": true,
73
- "single_word": false,
74
- "special": false
75
- },
76
- "200024": {
77
- "content": "<|/tool|>",
78
- "lstrip": false,
79
- "normalized": false,
80
- "rstrip": true,
81
- "single_word": false,
82
- "special": false
83
- },
84
- "200025": {
85
- "content": "<|tool_call|>",
86
- "lstrip": false,
87
- "normalized": false,
88
- "rstrip": true,
89
- "single_word": false,
90
- "special": false
91
- },
92
- "200026": {
93
- "content": "<|/tool_call|>",
94
- "lstrip": false,
95
- "normalized": false,
96
- "rstrip": true,
97
- "single_word": false,
98
- "special": false
99
- },
100
- "200027": {
101
- "content": "<|tool_response|>",
102
- "lstrip": false,
103
- "normalized": false,
104
- "rstrip": true,
105
- "single_word": false,
106
- "special": false
107
- },
108
- "200028": {
109
- "content": "<|tag|>",
110
- "lstrip": false,
111
- "normalized": false,
112
- "rstrip": true,
113
- "single_word": false,
114
- "special": true
115
- }
116
- },
117
- "bos_token": "<|endoftext|>",
118
- "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}",
119
- "clean_up_tokenization_spaces": false,
120
- "eos_token": "<|endoftext|>",
121
- "model_max_length": 131072,
122
- "pad_token": "<|endoftext|>",
123
- "tokenizer_class": "GPT2TokenizerFast",
124
- "unk_token": "<|endoftext|>"
125
- }
 
1
+ {"add_prefix_space": false, "added_tokens_decoder": {"199999": {"content": "<|endoftext|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "200010": {"content": "<|image|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "200011": {"content": "<|audio|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "200018": {"content": "<|endofprompt|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "200019": {"content": "<|assistant|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}, "200020": {"content": "<|end|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}, "200021": {"content": "<|user|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}, "200022": {"content": "<|system|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}, "200023": {"content": "<|tool|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200024": {"content": "<|/tool|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200025": {"content": "<|tool_call|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200026": {"content": "<|/tool_call|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200027": {"content": "<|tool_response|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": false}, "200028": {"content": "<|tag|>", "lstrip": false, "normalized": false, "rstrip": true, "single_word": false, "special": true}}, "audio_token": "<|audio|>", "bos_token": "<|endoftext|>", "clean_up_tokenization_spaces": false, "eos_token": "<|endoftext|>", "extra_special_tokens": {"audio_token": "<|audio|>", "image_token": "<|image|>"}, "image_token": "<|image|>", "model_max_length": 131072, "pad_token": "<|endoftext|>", "processor_class": "Phi4MultimodalProcessor", "tokenizer_class": "GPT2Tokenizer", "unk_token": "<|endoftext|>"}
 
vision-lora/adapter_config.json CHANGED
@@ -1,23 +1,31 @@
1
  {
 
2
  "auto_mapping": null,
3
- "base_model_name_or_path": "TBA",
4
  "bias": "none",
 
 
 
5
  "fan_in_fan_out": false,
6
- "inference_mode": true,
7
  "init_lora_weights": true,
 
8
  "layers_pattern": null,
9
  "layers_to_transform": null,
 
10
  "lora_alpha": 512,
 
11
  "lora_dropout": 0.0,
12
- "modules_to_save": [],
 
 
13
  "peft_type": "LORA",
14
  "r": 256,
 
15
  "revision": null,
16
- "target_modules": [
17
- "qkv_proj",
18
- "o_proj",
19
- "gate_up_proj",
20
- "down_proj"
21
- ],
22
- "task_type": "CAUSAL_LM"
23
  }
 
1
  {
2
+ "alpha_pattern": {},
3
  "auto_mapping": null,
4
+ "base_model_name_or_path": null,
5
  "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
  "fan_in_fan_out": false,
10
+ "inference_mode": false,
11
  "init_lora_weights": true,
12
+ "layer_replication": null,
13
  "layers_pattern": null,
14
  "layers_to_transform": null,
15
+ "loftq_config": {},
16
  "lora_alpha": 512,
17
+ "lora_bias": false,
18
  "lora_dropout": 0.0,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
  "peft_type": "LORA",
23
  "r": 256,
24
+ "rank_pattern": {},
25
  "revision": null,
26
+ "target_modules": "model.layers.\\d+.((self_attn.(qkv|o)_proj)|(mlp.(gate_up|down)_proj))",
27
+ "task_type": "CAUSAL_LM",
28
+ "trainable_token_indices": null,
29
+ "use_dora": false,
30
+ "use_rslora": false
 
 
31
  }
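The vision adapter config mirrors the speech one, with rank 256, alpha 512, and the same regex over the decoder layers. If the adapters are consumed through PEFT directly (the modelling code may instead wire them up internally), the hyper-parameters can be read straight from the subfolder; a sketch under that assumption, requiring a recent peft:

# Minimal sketch: inspect the vision LoRA hyper-parameters from the `vision-lora` subfolder.
# REPO_ID is an assumption; this only reads the config, it does not attach the adapter.
from peft import PeftConfig

REPO_ID = "microsoft/Phi-4-multimodal-instruct"  # assumption
vision_cfg = PeftConfig.from_pretrained(REPO_ID, subfolder="vision-lora")

print(vision_cfg.r, vision_cfg.lora_alpha)  # expect 256 and 512, as in the config above
print(vision_cfg.target_modules)            # the decoder-layer regex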
vision-lora/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1620b16722edf701038bf66e3cd46412c7cc5458e58df89e9f92cedb71fcbde8
3
- size 738232904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76facf464ca0246e9f5dc409520e83764e0b73fa66fdb561526e064133728f8a
3
+ size 738228552
vocab.json CHANGED
The diff for this file is too large to render. See raw diff