Draft
Commits (89)
339a89c
init
vasqu Jul 22, 2025
eb9d6b4
lets tmp disable cache init
vasqu Jul 22, 2025
4260a62
some initial remote code version, for local inference use remote proc…
vasqu Jul 22, 2025
26f06a2
first cleanups
vasqu Jul 22, 2025
b3d999a
need to do this slowly
vasqu Jul 23, 2025
1e190e2
more attention cleanup
vasqu Jul 23, 2025
b44101d
llama like text attention
vasqu Jul 23, 2025
b38e048
generates different text but cos and sin tensors are always close - 1e-8
vasqu Jul 23, 2025
fcf3903
another round of rope fixups
vasqu Jul 23, 2025
62206ee
yea, gonna check tomorrow cant cheat w freqs for whatever reason
vasqu Jul 23, 2025
7e7d8e4
NOTE: last time where comp with old rope
vasqu Jul 24, 2025
fca8fba
rope cleanup
vasqu Jul 24, 2025
db80573
more rope
vasqu Jul 24, 2025
e82297b
somewhat clean 3d rope with attn - sin / cos has very small diffs to …
vasqu Jul 24, 2025
8540938
new rope type
vasqu Jul 24, 2025
dfe6714
style
vasqu Jul 24, 2025
1153291
attempt at moe, gonna need a deeper look
vasqu Jul 24, 2025
39c77ef
cleanup gate
vasqu Jul 24, 2025
aadf423
more cleaning
vasqu Jul 24, 2025
096529d
NOTE remove attempt at moe for now
vasqu Jul 24, 2025
3820cc6
another round of cleanups
vasqu Jul 24, 2025
b25a458
whoops
vasqu Jul 24, 2025
04a7882
we back boys, reattempting moe start
vasqu Aug 13, 2025
b16737f
moe should be done with this
vasqu Aug 13, 2025
30acfda
cleanup
vasqu Aug 13, 2025
5b6efdd
more cleanup
vasqu Aug 13, 2025
46efff9
nits
vasqu Aug 13, 2025
7303a31
add conversion and adjust code accordingly
vasqu Aug 18, 2025
cba549f
fix
vasqu Aug 18, 2025
add956e
Merge branch 'main' into ernie_vl
vasqu Aug 18, 2025
01187e2
make moe copyable as far as we can
vasqu Aug 18, 2025
d5f7568
cleanup conversion a bit, next config
vasqu Aug 18, 2025
41e6cfc
cleanup config part1
vasqu Aug 19, 2025
5610549
small removal of unused things
vasqu Aug 19, 2025
414fb20
config conversion, rope type doesnt get loaded tho...
vasqu Aug 19, 2025
fe3e6d7
fix rope
vasqu Aug 19, 2025
20c2c22
last hardcoded values
vasqu Aug 19, 2025
ccea132
remove unnecessary class
vasqu Aug 19, 2025
d178a02
starting to make copies available for vision, vision rope refactor to…
vasqu Aug 19, 2025
e797a0a
vl rope changes
vasqu Aug 20, 2025
8ff1dea
simplify variable resolution resampler
vasqu Aug 20, 2025
f247b64
nit
vasqu Aug 20, 2025
5e2eca3
conversion update
vasqu Aug 22, 2025
73e7c79
more conversions, standardization, and big dtype fix!
vasqu Aug 22, 2025
1d2deac
remove some docs (tmp), focus on code for me
vasqu Aug 22, 2025
cfe0b4d
oops
vasqu Aug 22, 2025
b643da6
nit
vasqu Aug 22, 2025
6869aa9
fixup embeddings, add todos
vasqu Aug 22, 2025
b7363b9
more cleanup
vasqu Aug 25, 2025
c53b080
more cleanup, next caching changes
vasqu Aug 25, 2025
60e1073
revert fp16, internally discussed weights are supposed to be bf16
vasqu Aug 26, 2025
de04496
fix rope (a bit), prepare cache logic changes
vasqu Aug 26, 2025
ba0e2cd
more prep for cache
vasqu Aug 26, 2025
e38c511
cache class is used, fixup some flags
vasqu Aug 26, 2025
46cdb54
modular refactor
vasqu Aug 27, 2025
b004f0c
partially docstrings, docs, etc
vasqu Aug 27, 2025
777fe1f
cleaner order
vasqu Aug 27, 2025
8cd3bbe
nit
vasqu Aug 27, 2025
2446afa
fix config
vasqu Aug 27, 2025
43c9dfd
remove old artefacts/todos
vasqu Aug 27, 2025
41a919a
Merge branch 'main' into ernie_vl
vasqu Aug 27, 2025
3423440
sync with remote and add some todos for orientation
vasqu Sep 1, 2025
659ae74
remove img process dep on modeling code
vasqu Sep 1, 2025
9d1233e
image processor with a few diffs highlighted to copy from maybe
vasqu Sep 2, 2025
e4d0078
fast img processor version
vasqu Sep 3, 2025
76d9a6a
modular image processors
vasqu Sep 3, 2025
79dbeeb
convert tokenizer to have dedicated video placeholder token
vasqu Sep 3, 2025
4a77472
before i forget
vasqu Sep 3, 2025
910e86c
Merge branch 'main' into ernie_vl
vasqu Sep 3, 2025
3744960
a modular bug :/
vasqu Sep 4, 2025
a552294
more processor things, some modular adjustments
vasqu Sep 8, 2025
1316c18
remove dependency on token type ids
vasqu Sep 8, 2025
2e23c08
position ids ala qwen vl and modular is bugging
vasqu Sep 10, 2025
5233495
fixup some inheritances + nits
vasqu Sep 11, 2025
0dc7d15
token type ids
vasqu Sep 11, 2025
476bfeb
moe loss, docs, simplify pos ids
vasqu Sep 11, 2025
2284067
align some feature getters
vasqu Sep 11, 2025
d24eb0f
docs
vasqu Sep 11, 2025
28181f2
rename conv -> merge aka our naming convention
vasqu Sep 11, 2025
bf00c6e
style
vasqu Sep 11, 2025
a697dde
fixup tokenizer class in auto
vasqu Sep 11, 2025
0843b21
no more nn sequential
vasqu Sep 11, 2025
ff9746d
fix chat template, fix tokenizer conversion, modular bug
vasqu Sep 12, 2025
54d3f97
remove this
vasqu Sep 12, 2025
1f35405
remove old deps (from the remote processor)
vasqu Sep 12, 2025
0db2d94
Merge branch 'main' into ernie_vl
vasqu Sep 12, 2025
4031a4b
whoops
vasqu Sep 12, 2025
542e003
argh
vasqu Sep 12, 2025
3bf78d2
todo, restarting progress tomorrow
vasqu Sep 17, 2025
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
@@ -1019,6 +1019,8 @@
title: Donut
- local: model_doc/emu3
title: Emu3
- local: model_doc/ernie4_5_vl
title: ernie4_5_vl
- local: model_doc/evolla
title: Evolla
- local: model_doc/flava
64 changes: 64 additions & 0 deletions docs/source/en/model_doc/ernie4_5_vl.md
@@ -0,0 +1,64 @@
<!--Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white"> </div>
</div>

# ernie4_5_vl

## Overview

The ernie4_5_vl model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
<INSERT SHORT SUMMARY HERE>

The abstract from the paper is the following:

*<INSERT PAPER ABSTRACT HERE>*

Tips:

<INSERT TIPS ABOUT MODEL HERE>

This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
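
A minimal, illustrative usage sketch for image-text-to-text generation (the checkpoint name is a placeholder and the exact preprocessing API may still change):

```python
import torch
from transformers import AutoProcessor, Ernie4_5_VLForConditionalGeneration

model_id = "<INSERT CHECKPOINT NAME HERE>"  # placeholder checkpoint id
processor = AutoProcessor.from_pretrained(model_id)
model = Ernie4_5_VLForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=128)
print(processor.batch_decode(generated_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)[0])
```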


## Ernie4_5_VLConfig

[[autodoc]] Ernie4_5_VLConfig

## Ernie4_5_VLTextConfig

[[autodoc]] Ernie4_5_VLTextConfig

## Ernie4_5_VLTextModel

[[autodoc]] Ernie4_5_VLTextModel
- forward

## Ernie4_5_VLModel

[[autodoc]] Ernie4_5_VLModel
- forward

## Ernie4_5_VLForConditionalGeneration

[[autodoc]] Ernie4_5_VLForConditionalGeneration
- forward
58 changes: 58 additions & 0 deletions src/transformers/modeling_rope_utils.py
@@ -376,6 +376,39 @@ def _compute_llama3_parameters(
return inv_freq_llama, attention_factor


def _compute_ernie_3d_parameters(
config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None
) -> tuple["torch.Tensor", float]:
"""
Computes the inverse frequencies for the Ernie 4.5 VL models.

Args:
config ([`~transformers.PretrainedConfig`]):
The model configuration.
device (`torch.device`):
The device to use for initialization of the inverse frequencies.
seq_len (`int`, *optional*):
The current sequence length. Unused for this type of RoPE.
Returns:
Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
post-processing scaling factor applied to the computed cos/sin.
"""
# Gets the default RoPE parameters
inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len)

# Split the frequencies between the temporal and the spatial (height/width) axes
# based on `freq_allocation`, and apply the necessary (pre-)rotations
t_dim = config.rope_scaling["freq_allocation"] # time dimension
hw_dim = inv_freq.shape[-1] - t_dim # height and width dimension

inv_freq_3d = torch.empty_like(inv_freq)
# (Pre-)Rotate to avoid another rotation during the forward
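# Illustrative (hypothetical) numbers: with 8 inverse frequencies [f0, ..., f7] and freq_allocation=2,
# the first 6 are reordered to [f0, f2, f4, f1, f3, f5] (even indices first, then odd) for the
# height/width axes, while the last 2 ([f6, f7]) are kept as-is for the temporal axis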
inv_freq_3d[:hw_dim] = torch.cat([inv_freq[:-t_dim][0::2], inv_freq[:-t_dim][1::2]])
inv_freq_3d[-t_dim:] = inv_freq[-t_dim:]

return inv_freq_3d, attention_factor


# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters
# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE
# parameterizations, as long as the callable has the same signature.
@@ -386,6 +419,7 @@ def _compute_llama3_parameters(
"yarn": _compute_yarn_parameters,
"longrope": _compute_longrope_parameters,
"llama3": _compute_llama3_parameters,
"ernie_3d": _comput_ernie_3d_parameters,
}


@@ -604,6 +638,29 @@ def _validate_llama3_parameters(config: PretrainedConfig, ignore_keys: Optional[
)


def _validate_ernie_3d_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
rope_scaling = config.rope_scaling
rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
required_keys = {"rope_type", "freq_allocation"}
received_keys = set(rope_scaling.keys())
_check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys)

partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
dim = int(head_dim * partial_rotary_factor)
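# Illustrative (hypothetical) numbers: with dim=64 and freq_allocation=16, the splits computed
# below are 16, 24, 24, and 16 + 2 * 24 == 64, so no warning is emitted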

freq_allocation = rope_scaling["freq_allocation"]
if not isinstance(freq_allocation, int):
logger.warning(f"`rope_scaling`'s freq_allocation field must be an int, but got {freq_allocation}")
return

t_dim = freq_allocation
h_dim = (dim - t_dim) // 2
reconstructed_dim = t_dim + 2 * h_dim
if reconstructed_dim != dim:
logger.warning(
"`rope_scaling`'s freq_allocation field must split the rotary dim evenly into three parts: "
f"`freq_allocation` and twice `(dim - freq_allocation) // 2`. The resulting splits {t_dim}, {h_dim}, {h_dim} "
f"sum to {reconstructed_dim}, which does not match the total dim of {dim}."
)


# Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types.
ROPE_VALIDATION_FUNCTIONS = {
"default": _validate_default_rope_parameters,
@@ -612,6 +669,7 @@ def _validate_llama3_parameters(config: PretrainedConfig, ignore_keys: Optional[
"yarn": _validate_yarn_parameters,
"longrope": _validate_longrope_parameters,
"llama3": _validate_llama3_parameters,
"ernie_3d": _validate_ernie_3d_parameters,
}
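# Example (illustrative) usage: a model config with
#   rope_scaling = {"rope_type": "ernie_3d", "freq_allocation": 20}  # freq_allocation value is hypothetical
# is routed to `_compute_ernie_3d_parameters` for initialization and `_validate_ernie_3d_parameters`
# for validation via the two mappings above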


2 changes: 2 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
@@ -137,6 +137,7 @@
("ernie", "ErnieConfig"),
("ernie4_5", "Ernie4_5Config"),
("ernie4_5_moe", "Ernie4_5_MoeConfig"),
("ernie4_5_vl", "Ernie4_5_VLConfig"),
("ernie_m", "ErnieMConfig"),
("esm", "EsmConfig"),
("evolla", "EvollaConfig"),
@@ -560,6 +561,7 @@
("ernie", "ERNIE"),
("ernie4_5", "Ernie4_5"),
("ernie4_5_moe", "Ernie4_5_MoE"),
("ernie4_5_vl", "Ernie4_5_VL"),
("ernie_m", "ErnieM"),
("esm", "ESM"),
("evolla", "Evolla"),
3 changes: 3 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -139,6 +139,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("ernie", "ErnieModel"),
("ernie4_5", "Ernie4_5Model"),
("ernie4_5_moe", "Ernie4_5_MoeModel"),
("ernie4_5_vl", "Ernie4_5_VLModel"),
("ernie_m", "ErnieMModel"),
("esm", "EsmModel"),
("evolla", "EvollaModel"),
@@ -956,6 +957,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("blip", "BlipForConditionalGeneration"),
("blip-2", "Blip2ForConditionalGeneration"),
("chameleon", "ChameleonForConditionalGeneration"),
("ernie4_5_vl", "Ernie4_5_VLForConditionalGeneration"),
("git", "GitForCausalLM"),
("idefics2", "Idefics2ForConditionalGeneration"),
("idefics3", "Idefics3ForConditionalGeneration"),
@@ -997,6 +999,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("deepseek_vl", "DeepseekVLForConditionalGeneration"),
("deepseek_vl_hybrid", "DeepseekVLHybridForConditionalGeneration"),
("emu3", "Emu3ForConditionalGeneration"),
("ernie4_5_vl", "Ernie4_5_VLForConditionalGeneration"),
("evolla", "EvollaForProteinText2Text"),
("florence2", "Florence2ForConditionalGeneration"),
("fuyu", "FuyuForCausalLM"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/processing_auto.py
@@ -67,6 +67,7 @@
("deepseek_vl_hybrid", "DeepseekVLHybridProcessor"),
("dia", "DiaProcessor"),
("emu3", "Emu3Processor"),
("ernie4_5_vl", "Ernie4_5_VLProcessor"),
("evolla", "EvollaProcessor"),
("flava", "FlavaProcessor"),
("florence2", "Florence2Processor"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/tokenization_auto.py
@@ -229,6 +229,7 @@
("ernie", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
("ernie4_5", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("ernie4_5_moe", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("ernie4_5_vl", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)),
("esm", ("EsmTokenizer", None)),
(
24 changes: 10 additions & 14 deletions src/transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py
@@ -268,12 +268,9 @@ class Ernie4_5_MoeStatics(nn.Module):
- Additionally, usage per expert in the original codebase
"""

def __init__(self, config):
def __init__(self, num_experts_groups, num_experts):
super().__init__()

num_experts_groups = 1
num_experts = config.moe_num_experts

self.e_score_correction_bias = nn.Parameter(
torch.zeros(num_experts_groups, num_experts, dtype=torch.float32),
requires_grad=False,
@@ -303,25 +300,22 @@ class Ernie4_5_MoeSparseMoeBlock(nn.Module):
(optional) shared experts and a corrections bias during gating.
"""

def __init__(self, config):
def __init__(self, config, num_experts, intermediate_size):
super().__init__()
self.num_experts = config.moe_num_experts
self.num_experts = num_experts
self.top_k = config.moe_k

# correction bias (yes it seems to be a typo with statics <> statistics)
self.moe_statics = Ernie4_5_MoeStatics(config)
self.moe_statics = Ernie4_5_MoeStatics(num_experts_groups=1, num_experts=self.num_experts)

# gating
self.gate = nn.Linear(config.hidden_size, config.moe_num_experts, bias=False, dtype=torch.float32)
self.experts = nn.ModuleList(
[Ernie4_5_MoeMLP(config, config.moe_intermediate_size) for _ in range(config.moe_num_experts)]
)
self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False, dtype=torch.float32)
self.experts = nn.ModuleList([Ernie4_5_MoeMLP(config, intermediate_size) for _ in range(self.num_experts)])
self.norm_min = config.moe_norm_min

# (optional) shared experts for all forwards
self.shared_experts = None
if config.moe_num_shared_experts > 0:
self.shared_experts = Ernie4_5_MoeMLP(config, config.moe_intermediate_size * config.moe_num_shared_experts)
self.shared_experts = Ernie4_5_MoeMLP(config, intermediate_size * config.moe_num_shared_experts)

def forward(
self,
@@ -395,7 +389,9 @@ def __init__(self, config, layer_idx):
and layer_idx >= config.moe_layer_start_index
and layer_idx <= config.moe_layer_end_index
):
self.mlp = Ernie4_5_MoeSparseMoeBlock(config)
self.mlp = Ernie4_5_MoeSparseMoeBlock(
config, num_experts=config.moe_num_experts, intermediate_size=config.moe_intermediate_size
)
else:
self.mlp = Ernie4_5_MoeMLP(config)

24 changes: 10 additions & 14 deletions src/transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py
@@ -76,12 +76,9 @@ class Ernie4_5_MoeStatics(nn.Module):
- Additionally, usage per expert in the original codebase
"""

def __init__(self, config):
def __init__(self, num_experts_groups, num_experts):
super().__init__()

num_experts_groups = 1
num_experts = config.moe_num_experts

self.e_score_correction_bias = nn.Parameter(
torch.zeros(num_experts_groups, num_experts, dtype=torch.float32),
requires_grad=False,
@@ -111,25 +108,22 @@ class Ernie4_5_MoeSparseMoeBlock(nn.Module):
(optional) shared experts and a corrections bias during gating.
"""

def __init__(self, config):
def __init__(self, config, num_experts, intermediate_size):
super().__init__()
self.num_experts = config.moe_num_experts
self.num_experts = num_experts
self.top_k = config.moe_k

# correction bias (yes it seems to be a typo with statics <> statistics)
self.moe_statics = Ernie4_5_MoeStatics(config)
self.moe_statics = Ernie4_5_MoeStatics(num_experts_groups=1, num_experts=self.num_experts)

# gating
self.gate = nn.Linear(config.hidden_size, config.moe_num_experts, bias=False, dtype=torch.float32)
self.experts = nn.ModuleList(
[Ernie4_5_MoeMLP(config, config.moe_intermediate_size) for _ in range(config.moe_num_experts)]
)
self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False, dtype=torch.float32)
self.experts = nn.ModuleList([Ernie4_5_MoeMLP(config, intermediate_size) for _ in range(self.num_experts)])
self.norm_min = config.moe_norm_min

# (optional) shared experts for all forwards
self.shared_experts = None
if config.moe_num_shared_experts > 0:
self.shared_experts = Ernie4_5_MoeMLP(config, config.moe_intermediate_size * config.moe_num_shared_experts)
self.shared_experts = Ernie4_5_MoeMLP(config, intermediate_size * config.moe_num_shared_experts)

def forward(
self,
@@ -203,7 +197,9 @@ def __init__(self, config, layer_idx):
and layer_idx >= config.moe_layer_start_index
and layer_idx <= config.moe_layer_end_index
):
self.mlp = Ernie4_5_MoeSparseMoeBlock(config)
self.mlp = Ernie4_5_MoeSparseMoeBlock(
config, num_experts=config.moe_num_experts, intermediate_size=config.moe_intermediate_size
)
else:
self.mlp = Ernie4_5_MoeMLP(config)

Binary file not shown.
28 changes: 28 additions & 0 deletions src/transformers/models/ernie4_5_vl/__init__.py
@@ -0,0 +1,28 @@
# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
from .configuration_ernie4_5_vl import *
from .modeling_ernie4_5_vl import *
from .processing_ernie4_5_vl import *
else:
import sys

_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
Loading