Merged

27 commits
dee4372
[Model] Add Ernie4.5 VL
CSWYF3634076 Aug 8, 2025
af1d864
[Model] Add Ernie4.5 VL v2 annotation organization
CSWYF3634076 Aug 8, 2025
99773ff
[Model] Add Ernie4.5 VL v2 annotation organization
CSWYF3634076 Aug 8, 2025
8442d5d
[Model] Add Ernie4.5 VL v3 fix variable name
CSWYF3634076 Aug 8, 2025
42004bc
[Model] Add Ernie4.5 VL v4 fix code-assist issue
CSWYF3634076 Aug 11, 2025
9a509cd
fix format by pre-commit
CSWYF3634076 Aug 12, 2025
8d3d62b
[Model] Add Ernie4.5 VL v5 fix format by pre-commit
CSWYF3634076 Aug 12, 2025
c227368
[Model] Add Ernie4.5 VL v5 fix format by pre-commit
CSWYF3634076 Aug 12, 2025
080f818
[Model] Add Ernie4.5 VL v5 fix format by pre-commit
CSWYF3634076 Aug 12, 2025
01f2231
[Model] Add Ernie4.5 VL v5 add trust_remote_code tag
CSWYF3634076 Aug 15, 2025
d4ee345
[Model] Add Ernie4.5 VL v6 rename and fix comments
CSWYF3634076 Aug 15, 2025
7ea25db
[Model] Add Ernie4.5 VL v7 vit qkv replace with QKVParallelinear
CSWYF3634076 Aug 15, 2025
e124b87
[Model] Add Ernie4.5 VL v8 delete processor file
CSWYF3634076 Aug 15, 2025
0fb8105
[Model] Add Ernie4.5 VL v9 pixel_values norm
CSWYF3634076 Aug 17, 2025
35fe906
[Model] Add Ernie4.5 VL v9 delete _get_image_processor_kwargs
CSWYF3634076 Aug 18, 2025
02754b7
Merge branch 'main' into vl
CSWYF3634076 Aug 18, 2025
4943465
Merge branch 'main' into vl
CSWYF3634076 Aug 18, 2025
9c6a49d
[Model] Add Ernie4.5 VL v10 adapt main
CSWYF3634076 Aug 18, 2025
7e5ac16
[Model] Add Ernie4.5 VL v11 test file
CSWYF3634076 Aug 21, 2025
0bedaa6
[Model] Add Ernie4.5 VL v12 pre-commit
CSWYF3634076 Aug 22, 2025
98bd72f
Merge branch 'main' into vl
CSWYF3634076 Aug 22, 2025
69a2902
Merge branch 'main' into vl
CSWYF3634076 Aug 25, 2025
faad7fe
[Model] Add Ernie4.5 VL v13 no test_common
CSWYF3634076 Aug 25, 2025
a4a1817
[Model] Add Ernie4.5 VL v14 add model_id to test_common
CSWYF3634076 Aug 25, 2025
4c5abbb
[Model] Add Ernie4.5 VL v15 skip test_can_initialize due to processor…
CSWYF3634076 Aug 26, 2025
3b70302
[Model] Add Ernie4.5 VL v16 add decord to test.in
CSWYF3634076 Aug 26, 2025
a08137c
[Model] Add Ernie4.5 VL v17 update test.txt by pre-commit
CSWYF3634076 Aug 26, 2025
1 change: 1 addition & 0 deletions docs/models/supported_models.md
@@ -616,6 +616,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | ✅︎ |
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ |
| `DonutForConditionalGeneration`<sup>^</sup> | Donut | T + I | `ByteDance/Dolphin`, `naver-clova-ix/donut-base-finetuned-docvqa`, etc. | | | |
| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | ✅︎ |
| `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | |
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ |
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
32 changes: 32 additions & 0 deletions examples/offline_inference/vision_language.py
@@ -173,6 +173,37 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    )


# Ernie4.5-VL
def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "baidu/ERNIE-4.5-VL-28B-A3B-PT"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        limit_mm_per_prompt={modality: 1},
        trust_remote_code=True,
    )

    if modality == "image":
        placeholder = "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>"
    elif modality == "video":
        placeholder = "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>"

    prompts = [
        (
            f"<|begin_of_sentence|>User: {question}{placeholder}\n"
            "Assistant: <think></think>"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Florence2
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -1602,6 +1633,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
"chameleon": run_chameleon,
"command_a_vision": run_command_a_vision,
"deepseek_vl_v2": run_deepseek_vl2,
"ernie45_vl": run_ernie45_vl,
"florence2": run_florence2,
"fuyu": run_fuyu,
"gemma3": run_gemma3,
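The prompt format used by `run_ernie45_vl` can also be exercised directly against vLLM's offline `LLM` API, outside the example harness. A minimal sketch, assuming the checkpoint is reachable and PIL is installed; `example.jpg` is a placeholder path, and the empty `<think></think>` pre-fill simply mirrors the example function above:

```python
from PIL import Image

from vllm import LLM, SamplingParams

# Same engine settings as run_ernie45_vl above.
llm = LLM(
    model="baidu/ERNIE-4.5-VL-28B-A3B-PT",
    max_model_len=4096,
    limit_mm_per_prompt={"image": 1},
    trust_remote_code=True,
)

# Ernie4.5-VL prompt: the image placeholder tokens plus an empty think block.
prompt = (
    "<|begin_of_sentence|>User: What is in this image?"
    "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>\n"
    "Assistant: <think></think>"
)

outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": Image.open("example.jpg")},  # placeholder path
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```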
1 change: 1 addition & 0 deletions requirements/test.in
@@ -54,3 +54,4 @@ runai-model-streamer-s3==0.11.0
fastsafetensors>=0.1.10
pydantic>=2.10 # 2.9 leads to error on python 3.10
terratorch==1.1rc2 # required for PrithviMAE test
decord==0.6.0
3 changes: 3 additions & 0 deletions requirements/test.txt
@@ -156,6 +156,8 @@ datasets==3.0.2
    #   mteb
decorator==5.1.1
    # via librosa
decord==0.6.0
    # via -r requirements/test.in
dill==0.3.8
    # via
    #   datasets
@@ -493,6 +495,7 @@ numpy==1.26.4
    #   contourpy
    #   cupy-cuda12x
    #   datasets
    #   decord
    #   einx
    #   encodec
    #   evaluate
1 change: 1 addition & 0 deletions tests/models/multimodal/processing/test_common.py
@@ -272,6 +272,7 @@ def _test_processing_correctness_one(
"CohereLabs/command-a-vision-07-2025",
"deepseek-ai/deepseek-vl2-tiny",
"naver-clova-ix/donut-base-finetuned-docvqa",
"baidu/ERNIE-4.5-VL-28B-A3B-PT",
"microsoft/Florence-2-base",
"adept/fuyu-8b",
"google/gemma-3-4b-it",
2 changes: 2 additions & 0 deletions tests/models/registry.py
@@ -396,6 +396,8 @@ def check_available_online(
                                               transformers_version_reason="HF model is not compatible.",  # noqa: E501
                                               hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}),  # noqa: E501
    "Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
    "Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo("baidu/ERNIE-4.5-VL-28B-A3B-PT",  # noqa: E501
                                                              trust_remote_code=True),
    "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
    "Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"),
    "Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it",  # noqa: E501
72 changes: 72 additions & 0 deletions vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py
@@ -0,0 +1,72 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Optional

import torch

from .common import apply_rotary_emb_dispatch
from .mrope import MRotaryEmbedding


class Ernie4_5_VLRotaryEmbedding(MRotaryEmbedding):
    """3D rotary positional embedding (t: time, h: height, w: width)."""

    def forward(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        assert positions.ndim == 1 or positions.ndim == 2
        assert key is not None

        num_tokens = positions.shape[-1]
        cos_sin = self.cos_sin_cache[positions]
        cos, sin = cos_sin.chunk(2, dim=-1)
        if positions.ndim == 2:
            assert self.mrope_section

            section_h = self.mrope_section[0]  # e.g. 22
            section_w = self.mrope_section[1]  # e.g. 22
            section_t = self.mrope_section[2]  # e.g. 20
            assert section_h == section_w
            # Split according to [h w h w h w ... t t t ...]
            section_cos_t = cos[..., -section_t:]
            section_cos_h = cos[..., :section_h + section_w:2]
            section_cos_w = cos[..., 1:section_h + section_w:2]

            cos_t, cos_h, cos_w = section_cos_t[0], section_cos_h[
                1], section_cos_w[2]
            cos_hw = torch.stack([cos_h, cos_w],
                                 dim=-1).reshape(cos_h.shape[:-1] +
                                                 (cos_h.shape[-1] * 2, ))
            cos = torch.cat([cos_hw, cos_t], dim=-1)

            section_sin_t = sin[..., -section_t:]
            section_sin_h = sin[..., :section_h + section_w:2]
            section_sin_w = sin[..., 1:section_h + section_w:2]

            sin_t, sin_h, sin_w = section_sin_t[0], section_sin_h[
                1], section_sin_w[2]
            sin_hw = torch.stack([sin_h, sin_w],
                                 dim=-1).reshape(sin_h.shape[:-1] +
                                                 (sin_h.shape[-1] * 2, ))
            sin = torch.cat([sin_hw, sin_t], dim=-1)

        query_shape = query.shape
        query = query.view(num_tokens, -1, self.head_size)
        query_rot = query[..., :self.rotary_dim]
        query_pass = query[..., self.rotary_dim:]
        query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin,
                                              self.is_neox_style)
        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)

        key_shape = key.shape
        key = key.view(num_tokens, -1, self.head_size)
        key_rot = key[..., :self.rotary_dim]
        key_pass = key[..., self.rotary_dim:]
        key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin,
                                            self.is_neox_style)
        key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
        return query, key
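The interleaved `[h w h w ... t t t]` layout that `forward` splits and reassembles is easiest to see with concrete numbers. A standalone sketch, assuming `mrope_section = [22, 22, 20]` (the values the inline comments suggest, summing to a 64-dim half-rotary cache); the tensor contents are arbitrary and only illustrate the indexing:

```python
import torch

section_h, section_w, section_t = 22, 22, 20  # sums to rotary_dim // 2 = 64
cos = torch.arange(3 * 5 * 64, dtype=torch.float32).reshape(3, 5, 64)

# Leading 44 dims interleave h and w as [h w h w ...]; trailing 20 dims are t.
cos_h = cos[..., :section_h + section_w:2]   # even slots -> h, shape (3, 5, 22)
cos_w = cos[..., 1:section_h + section_w:2]  # odd slots  -> w, shape (3, 5, 22)
cos_t = cos[..., -section_t:]                # tail       -> t, shape (3, 5, 20)

# Stack-and-reshape re-interleaves h and w, recovering the leading 44 dims.
cos_hw = torch.stack([cos_h, cos_w], dim=-1).reshape(3, 5, 44)
assert torch.equal(cos_hw, cos[..., :44])
```

In the class itself the h and w channels are additionally taken from different position rows (`section_cos_h[1]`, `section_cos_w[2]`), so each axis rotates by its own coordinate before the channels are merged back together.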
123 changes: 123 additions & 0 deletions vllm/model_executor/layers/rotary_embedding/mrope.py
@@ -393,6 +393,15 @@ def get_input_positions_tensor(
                context_len=context_len,
                seq_len=seq_len,
            )
        elif hf_config.model_type in ["ernie4_5_moe_vl", "ernie4_5_vl"]:
            return cls._ernie_get_input_positions_tensor(
                input_tokens=input_tokens,
                hf_config=hf_config,
                image_grid_thw=image_grid_thw,
                video_grid_thw=video_grid_thw,
                context_len=context_len,
                seq_len=seq_len,
            )
        else:
            return cls._vl_get_input_positions_tensor(
                input_tokens=input_tokens,
@@ -513,6 +522,120 @@ def _glm4v_get_input_positions_tensor(
                                len(input_tokens)).item()
        return llm_positions, mrope_position_delta

    @classmethod
    def _ernie_get_input_positions_tensor(
Comment on lines +525 to +526 (Member): "I also have a feeling that a lot of operations in here can be optimized by vectorization, but we can revisit this later"
        cls,
        input_tokens: list[int],
        hf_config: PretrainedConfig,
        image_grid_thw: Union[list[list[int]], torch.Tensor],
        video_grid_thw: Union[list[list[int]], torch.Tensor],
        context_len: int = 0,
        seq_len: Optional[int] = None,
    ) -> tuple[torch.Tensor, int]:
        """Get mrope input positions and delta value for Ernie VL."""

        image_token_id = hf_config.im_patch_id
        video_start_token_id = hf_config.video_start_token_id
        video_end_token_id = hf_config.video_end_token_id
        spatial_conv_size = hf_config.spatial_conv_size
        temporal_conv_size = hf_config.temporal_conv_size
        llm_pos_ids_list: list = []

        if not (image_grid_thw is None and video_grid_thw is None):
            if isinstance(image_grid_thw, torch.Tensor):
                image_grid_thw = image_grid_thw.tolist()

            # Classify each token as image, video, or text; image patch
            # tokens between the video start/end markers belong to a video.
            input_token_type: list[str] = []
            video_check_flg = False
            for token in input_tokens:
                if token == video_start_token_id:
                    video_check_flg = True
                elif token == video_end_token_id:
                    video_check_flg = False

                if (token == image_token_id) and (video_check_flg is False):
                    input_token_type.append("image")
                elif (token == image_token_id) and (video_check_flg is True):
                    input_token_type.append("video")
                else:
                    input_token_type.append("text")

            # Collapse per-token labels into contiguous
            # (modality, start, end) runs.
            input_type_group: list[tuple[str, int, int]] = []
            for key, group_iter in itertools.groupby(
                    enumerate(input_token_type), lambda x: x[1]):
                group_list = list(group_iter)
                start_index = group_list[0][0]
                end_index = group_list[-1][0] + 1
                input_type_group.append((key, start_index, end_index))

            video_frame_num = 1
            mm_data_idx = 0
            for modality_type, start_idx, end_idx in input_type_group:
                st_idx = llm_pos_ids_list[-1].max() + 1 if len(
                    llm_pos_ids_list) > 0 else 0
                if modality_type == "image":
                    t, h, w = (
                        image_grid_thw[mm_data_idx][0],
                        image_grid_thw[mm_data_idx][1],
                        image_grid_thw[mm_data_idx][2],
                    )
                    llm_grid_t, llm_grid_h, llm_grid_w = \
                        t, h // spatial_conv_size, w // spatial_conv_size

                    t_index = torch.arange(llm_grid_t).view(-1, 1).expand(
                        -1, llm_grid_h * llm_grid_w).flatten()
                    h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
                        llm_grid_t, -1, llm_grid_w).flatten()
                    w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
                        llm_grid_t, llm_grid_h, -1).flatten()
                    llm_pos_ids_list.append(
                        torch.stack([t_index, h_index, w_index]) + st_idx)
                    mm_data_idx += 1

                elif modality_type == "video":
                    t, h, w = (
                        video_grid_thw[mm_data_idx][0],
                        video_grid_thw[mm_data_idx][1],
                        video_grid_thw[mm_data_idx][2],
                    )
                    llm_grid_t, llm_grid_h, llm_grid_w = (
                        t // temporal_conv_size,
                        h // spatial_conv_size,
                        w // spatial_conv_size,
                    )

                    for t_idx in range(llm_grid_t):
                        t_index = torch.tensor(t_idx).view(-1, 1).expand(
                            -1, llm_grid_h * llm_grid_w).flatten()
                        h_index = torch.arange(llm_grid_h).view(
                            1, -1, 1).expand(1, -1, llm_grid_w).flatten()
                        w_index = torch.arange(llm_grid_w).view(
                            1, 1, -1).expand(1, llm_grid_h, -1).flatten()
                        llm_pos_ids_list.append(
                            torch.stack([t_index, h_index, w_index]) + st_idx)

                    mm_data_idx += 1
                    video_frame_num += 1

                else:
                    text_len = end_idx - start_idx
                    llm_pos_ids_list.append(
                        torch.arange(text_len).view(1, -1).expand(3, -1) +
                        st_idx)
                    video_frame_num = 1

        else:
            text_len = len(input_tokens)
            llm_pos_ids_list.append(
                torch.arange(text_len).view(1, -1).expand(3, -1))

        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
        llm_positions = llm_positions[:, context_len:seq_len]
        mrope_position_delta = (llm_positions.max() + 1 -
                                len(input_tokens)).item()
        return llm_positions, mrope_position_delta

    @classmethod
    def _vl_get_input_positions_tensor(
        cls,
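To make the 3D position scheme concrete (and as a reference point for the vectorization the review comment suggests), here is a small self-contained check of the image branch; the grid values are illustrative, with `spatial_conv_size = 2` assumed to match the ERNIE configs:

```python
import torch

spatial_conv_size = 2
t, h, w = 1, 4, 4  # one 4x4 patch grid -> 2x2 after spatial conv
llm_grid_t, llm_grid_h, llm_grid_w = \
    t, h // spatial_conv_size, w // spatial_conv_size

# Mirrors the image branch of _ernie_get_input_positions_tensor.
t_index = torch.arange(llm_grid_t).view(-1, 1).expand(
    -1, llm_grid_h * llm_grid_w).flatten()
h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
    llm_grid_t, -1, llm_grid_w).flatten()
w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
    llm_grid_t, llm_grid_h, -1).flatten()
print(torch.stack([t_index, h_index, w_index]))
# tensor([[0, 0, 0, 0],
#         [0, 0, 1, 1],
#         [0, 1, 0, 1]])
```

Text runs then continue from `max(position) + 1`, which is what `st_idx` tracks across groups.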