From 271086d8a0c6b094910e5e63467ae3cad2c1c68f Mon Sep 17 00:00:00 2001
From: Christopher Manteuffel
Date: Fri, 22 Aug 2025 11:52:52 -0700
Subject: [PATCH 01/13] Initial import of ovis_2_5 work

---
 .../vision_language_multi_image.py            |  29 +
 .../generation/vlm_utils/model_utils.py       |  56 ++
 vllm/entrypoints/chat_utils.py                |   2 +-
 vllm/model_executor/models/ovis2_5.py         | 567 ++++++++++++++++
 vllm/model_executor/models/registry.py        |   1 +
 vllm/model_executor/models/siglip2navit.py    | 606 ++++++++++++++++++
 vllm/transformers_utils/config.py             |  14 +
 .../transformers_utils/processors/__init__.py |   3 +-
 vllm/transformers_utils/processors/ovis2_5.py | 458 +++++++++++++
 9 files changed, 1734 insertions(+), 2 deletions(-)
 create mode 100644 vllm/model_executor/models/ovis2_5.py
 create mode 100644 vllm/model_executor/models/siglip2navit.py
 create mode 100644 vllm/transformers_utils/processors/ovis2_5.py

diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index ea7a793d026b..bd71b021320d 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -459,6 +459,35 @@ def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
         image_data=[fetch_image(url) for url in image_urls],
     )
 
+# ovis2_5
+def load_ovis2_5(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "AIDC-AI/Ovis2.5-2B"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=2,
+        trust_remote_code=True,
+        dtype="half",
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = "\n".join(
+        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
+    )
+    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
 
 def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "mistral-community/pixtral-12b"
diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py
index af4c72f44b67..ce62e95bdf97 100644
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -787,6 +787,62 @@ def processor(*args, text="", images=None, **kwargs):
     hf_model.processor = processor
     return hf_model
 
+def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an instance of the HfRunner to use for Ovis2.5."""
+    hf_model.model.get_output_embeddings = lambda: \
+        hf_model.model.llm.get_output_embeddings()
+
+    def processor(*args, text="", images=None, videos=None, **kwargs):
+        if images is None:
+            images = []
+        else:
+            images = [images] if isinstance(images, Image) else images
+        if videos is None:
+            videos = []
+        else:
+            videos = [videos] if isinstance(videos, np.ndarray) else videos
+            videos = [[PIL.Image.fromarray(frame) for frame in vid]
+                      for vid in videos]
+
+        prompt_start_and_end = {
+            "qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "llama":
+            ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
+            "gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
+        }
+        for start, end in prompt_start_and_end.values():
+            if start in text and end in text:
+                text = text.split(start)[1].split(end)[0]
+                break
+
+        images_message = [{"type": "image", "image": img} for img in images]
+        videos_message = [{"type": "video", "video": vid} for vid in videos]
+
+        messages = [{
+            "role":
+            "user",
+            "content": [
+                *images_message,
+                *videos_message,
+                {
+                    "type": "text",
+                    "text": text
+                },
+            ],
+        }]
+
+        input_ids, pixel_values, grid_thws = hf_model.model.preprocess_inputs(
+            messages=messages, enable_thinking=True)
+        inputs = {
+            "inputs": input_ids,
+            "pixel_values": pixel_values,
+            "grid_thws": grid_thws,
+        }
+        return BatchFeature(data=inputs, tensor_type="pt")
+
+    hf_model.processor = processor
+    return hf_model
+
 
 def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner for Qwen2.5-Omni."""
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index f5f45a62ca2f..83d66c0d662f 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -527,7 +527,7 @@ def _placeholder_str(self, modality: ModalityStr,
 
         if model_type in ("aya_vision", "chameleon", "deepseek_vl_v2",
                           "internvl_chat", "ovis", "skywork_chat",
-                          "NVLM_D", "h2ovl_chat", "idefics3", "smolvlm"):
+                          "NVLM_D", "h2ovl_chat", "idefics3", "smolvlm", "ovis2_5"):
             return "<image>"
         if model_type in ("mllama", "llama4"):
             return "<|image|>"
diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py
new file mode 100644
index 000000000000..b6fbd9ab82d2
--- /dev/null
+++ b/vllm/model_executor/models/ovis2_5.py
@@ -0,0 +1,567 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+""" PyTorch Ovis2.5 model."""
+from collections.abc import Iterable, Mapping
+from functools import partial
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig
+
+from vllm.config import VllmConfig
+from vllm.model_executor.layers.linear import ReplicatedLinear
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+from vllm.model_executor.models.ovis import (OvisImagePatchInputs,
+                                             VisualEmbedding)
+from vllm.model_executor.models.siglip2navit import Siglip2NavitModel
+from vllm.model_executor.models.utils import (AutoWeightsLoader, flatten_bn,
+                                              init_vllm_registered_model,
+                                              maybe_prefix)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs)
+from vllm.multimodal.parse import ImageSize, MultiModalDataItems
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor
+
+from .interfaces import MultiModalEmbeddings, SupportsMultiModal
+
+IMAGE_TOKEN = "<image>"
+VIDEO_TOKEN = "