From 1aadbf37d48ad548ecc192583987d594d796477f Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Tue, 30 Sep 2025 11:52:09 -0700 Subject: [PATCH 01/12] Ovis 2 5 (#1993) ## Essential Elements of an Effective PR Description Checklist - [x] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)". - [x] The test plan, such as providing test command. - [ ] The test results, such as pasting the results comparison before and after, or e2e results ## Purpose Add support for the Ovis2.5 vision-language model: a new `ovis2_5` model implementation backed by the `Siglip2Navit` vision encoder, an `Ovis2_5Processor`, image-placeholder handling in `chat_utils`, a multi-image offline-inference example, and Jenkins lm-eval/vision test configs. ## Test Plan Run the new CI configs: `cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-ovis.txt -t 1` (GSM8K accuracy) and `cd .jenkins/vision && bash run-tests.sh -c configs/models-ovis.txt -t 1` (multimodal generation). ## Test Result --------- Co-authored-by: Christopher Manteuffel --- .../lm-eval-harness/configs/models-ovis.txt | 1 + .../lm-eval-harness/configs/ovis2_5-9b.yaml | 12 + .jenkins/test_config.yaml | 107 +-- .jenkins/vision/configs/models-ovis.txt | 1 + .jenkins/vision/configs/ovis2_5-9b.yaml | 7 + .jenkins/vision/test_enc_dec_model.py | 2 + .../vision_language_multi_image.py | 31 + .../generation/vlm_utils/model_utils.py | 57 ++ vllm/entrypoints/chat_utils.py | 3 +- vllm/model_executor/models/ovis2_5.py | 566 ++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/model_executor/models/siglip2navit.py | 626 ++++++++++++++++++ vllm/transformers_utils/config.py | 16 + .../transformers_utils/processors/__init__.py | 3 +- vllm/transformers_utils/processors/ovis2_5.py | 458 +++++++++++++ 15 files changed, 1841 insertions(+), 50 deletions(-) create mode 100644 .jenkins/lm-eval-harness/configs/models-ovis.txt create mode 100644 .jenkins/lm-eval-harness/configs/ovis2_5-9b.yaml create mode 100644 .jenkins/vision/configs/models-ovis.txt create mode 100644 .jenkins/vision/configs/ovis2_5-9b.yaml create mode 100644 vllm/model_executor/models/ovis2_5.py create mode 100644 vllm/model_executor/models/siglip2navit.py create mode 100644 vllm/transformers_utils/processors/ovis2_5.py diff --git a/.jenkins/lm-eval-harness/configs/models-ovis.txt b/.jenkins/lm-eval-harness/configs/models-ovis.txt new file mode 100644 index 000000000000..c20ecf534544 --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/models-ovis.txt @@ -0,0 +1 @@ +ovis2_5-9b.yaml \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/configs/ovis2_5-9b.yaml b/.jenkins/lm-eval-harness/configs/ovis2_5-9b.yaml new file mode 100644 index 000000000000..19546a4bd3e1 --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/ovis2_5-9b.yaml @@ -0,0 +1,12 @@ +model_name: "/mnt/weka/data/llm/aidc-ai/ovis2.5-9b" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.700 + - name: "exact_match,flexible-extract" + value: 0.700 +limit: 256 +num_fewshot: 8 +dtype: "bfloat16" +trust_remote_code: True diff --git a/.jenkins/test_config.yaml b/.jenkins/test_config.yaml index 0ac488e1b150..d2c55a038363 100644 --- a/.jenkins/test_config.yaml +++ b/.jenkins/test_config.yaml @@ -5,54 +5,59 @@ stages: - name: v0_gsm8k_small_g3_tp1_part1 flavor: g3 command: >- - export PT_HPU_LAZY_MODE=1 && + export PT_HPU_LAZY_MODE=1 && cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-small.txt -t 1 - name: v0_gsm8k_small_g3_tp1_part2 flavor: g3 command: >- - export PT_HPU_LAZY_MODE=1 && + export PT_HPU_LAZY_MODE=1 && cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-small-2.txt -t 1 - name: v0_gsm8k_small_g3_tp1_part3 flavor: g3 command: >- - export PT_HPU_LAZY_MODE=1 && + export PT_HPU_LAZY_MODE=1 && cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-small-3.txt -t 1 - name: v0_gsm8k_small_g3_tp2 flavor: g3.s command: >- - export PT_HPU_LAZY_MODE=1 && + export PT_HPU_LAZY_MODE=1 && cd 
.jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-small.txt -t 2 - name: v0_gsm8k_small_g2_tp1 flavor: g2 command: >- - export PT_HPU_LAZY_MODE=1 && + export PT_HPU_LAZY_MODE=1 && cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-small.txt -t 1 - name: v0_gsm8k_small_g2_tp2 flavor: g2.s command: >- - export PT_HPU_LAZY_MODE=1 && + export PT_HPU_LAZY_MODE=1 && cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-small.txt -t 2 - name: v0_gsm8k_g2_deepseek-v2-lite_tp1 flavor: g3 command: >- - export PT_HPU_LAZY_MODE=1 && + export PT_HPU_LAZY_MODE=1 && cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-deepseek.txt -t 1 - name: v0_gsm8k_g3_gemma3_tp1 flavor: g3.s command: >- export PT_HPU_LAZY_MODE=1 && cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-gemma.txt -t 1 + - name: v0_gsm8k_g3_ovis2_5_tp1 + flavor: g3.s + command: >- + export PT_HPU_LAZY_MODE=1 && export VLLM_SKIP_WARMUP=true && + cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-ovis.txt -t 1 - name: test_gsm8k_small_models_apc steps: - name: gsm8k_small_g3_tp1_apc flavor: g3 command: >- - export VLLM_CONTIGUOUS_PA=false && + export VLLM_CONTIGUOUS_PA=false && cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-small.txt -t 1 -a - name: gsm8k_small_g2_tp1_apc flavor: g2 command: >- - export VLLM_CONTIGUOUS_PA=false && + export VLLM_CONTIGUOUS_PA=false && cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-small.txt -t 1 -a - name: test_gsm8k_small_models_merged_prefill steps: @@ -66,100 +71,100 @@ stages: - name: v0_gsm8k_large_g3_tp2_part1 flavor: g3.s command: >- - export PT_HPU_LAZY_MODE=1 && + export PT_HPU_LAZY_MODE=1 && cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-large.txt -t 2 - name: v0_gsm8k_large_g3_tp2_part2 flavor: g3.s command: >- - export PT_HPU_LAZY_MODE=1 && + export PT_HPU_LAZY_MODE=1 && cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-large-2.txt -t 2 - name: v0_gsm8k_large_g2_tp4 flavor: g2.m command: >- - export PT_HPU_LAZY_MODE=1 && + export PT_HPU_LAZY_MODE=1 && cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-large.txt -t 4 - name: test_gsm8k_fp8 steps: - name: gsm8k_small_g3_tp1_fp8 flavor: g3 command: >- - cd .jenkins/lm-eval-harness && - PT_HPU_LAZY_MODE=1 + cd .jenkins/lm-eval-harness && + PT_HPU_LAZY_MODE=1 bash run-tests.sh -c configs/models-fp8-g3-tp1.txt -t 1 # - name: gsm8k_small_g3_tp2_fp8 # flavor: g3.s # command: >- - # cd .jenkins/lm-eval-harness && - # PT_HPU_LAZY_MODE=1 + # cd .jenkins/lm-eval-harness && + # PT_HPU_LAZY_MODE=1 # bash run-tests.sh -c configs/models-fp8.txt -t 2 - name: test_gsm8k_fp8_bypass_inc steps: - name: gsm8k_fp8_llama4_scout_g3_tp2_compressed_tensor flavor: g3.s command: >- - cd .jenkins/lm-eval-harness && - PT_HPU_LAZY_MODE=1 + cd .jenkins/lm-eval-harness && + PT_HPU_LAZY_MODE=1 bash run-tests.sh -c configs/models-fp8-compressedtensor.txt -t 2 - name: gsm8k_fp8_qwen3_30B_g3_tp1_block_scale_dynamic flavor: g3 command: >- - cd .jenkins/lm-eval-harness && - PT_HPU_LAZY_MODE=1 + cd .jenkins/lm-eval-harness && + PT_HPU_LAZY_MODE=1 bash run-tests.sh -c configs/models-fp8-blockfp8.txt -t 1 - name: gsm8k_fp8_qwen3_30B_g3_tp1_block_scale_dequant flavor: g3 command: >- - cd .jenkins/lm-eval-harness && - PT_HPU_LAZY_MODE=1 VLLM_HPU_FORCE_CHANNEL_FP8=0 + cd .jenkins/lm-eval-harness && + PT_HPU_LAZY_MODE=1 VLLM_HPU_FORCE_CHANNEL_FP8=0 bash run-tests.sh -c configs/models-fp8-blockfp8.txt -t 1 - name: 
test_gsm8k_mss steps: - name: gsm8k_small_g3_tp1_mss flavor: g3 command: >- - cd .jenkins/lm-eval-harness && - PT_HPU_LAZY_MODE=1 + cd .jenkins/lm-eval-harness && + PT_HPU_LAZY_MODE=1 bash run-tests.sh -c configs/models-mss.txt -t 1 - name: gsm8k_small_g2_tp1_mss flavor: g2 command: >- - cd .jenkins/lm-eval-harness && - PT_HPU_LAZY_MODE=1 + cd .jenkins/lm-eval-harness && + PT_HPU_LAZY_MODE=1 bash run-tests.sh -c configs/models-mss.txt -t 1 - name: gsm8k_small_g3_tp2_mss flavor: g3.s command: >- - cd .jenkins/lm-eval-harness && - PT_HPU_LAZY_MODE=1 + cd .jenkins/lm-eval-harness && + PT_HPU_LAZY_MODE=1 bash run-tests.sh -c configs/models-mss.txt -t 2 - name: gsm8k_small_g2_tp2_mss flavor: g2.s command: >- - cd .jenkins/lm-eval-harness && - PT_HPU_LAZY_MODE=1 + cd .jenkins/lm-eval-harness && + PT_HPU_LAZY_MODE=1 bash run-tests.sh -c configs/models-mss.txt -t 2 - name: gsm8k_small_g2_tp1_spec_decode flavor: g2 command: >- - cd .jenkins/lm-eval-harness && - PT_HPU_LAZY_MODE=1 + cd .jenkins/lm-eval-harness && + PT_HPU_LAZY_MODE=1 bash run-tests.sh -c configs/models-mss.txt -t 1 - name: test_gsm8k_spec_decode steps: # - name: gsm8k_small_g2_tp1_mlp_spec_decode # flavor: g2 # command: >- - # PT_HPU_LAZY_MODE=1 VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True + # PT_HPU_LAZY_MODE=1 VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True # pytest -v tests/spec_decode/e2e/test_mlp_correctness.py::test_mlp_e2e_greedy_correctness - name: gsm8k_small_g2_tp1_medusa_spec_decode flavor: g2 command: >- - PT_HPU_LAZY_MODE=1 VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True + PT_HPU_LAZY_MODE=1 VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True pytest -v tests/spec_decode/e2e/test_medusa_correctness.py::test_medusa_e2e_greedy_correctness - name: gsm8k_small_g2_tp1_eagle_spec_decode flavor: g2 command: >- - PT_HPU_LAZY_MODE=1 VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True + PT_HPU_LAZY_MODE=1 VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True pytest -v tests/spec_decode/e2e/test_eagle_correctness.py::test_eagle_e2e_greedy_correctness #TODO(kwisniewski98) temporary disable test, until model specific for Gaudi2 is uploaded to test infrastructure # - name: test_deepseek_mtp @@ -167,38 +172,38 @@ stages: # - name: test_deepseek_mtp_correctness # flavor: g3 # command: >- - # PT_HPU_LAZY_MODE=1 VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True + # PT_HPU_LAZY_MODE=1 VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True # pytest -v tests/spec_decode/e2e/test_mtp_correctness.py::test_mtp_e2e_greedy_correctness - name: tests_lora steps: - name: test_llama_lora flavor: g2 command: >- - PT_HPU_LAZY_MODE=1 VLLM_SKIP_WARMUP=true + PT_HPU_LAZY_MODE=1 VLLM_SKIP_WARMUP=true pytest -v tests/lora/test_llama_hpu.py::test_llama_lora_1x - name: test_multilora flavor: g2 command: >- - PT_HPU_LAZY_MODE=1 VLLM_SKIP_WARMUP=true + PT_HPU_LAZY_MODE=1 VLLM_SKIP_WARMUP=true pytest -v tests/lora/test_multilora_hpu.py::test_llama_multilora_1x # - name: test_long_context # flavor: g2 # command: >- - # PT_HPU_LAZY_MODE=1 VLLM_SKIP_WARMUP=true + # PT_HPU_LAZY_MODE=1 VLLM_SKIP_WARMUP=true # pytest -v tests/lora/test_long_context_hpu.py::test_quality - name: tests_multimodal steps: - name: multimodal_small_g3_tp1 flavor: g3 command: >- - cd .jenkins/vision && - PT_HPU_LAZY_MODE=1 + cd .jenkins/vision && + PT_HPU_LAZY_MODE=1 bash run-tests.sh -c configs/models-small.txt -t 1 - name: multimodal_small_g3_tp2 flavor: g3.s command: >- cd .jenkins/vision && - PT_HPU_LAZY_MODE=1 + PT_HPU_LAZY_MODE=1 bash run-tests.sh -c configs/models-small.txt -t 2 - name: multimodal_qwen_tp1 
flavor: g3.s @@ -210,13 +215,13 @@ stages: flavor: g3 command: >- cd .jenkins/vision && - PT_HPU_LAZY_MODE=1 + PT_HPU_LAZY_MODE=1 bash run-tests.sh -c configs/models-mss.txt -t 1 - name: multimodal_small_g3_tp2_mss flavor: g3.s command: >- cd .jenkins/vision && - PT_HPU_LAZY_MODE=1 + PT_HPU_LAZY_MODE=1 bash run-tests.sh -c configs/models-mss.txt -t 2 - name: multimodal_llama4_scout_g3_tp2_ep flavor: g3.s @@ -230,17 +235,23 @@ stages: cd .jenkins/vision && PT_HPU_LAZY_MODE=1 bash run-tests.sh -c configs/models-gemma.txt -t 1 + - name: multimodal_ovis2_5_g3_tp1_ep + flavor: g3.s + command: >- + cd .jenkins/vision && + PT_HPU_LAZY_MODE=1 VLLM_SKIP_WARMUP=true + bash run-tests.sh -c configs/models-ovis.txt -t 1 - name: tests_int4_quantization steps: - name: test_awq flavor: g2 command: >- - PT_HPU_LAZY_MODE=1 VLLM_SKIP_WARMUP=true + PT_HPU_LAZY_MODE=1 VLLM_SKIP_WARMUP=true pytest -v tests/quantization/test_awq.py::test_awq - name: test_gptq flavor: g2 command: >- - PT_HPU_LAZY_MODE=1 VLLM_SKIP_WARMUP=true + PT_HPU_LAZY_MODE=1 VLLM_SKIP_WARMUP=true pytest -v tests/quantization/test_gptq.py::test_gptq - name: tests_guided_decode steps: @@ -248,8 +259,8 @@ stages: flavor: g2 command: >- pip install -e tests/vllm_test_utils && - export VLLM_SKIP_WARMUP=true && PT_HPU_LAZY_MODE=1 - pytest -v tests/entrypoints/llm/test_lazy_outlines.py -s -vvv --log-cli-level=INFO + export VLLM_SKIP_WARMUP=true && PT_HPU_LAZY_MODE=1 + pytest -v tests/entrypoints/llm/test_lazy_outlines.py -s -vvv --log-cli-level=INFO # - name: test_guided_generate # flavor: g2 # command: >- diff --git a/.jenkins/vision/configs/models-ovis.txt b/.jenkins/vision/configs/models-ovis.txt new file mode 100644 index 000000000000..c20ecf534544 --- /dev/null +++ b/.jenkins/vision/configs/models-ovis.txt @@ -0,0 +1 @@ +ovis2_5-9b.yaml \ No newline at end of file diff --git a/.jenkins/vision/configs/ovis2_5-9b.yaml b/.jenkins/vision/configs/ovis2_5-9b.yaml new file mode 100644 index 000000000000..8c86fc59245d --- /dev/null +++ b/.jenkins/vision/configs/ovis2_5-9b.yaml @@ -0,0 +1,7 @@ +model_name: "/mnt/weka/data/llm/aidc-ai/ovis2.5-9b" +dtype: "bfloat16" +max_model_len: 32768 +max_num_seqs: 32 +num_prompts: 4 +limit_mm_per_prompt_image: 5 +trust_remote_code: True diff --git a/.jenkins/vision/test_enc_dec_model.py b/.jenkins/vision/test_enc_dec_model.py index a1571c64f41a..2e021a2ebb70 100644 --- a/.jenkins/vision/test_enc_dec_model.py +++ b/.jenkins/vision/test_enc_dec_model.py @@ -24,6 +24,7 @@ def fail_on_exit(): def launch_enc_dec_model(config, question, images): model_name = config.get('model_name') dtype = config.get('dtype', 'bfloat16') + trust_remote_code = config.get('trust_remote_code', False) max_num_seqs = config.get('max_num_seqs', 128) max_model_len = config.get('max_model_len', 4096) enforce_eager = config.get('enforce_eager', False) @@ -41,6 +42,7 @@ def launch_enc_dec_model(config, question, images): enable_expert_parallel=enable_expert_parallel, enforce_eager=enforce_eager, limit_mm_per_prompt={"image": limit_mm_per_prompt_image}, + trust_remote_code=trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained(model_name) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index ea7a793d026b..899f0a009532 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -460,6 +460,36 @@ def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData: ) +# ovis2_5 +def 
load_ovis2_5(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "AIDC-AI/Ovis2.5-2B" + + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + trust_remote_code=True, + dtype="half", + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = "\n".join( + f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1) + ) + messages = [{"role": "user", "content": f"{placeholders}\n{question}"}] + + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "mistral-community/pixtral-12b" @@ -742,6 +772,7 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData: "mllama": load_mllama, "NVLM_D": load_nvlm_d, "ovis": load_ovis, + "ovis2_5": load_ovis2_5, "phi3_v": load_phi3v, "phi4_mm": load_phi4mm, "pixtral_hf": load_pixtral_hf, diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index af4c72f44b67..2a937c56fbb9 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -788,6 +788,63 @@ def processor(*args, text="", images=None, **kwargs): return hf_model +def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + """Patches and returns an instance of the HfRunner to use for Ovis2.5.""" + hf_model.model.get_output_embeddings = lambda: \ + hf_model.model.llm.get_output_embeddings() + + def processor(*args, text="", images=None, videos=None, **kwargs): + if images is None: + images = [] + else: + images = [images] if isinstance(images, Image) else images + if videos is None: + videos = [] + else: + videos = [videos] if isinstance(videos, np.ndarray) else videos + videos = [[Image.fromarray(frame) for frame in vid] + for vid in videos] + + prompt_start_and_end = { + "qwen2": ("<|im_start|>user\n", "<|im_end|>\n"), + "llama": + ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"), + "gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"), + } + for start, end in prompt_start_and_end.values(): + if start in text and end in text: + text = text.split(start)[1].split(end)[0] + break + + images_message = [{"type": "image", "image": img} for img in images] + videos_message = [{"type": "video", "video": vid} for vid in videos] + + messages = [{ + "role": + "user", + "content": [ + *images_message, + *videos_message, + { + "type": "text", + "text": text + }, + ], + }] + + input_ids, pixel_values, grid_thws = hf_model.model.preprocess_inputs( + messages=messages, enable_thinking=True) + inputs = { + "inputs": input_ids, + "pixel_values": pixel_values, + "grid_thws": grid_thws, + } + return BatchFeature(data=inputs, tensor_type="pt") + + hf_model.processor = processor + return hf_model + + def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner: """Patches and returns an instance of the HfRunner for Qwen2.5-Omni.""" thinker = hf_model.model.thinker diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index f5f45a62ca2f..af09cf0580ba 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -527,7 +527,8 @@ def _placeholder_str(self, modality: ModalityStr, if model_type in 
("aya_vision", "chameleon", "deepseek_vl_v2", "internvl_chat", "ovis", "skywork_chat", - "NVLM_D", "h2ovl_chat", "idefics3", "smolvlm"): + "NVLM_D", "h2ovl_chat", "idefics3", "smolvlm", + "ovis2_5"): return "" if model_type in ("mllama", "llama4"): return "<|image|>" diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py new file mode 100644 index 000000000000..dd110f7c7cf9 --- /dev/null +++ b/vllm/model_executor/models/ovis2_5.py @@ -0,0 +1,566 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" PyTorch Ovis model.""" +from collections.abc import Iterable, Mapping +from functools import partial +from typing import Optional, Union + +import torch +import torch.nn as nn +from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig + +from vllm.config import VllmConfig +from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.models.ovis import (OvisImagePatchInputs, + VisualEmbedding) +from vllm.model_executor.models.siglip2navit import Siglip2NavitModel +from vllm.model_executor.models.utils import (AutoWeightsLoader, flatten_bn, + init_vllm_registered_model, + maybe_prefix) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalKwargs) +from vllm.multimodal.parse import ImageSize, MultiModalDataItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor + +from .interfaces import MultiModalEmbeddings, SupportsMultiModal + +IMAGE_TOKEN = "" +VIDEO_TOKEN = "