From 36eb2cf9cec7ed62d40f29f28f2af48fef31794f Mon Sep 17 00:00:00 2001 From: Supreet Singh Date: Thu, 18 Sep 2025 05:37:04 -0700 Subject: [PATCH 01/20] Fixes HPU graph run for Gemma3 vision inputs (#1865) Fixes HPU graph issues for gemma3 vision inputs Text warmup to include attn_mask info, so vision+text data can reuse the graph for language model that's warmed up already. Changing slicing to index_select for multimodal bucketing for HPU. Slicing doesn't produce the same hash for the HPU graph with same input shape. Use buckets for the vision tower as well to reduce GC recompile Accuracy bug fix by clone output data of the multimodal-projector. Validated with Muirbench datasets. --- .../configs/Qwen2.5-VL-7B-Instruct.yaml | 2 +- vllm/model_executor/models/gemma3_mm.py | 33 ++++++------- vllm/worker/hpu_model_runner.py | 48 +++++++++++-------- 3 files changed, 44 insertions(+), 39 deletions(-) diff --git a/.jenkins/vision/configs/Qwen2.5-VL-7B-Instruct.yaml b/.jenkins/vision/configs/Qwen2.5-VL-7B-Instruct.yaml index 294b538633e5..840eaa12a922 100644 --- a/.jenkins/vision/configs/Qwen2.5-VL-7B-Instruct.yaml +++ b/.jenkins/vision/configs/Qwen2.5-VL-7B-Instruct.yaml @@ -1,5 +1,5 @@ model_name: "/mnt/weka/data/pytorch/Qwen/Qwen2.5-VL-7B-Instruct/" dtype: "bfloat16" -max_model_len: 32768 +max_model_len: 35840 max_num_seqs: 32 num_prompts: 4 diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index b80a4ab5951c..578e49e7fd78 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -569,11 +569,6 @@ def _process_image_input( pixel_values = image_input["pixel_values"] num_patches = image_input["num_patches"] - image_features = self._image_pixels_to_features( - self.vision_tower, - pixel_values, - ) - if is_hpu: batch_breakdown = greedy_plan(pixel_values.shape[0], \ self.vision_buckets.multimodal_buckets) @@ -582,22 +577,24 @@ def _process_image_input( for i in batch_breakdown: end_idx = start_idx + i - batch_sliced_image_features = \ - image_features[start_idx:end_idx, ...] 
- if is_lazy: - image_embeds_multibatches += \ - [self.multi_modal_projector( - batch_sliced_image_features, - bypass_hpu_graphs=i - not in self.graphed_multimodal_buckets - and len(self.graphed_multimodal_buckets) > 0)] - else: - image_embeds_multibatches += \ - [self.multi_modal_projector( \ - batch_sliced_image_features)] + indices = torch.arange(start_idx, end_idx) + batch_sliced_pixel_values = torch.index_select(pixel_values, + dim=0, + index=indices) + + image_features = self._image_pixels_to_features( + self.vision_tower, + batch_sliced_pixel_values, + ) + image_embeds = self.multi_modal_projector(image_features) + image_embeds_multibatches += [image_embeds.clone()] start_idx = end_idx image_embeds = torch.cat(image_embeds_multibatches, dim=0) else: + image_features = self._image_pixels_to_features( + self.vision_tower, + pixel_values, + ) image_embeds = self.multi_modal_projector(image_features) return [ e.flatten(0, 1) for e in image_embeds.split(num_patches.tolist()) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 3fd9e2c78158..f400db0d7c07 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -373,7 +373,7 @@ def __init__(self, model, vllm_config, is_causal, sampler): if self.is_mm_optimized: if hasattr(self.model, 'vision_tower'): self.model.vision_tower = htorch.hpu.wrap_in_hpu_graph( - self.model.vision_tower, disable_tensor_cache=True) + self.model.vision_tower, disable_tensor_cache=False) if hasattr(self.model, 'multi_modal_projector'): self.model.multi_modal_projector = \ htorch.hpu.wrap_in_hpu_graph( \ @@ -619,13 +619,19 @@ def _update_metadata(self, device, dtype, True) return attn_metadata - def compute_input_embeddings_for_mm_optimized(self, **kwargs): + def compute_input_embeddings_for_mm_optimized(self, warmup_mode, **kwargs): input_ids = kwargs['input_ids'] vision_embeddings = self.model.get_multimodal_embeddings(**kwargs) inputs_embeds = self.model.get_input_embeddings( input_ids, vision_embeddings) - if vision_embeddings is not None: + # TODO: In warmup, we need to warmup the model with dummy image data for + # multimodal model for prompt, here instead of generating a dummy image, + # we are just generating attn_mask for the images and pass with + # attn_metadata, so we can reuse HPU graph without running + # the whole vision tower. 
+ if vision_embeddings is not None or ( + warmup_mode and kwargs['attn_metadata'].is_prompt): input_ids = kwargs['input_ids'] positions = kwargs['positions'] kwargs = self.model.prepare_attn_masks( @@ -634,14 +640,16 @@ def compute_input_embeddings_for_mm_optimized(self, **kwargs): ) kwargs['input_ids'] = input_ids kwargs['positions'] = positions - #input_ids = None kwargs.update({'inputs_embeds': inputs_embeds}) - # done compute the visual tokens + # done compute the visual tokens and others kwargs.pop('pixel_values', None) + kwargs.pop("num_crops", None) + kwargs.pop("graphed_multimodal_buckets", None) return kwargs - def compute_input_embeddings_for_mrope_mm_optimized(self, **kwargs): + def compute_input_embeddings_for_mrope_mm_optimized( + self, warmup_mode, **kwargs): if 'inputs_embeds' in kwargs: return kwargs @@ -680,7 +688,8 @@ def compute_input_embeddings_for_mrope_mm_optimized(self, **kwargs): kwargs.pop('image_grid_thw', None) return kwargs else: - return self.compute_input_embeddings_for_mm_optimized(**kwargs) + return self.compute_input_embeddings_for_mm_optimized( + warmup_mode, **kwargs) def forward(self, *args, **kwargs): kwargs = kwargs.copy() @@ -692,9 +701,9 @@ def forward(self, *args, **kwargs): virtual_engine = kwargs.pop('virtual_engine') input_ids = kwargs['input_ids'] - global_attn_masks = kwargs.get("global_attn_masks") \ + global_attn_masks = kwargs.pop("global_attn_masks") \ if kwargs.get("global_attn_masks") else None - local_attn_masks = kwargs.get("local_attn_masks") \ + local_attn_masks = kwargs.pop("local_attn_masks") \ if kwargs.get("local_attn_masks") else None kwargs['attn_metadata'] = self._update_metadata( @@ -1396,12 +1405,8 @@ def get_model(self) -> torch.nn.Module: return self.model.model return self.model - def _use_graphs(self, img_args=None): - if not img_args: - return not self.enforce_eager - #TODO: We might need to check both language bucket and multimodal bucket - # and return True only it's avialble, or return separately. 
- return (img_args) in self.graphed_multimodal_buckets + def _use_graphs(self): + return not self.enforce_eager def _is_valid_bucket(self, bucket): return bucket[0] * bucket[1] <= self.max_num_batched_tokens @@ -2667,7 +2672,7 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: def create_dummy_multi_modal_seq_group_metadata(self, group_id, img_args, sampling_params, - lora_request): + lora_request, seq_len): assert self.model_is_mrope or self.is_mm_optimized, \ ("Warmup compatible with Qwen2vl/Gemma3 models") if img_args == UNSET_IMG_ARGS: @@ -2712,7 +2717,9 @@ def create_dummy_multi_modal_seq_group_metadata(self, group_id, img_args, } image_token_id = self.get_model().config.image_token_id - prompt_token_ids = [image_token_id] * num_image_tokens + prompt_token_ids_image = [image_token_id] * num_image_tokens + prompt_token_ids = [0] * ( + seq_len - len(prompt_token_ids_image)) + prompt_token_ids_image prompt_token_ids_array = array('l', prompt_token_ids) # noqa: F821 placeholders_by_modality = { 'image': @@ -2756,6 +2763,7 @@ def create_dummy_seq_group_metadata(self, img_args=img_args, sampling_params=sampling_params, lora_request=lora_request, + seq_len=seq_len, ) else: input_len = seq_len @@ -2867,7 +2875,7 @@ def warmup_scenario(self, align_worker=False, is_dummy_run=False) -> None: phase = 'prompt' if is_prompt else 'decode' - use_graphs = is_dummy_run or self._use_graphs(img_args) + use_graphs = is_dummy_run or self._use_graphs() scenario_name = ("warmup_" f"{phase}_" @@ -3664,8 +3672,7 @@ def execute_model( if not warmup_mode: ctx_blocks = seq_len seq_len = 1 - img_args = self._get_img_args_from_model_input(model_input) - use_graphs = self._use_graphs(img_args=img_args) + use_graphs = self._use_graphs() self._check_config(batch_size, seq_len, ctx_blocks, attn_metadata, warmup_mode) lora_mask: torch.Tensor = None @@ -3831,6 +3838,7 @@ def try_revert_dummy_output_tokens(): # hpu graphs, hence turning it to a list execute_model_kwargs = \ self.model.compute_input_embeddings_for_mrope_mm_optimized( + warmup_mode, **execute_model_kwargs ) if warmup_mode and bypass_model_exec: From ee517a2fcef8ecd72c307abf2b528bc6010a5a8d Mon Sep 17 00:00:00 2001 From: Artur Fierka Date: Mon, 22 Sep 2025 14:42:44 +0200 Subject: [PATCH 02/20] Update common.txt (#1956) Add missing modelscope package - `VLLM_USE_MODELSCOPE` env doesn't work without it. --- requirements/common.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/common.txt b/requirements/common.txt index a6a1ffe76196..be513f444a6e 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -48,3 +48,4 @@ opentelemetry-sdk>=1.26.0 # vllm.tracing opentelemetry-api>=1.26.0 # vllm.tracing opentelemetry-exporter-otlp>=1.26.0 # vllm.tracing opentelemetry-semantic-conventions-ai>=0.4.1 # vllm.tracing +modelscope # required to support VLLM_USE_MODELSCOPE env From bb96123af496a8a197b0779a240076f7859f41a7 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Mon, 22 Sep 2025 14:00:13 -0700 Subject: [PATCH 03/20] Merge Libint/intervl_bucket (#1965) ## Essential Elements of an Effective PR Description Checklist - [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)". - [ ] The test plan, such as providing test command. 
- [ ] The test results, such as pasting the results comparison before and after, or e2e results ## Purpose ## Test Plan ## Test Result --------- Signed-off-by: Libin Tang Co-authored-by: Libin Tang Co-authored-by: Libin Tang --- vllm/model_executor/models/gemma3_mm.py | 3 +- vllm/model_executor/models/internvl.py | 91 ++++++++++++++++---- vllm/worker/hpu_model_runner.py | 107 ++++++++++++++++-------- 3 files changed, 148 insertions(+), 53 deletions(-) diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 578e49e7fd78..24fb4147e680 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -577,7 +577,8 @@ def _process_image_input( for i in batch_breakdown: end_idx = start_idx + i - indices = torch.arange(start_idx, end_idx) + indices = torch.arange(start_idx, + end_idx).to(pixel_values.device) batch_sliced_pixel_values = torch.index_select(pixel_values, dim=0, index=indices) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 0c61369c5f51..c2acb9f69ef0 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -7,6 +7,7 @@ # Copyright (c) 2023 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- +import os from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence from typing import Any, Literal, Optional, TypedDict, TypeVar, Union @@ -35,13 +36,15 @@ BaseProcessingInfo, PromptReplacement, PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.transformers_utils.tokenizer import AnyTokenizer from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) -from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - maybe_prefix, merge_multimodal_embeddings) +from .utils import (AutoWeightsLoader, flatten_bn, greedy_plan, + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) IMG_START = '' IMG_END = '' @@ -50,6 +53,9 @@ IMAGENET_MEAN = (0.485, 0.456, 0.406) IMAGENET_STD = (0.229, 0.224, 0.225) +is_hpu = current_platform.is_hpu() +is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '0') == '1' if is_hpu else False + class InternVLImagePixelInputs(TypedDict): type: Literal["pixel_values"] @@ -1062,6 +1068,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.visual_token_mask = None self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) + if is_hpu: + self.graphed_multimodal_buckets = None def _patch_quant_config(self, config: PretrainedConfig, quant_config: QuantizationConfig): @@ -1127,16 +1135,64 @@ def pixel_shuffle(self, x, scale_factor=0.5): return x def extract_feature(self, pixel_values: torch.Tensor) -> torch.Tensor: - vit_embeds = self.vision_model(pixel_values=pixel_values) - vit_embeds = vit_embeds[:, 1:, :] - - h = w = int(vit_embeds.shape[1]**0.5) - vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) - vit_embeds = self.pixel_shuffle(vit_embeds, - scale_factor=self.downsample_ratio) - vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, - vit_embeds.shape[-1]) - vit_embeds = self.mlp1(vit_embeds) + if is_hpu: + if self.vision_buckets.multimodal_buckets: + batch_breakdown = greedy_plan(pixel_values.shape[0], \ + 
self.vision_buckets.multimodal_buckets) + else: + batch_breakdown = [pixel_values.shape[0]] + + start_idx = 0 + vit_embeds_minibatches = [] + + for i in batch_breakdown: + end_idx = start_idx + i + batch_sliced_pixel_values = \ + pixel_values[start_idx:end_idx, ...] + if is_lazy: + vit_embeds_minibatch = \ + self.vision_model( + pixel_values=batch_sliced_pixel_values, + bypass_hpu_graphs=i + not in self.graphed_multimodal_buckets + and len(self.graphed_multimodal_buckets) > 0) + else: + vit_embeds_minibatch = \ + self.vision_model( + pixel_values=batch_sliced_pixel_values) + + vit_embeds_minibatch = vit_embeds_minibatch[:, 1:, :] + + h = w = int(vit_embeds_minibatch.shape[1]**0.5) + vit_embeds_minibatch = vit_embeds_minibatch.reshape( + vit_embeds_minibatch.shape[0], h, w, -1) + vit_embeds_minibatch = self.pixel_shuffle( + vit_embeds_minibatch, scale_factor=self.downsample_ratio) + vit_embeds_minibatch = vit_embeds_minibatch.reshape( + vit_embeds_minibatch.shape[0], -1, + vit_embeds_minibatch.shape[-1]) + + if is_lazy: + vit_embeds_minibatches += [ + self.mlp1(vit_embeds_minibatch, + bypass_hpu_graphs=i + not in self.graphed_multimodal_buckets + and len(self.graphed_multimodal_buckets) > 0) + ] + else: + vit_embeds_minibatches += [self.mlp1(vit_embeds_minibatch)] + start_idx = end_idx + vit_embeds = torch.cat(vit_embeds_minibatches, dim=0) + else: + vit_embeds = self.vision_model(pixel_values=pixel_values) + vit_embeds = vit_embeds[:, 1:, :] + h = w = int(vit_embeds.shape[1]**0.5) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) + vit_embeds = self.pixel_shuffle(vit_embeds, + scale_factor=self.downsample_ratio) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, + vit_embeds.shape[-1]) + vit_embeds = self.mlp1(vit_embeds) return vit_embeds def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: @@ -1180,8 +1236,11 @@ def _parse_and_validate_image_input( image_token_id = kwargs["image_token_id"] assert isinstance(image_token_id, torch.Tensor) - self.img_context_token_id = image_token_id.flatten().unique().item() - + if is_hpu: + self.img_context_token_id = image_token_id.flatten() + else: + self.img_context_token_id = image_token_id.flatten().unique().item( + ) if pixel_values_flat is not None: if not isinstance(pixel_values_flat, (torch.Tensor, list)): raise ValueError("Incorrect type of pixel values. 
" @@ -1306,7 +1365,9 @@ def get_language_model(self) -> torch.nn.Module: def get_multimodal_embeddings( self, **kwargs: object) -> Optional[MultiModalEmbeddings]: - + if is_hpu: + self.graphed_multimodal_buckets = kwargs.pop( + 'graphed_multimodal_buckets', []) modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: return None diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index f400db0d7c07..b49c61bfe2a4 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -112,16 +112,22 @@ class VisionBuckets: This class is used to bucket image tokens ''' - def __init__(self, is_batch_based): - self.is_batch_based = is_batch_based + def __init__(self, model): + self.is_batch_based = True envvar = os.environ.get('VLLM_MULTIMODAL_BUCKETS', "") if envvar == 'None': self.multimodal_buckets = None else: if envvar == "": - if is_batch_based: + if 'InternVLChatModel' in str(type(model)): + multimodal_buckets = list( + range(model.config.min_dynamic_patch, + model.config.max_dynamic_patch + + 2)) #As use_thumbnail is true + elif 'Gemma3ForConditionalGeneration' in str(type(model)): multimodal_buckets = [1, 2, 4, 8] # batch sizes for gemma3 else: + self.is_batch_based = False multimodal_buckets = [ 1600, 3136, 4096, 6400, 7744, 9216, 12544 ] @@ -159,9 +165,11 @@ def __call__(cls, *args, **kwargs): def is_mm_optimized(model): - return 'Gemma3ForConditionalGeneration' in str(type(model.model)) \ - if hasattr(model, 'model') else \ - 'Gemma3ForConditionalGeneration' in str(type(model)) + mm_models = ['Gemma3ForConditionalGeneration', 'InternVLChatModel'] + + return any(m in str(type(model.model)) for m in mm_models) \ + if hasattr(model, 'model') \ + else any(m in str(type(model)) for m in mm_models) def pad_flat_tensor(tensor, desired_size): @@ -345,6 +353,7 @@ def __init__(self, model, vllm_config, is_causal, sampler): model_config = getattr(self.model, "config", None) self.model_is_mrope = uses_mrope(model_config) + self.is_mm_optimized = is_mm_optimized(self.model) text_config = vllm_config.model_config.hf_config.get_text_config() self.interleaved_sliding_window = getattr( @@ -379,6 +388,12 @@ def __init__(self, model, vllm_config, is_causal, sampler): htorch.hpu.wrap_in_hpu_graph( \ self.model.multi_modal_projector, \ disable_tensor_cache=True) + if hasattr(self.model, 'vision_model'): + self.model.vision_model = htorch.hpu.wrap_in_hpu_graph( + self.model.vision_model, disable_tensor_cache=True) + if hasattr(self.model, 'mlp1'): + self.model.mlp1 = htorch.hpu.wrap_in_hpu_graph( + self.model.mlp1, disable_tensor_cache=True) self._rotary_embed_module = self._get_rotary_embedding_module( self.model) @@ -624,7 +639,6 @@ def compute_input_embeddings_for_mm_optimized(self, warmup_mode, **kwargs): vision_embeddings = self.model.get_multimodal_embeddings(**kwargs) inputs_embeds = self.model.get_input_embeddings( input_ids, vision_embeddings) - # TODO: In warmup, we need to warmup the model with dummy image data for # multimodal model for prompt, here instead of generating a dummy image, # we are just generating attn_mask for the images and pass with @@ -632,18 +646,23 @@ def compute_input_embeddings_for_mm_optimized(self, warmup_mode, **kwargs): # the whole vision tower. 
if vision_embeddings is not None or ( warmup_mode and kwargs['attn_metadata'].is_prompt): - input_ids = kwargs['input_ids'] - positions = kwargs['positions'] - kwargs = self.model.prepare_attn_masks( - mask_dtype=self.dtype, - **kwargs, - ) - kwargs['input_ids'] = input_ids - kwargs['positions'] = positions + if hasattr(self.model, 'prepare_attn_masks'): + input_ids = kwargs['input_ids'] + positions = kwargs['positions'] + kwargs = self.model.prepare_attn_masks( + mask_dtype=self.dtype, + **kwargs, + ) + kwargs['input_ids'] = input_ids + kwargs['positions'] = positions + # done compute the visual tokens + kwargs.pop('pixel_values', None) + else: + kwargs.pop('pixel_values_flat', None) + kwargs.pop("image_num_patches", None) + kwargs.pop("image_token_id", None) kwargs.update({'inputs_embeds': inputs_embeds}) - # done compute the visual tokens and others - kwargs.pop('pixel_values', None) kwargs.pop("num_crops", None) kwargs.pop("graphed_multimodal_buckets", None) return kwargs @@ -699,7 +718,6 @@ def forward(self, *args, **kwargs): virtual_engine = 0 if 'virtual_engine' in kwargs: virtual_engine = kwargs.pop('virtual_engine') - input_ids = kwargs['input_ids'] global_attn_masks = kwargs.pop("global_attn_masks") \ if kwargs.get("global_attn_masks") else None @@ -1080,6 +1098,8 @@ def __init__( and not self.lora_config) self.use_delayed_sampling = get_config( ).use_delayed_sampling and can_use_delayed_sampling + self.mm_tokens_per_image = 1 + self.image_token_id = 0 def _set_gc_threshold(self) -> None: """ @@ -1497,10 +1517,16 @@ def move_to_device(self, tensor): non_blocking=True) def add_vision_buckets_to_mrope_mm_optimized(self): - model = self.get_model() - self.is_mm_optimized = is_mm_optimized(model) + self.is_mm_optimized = is_mm_optimized(self.model) if self.model_is_mrope or self.is_mm_optimized: - model.vision_buckets = VisionBuckets(self.is_mm_optimized) + if hasattr(self.model.model.config, 'mm_tokens_per_image'): + self.mm_tokens_per_image = \ + self.model.model.config.mm_tokens_per_image + self.image_token_id = self.model.model.config.image_token_id + elif 'InternVLChatModel' in str(type(self.model.model)): + self.image_token_id = 151667 + self.mm_tokens_per_image = self.model.model.num_image_token + self.model.model.vision_buckets = VisionBuckets(self.model.model) def _prepare_prompt( self, @@ -1631,7 +1657,6 @@ def _prepare_prompt( for idx in range(3): seq_data_mrope_positions[idx] \ .extend(mrope_positions[idx]) - multi_modal_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): @@ -2709,17 +2734,28 @@ def create_dummy_multi_modal_seq_group_metadata(self, group_id, img_args, else: s = self.model.model.config.vision_config.image_size pixel_values = torch.randn([img_args, 3, s, s]) - num_image_tokens = self.model.model.config.mm_tokens_per_image \ - * img_args - multi_modal_data = { - "pixel_values": pixel_values, - "num_crops": torch.zeros([img_args], dtype=torch.int32) - } - image_token_id = self.get_model().config.image_token_id - prompt_token_ids_image = [image_token_id] * num_image_tokens + if 'Gemma3ForConditionalGeneration' in str(type(self.model.model)): + multi_modal_data = { + "pixel_values": pixel_values, + "num_crops": torch.zeros([img_args], dtype=torch.int32), + } + elif 'InternVLChatModel' in str(type(self.model.model)): + multi_modal_data = { + "pixel_values_flat": + pixel_values.to(torch.bfloat16), + "image_num_patches": + torch.tensor([pixel_values.shape[0]], dtype=torch.int32), + "image_token_id": + 
torch.tensor([self.image_token_id], dtype=torch.int64), + } + else: + logger.warning("No support for other models yet") + num_image_tokens = self.mm_tokens_per_image * img_args + prompt_token_ids_image = [self.image_token_id] * num_image_tokens prompt_token_ids = [0] * ( seq_len - len(prompt_token_ids_image)) + prompt_token_ids_image + prompt_token_ids_array = array('l', prompt_token_ids) # noqa: F821 placeholders_by_modality = { 'image': @@ -3188,9 +3224,7 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: if graphs: self.graphed_buckets.add(cfg) if self.is_mm_run(): - img_args = (int(seq_len) // - self.model.model.config.mm_tokens_per_image - if self.is_mm_optimized else int(seq_len)) + img_args = int(seq_len) // self.mm_tokens_per_image self.warmup_scenario( int(bs), int(seq_len), @@ -3539,7 +3573,7 @@ def _get_seq_ids(self, model_input): def _get_img_args_from_model_input(self, model_input): if (not self.model_is_mrope and not self.is_mm_optimized) or \ not model_input.multi_modal_kwargs or \ - 'pixel_values' not in model_input.multi_modal_kwargs: + ('pixel_values') not in model_input.multi_modal_kwargs: return None if self.model_is_mrope: pixel_values_list = model_input.multi_modal_kwargs['pixel_values'] @@ -3816,18 +3850,17 @@ def try_revert_dummy_output_tokens(): 'real_seq_len': model_input.seq_lens, 'real_batch_size': real_batch_size } - #Need to set the window_slide mask at this point to decide if is_prompt: attn_metadata = self.model._update_use_window_sdpa( execute_model_kwargs['attn_metadata'], seq_len, bool(model_input.multi_modal_kwargs and \ - 'pixel_values' in model_input.multi_modal_kwargs)) + ('pixel_values')in model_input.multi_modal_kwargs)) execute_model_kwargs['attn_metadata'] = attn_metadata if not bypass_model_exec: if self.model_is_mrope or self.is_mm_optimized: - if 'pixel_values' in execute_model_kwargs and \ + if ('pixel_values') in execute_model_kwargs and \ self.is_mm_optimized: if warmup_mode and not is_pt_profiler_run: bypass_model_exec = True From 716f3fc2f8368c0ce2d2f03baf68b64190a8635e Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Mon, 22 Sep 2025 15:17:17 -0700 Subject: [PATCH 04/20] Introduce VLLM_WARMUP_WITH_PENALTY for internVL warmup (#1967) Introduce VLLM_WARMUP_WITH_PENALTY to call apply penalty code in sampler during warmup also. https://github.com/HabanaAI/vllm-fork/blob/libint/intervl_bucket/vllm/model_executor/layers/sampler.py#L280 is not called during warmup. And it causes extra graph compile during runtime as it sets to True for real run. 
--- vllm/worker/hpu_model_runner.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index b49c61bfe2a4..eeadd4a0f963 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -2784,11 +2784,15 @@ def create_dummy_seq_group_metadata(self, lora_request=None, img_args=None, temperature=0, + presence_penalty=0.0, + top_p=1.0, ctx=0): if self.is_pooler: sampling_params = None else: - sampling_params = SamplingParams(temperature=temperature) + sampling_params = SamplingParams(temperature=temperature, + presence_penalty=presence_penalty, + top_p=top_p) num_blocks = math.ceil(seq_len / self.block_size) seq_len = max(seq_len, 1) computed_block_nums = None @@ -2945,6 +2949,12 @@ def warmup_scenario(self, ] self.profiler.start('internal', scenario_name) times = num_iters if use_graphs or is_pt_profiler_run else 1 + presence_penalty = 1.0 if os.getenv('VLLM_WARMUP_WITH_PENALTY', + '0') == '1' else 0.0 + top_p = 0.1 if os.getenv('VLLM_WARMUP_WITH_PENALTY', + '0') == '1' else 1.0 + temperature = 1.0 if os.getenv('VLLM_WARMUP_WITH_PENALTY', + '0') == '1' else 0.0 if is_prompt: seqs = [ self.create_dummy_seq_group_metadata( @@ -2955,6 +2965,8 @@ def warmup_scenario(self, if dummy_lora_requests_per_seq else None, img_args=img_args, temperature=temperature, + presence_penalty=presence_penalty, + top_p=top_p, ctx=ctx) for i in range(batch_size) ] else: @@ -2968,6 +2980,8 @@ def warmup_scenario(self, lora_request=dummy_lora_requests_per_seq[i] if dummy_lora_requests_per_seq else None, temperature=temperature, + presence_penalty=presence_penalty, + top_p=top_p, ctx=ctx) for i, b in enumerate(blocks) ] if not is_dummy_run: From 30c226ecfa1b4a319b81edb22eca62fe6a35ae97 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Mon, 22 Sep 2025 15:25:18 -0700 Subject: [PATCH 05/20] Modify merge_multimodal_embeddings to static (#1969) --- vllm/model_executor/models/internvl.py | 18 +++++++++++++++++- vllm/model_executor/models/utils.py | 16 ++++++++++++++++ vllm/worker/hpu_model_runner.py | 24 ++++++++++++++++++++++-- 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index c2acb9f69ef0..92fe14de128f 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -44,7 +44,8 @@ SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, flatten_bn, greedy_plan, init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + merge_multimodal_embeddings, + merge_multimodal_embeddings_static) IMG_START = '' IMG_END = '' @@ -1390,6 +1391,21 @@ def get_multimodal_embeddings( return multimodal_embeddings + def get_input_embeddings_hpu( + self, + input_ids: torch.Tensor, + image_index_tensor: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings_static( + image_index_tensor, + inputs_embeds, + multimodal_embeddings, + ) + return inputs_embeds + def get_input_embeddings( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 96d529254b7a..822113b52a8a 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -433,6 +433,22 @@ def 
merge_multimodal_embeddings_from_map( return inputs_embeds +def merge_multimodal_embeddings_static( + is_multimodal_index: torch.Tensor, + inputs_embeds: torch.Tensor, + multimodal_embeddings: NestedTensors, +) -> torch.Tensor: + assert current_platform.is_hpu(), ("Support HPU only") + flattened = _flatten_embeddings(multimodal_embeddings) + + inputs_embeds_s = inputs_embeds.shape + inputs_embeds = inputs_embeds.view(inputs_embeds_s[0] * inputs_embeds_s[1], + inputs_embeds_s[2]) + inputs_embeds = inputs_embeds.index_copy_(0, is_multimodal_index, + flattened).view(inputs_embeds_s) + return inputs_embeds + + def _merge_multimodal_embeddings( inputs_embeds: torch.Tensor, is_multimodal: torch.Tensor, diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index eeadd4a0f963..e514469a942c 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -637,8 +637,14 @@ def _update_metadata(self, def compute_input_embeddings_for_mm_optimized(self, warmup_mode, **kwargs): input_ids = kwargs['input_ids'] vision_embeddings = self.model.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.model.get_input_embeddings( - input_ids, vision_embeddings) + if 'image_index' in kwargs: + inputs_embeds = self.model.get_input_embeddings_hpu( + input_ids, kwargs['image_index'], vision_embeddings) + kwargs.pop("image_index", None) + else: + inputs_embeds = self.model.get_input_embeddings( + input_ids, vision_embeddings) + # TODO: In warmup, we need to warmup the model with dummy image data for # multimodal model for prompt, here instead of generating a dummy image, # we are just generating attn_mask for the images and pass with @@ -1772,6 +1778,7 @@ def _prepare_prompt( pad=0, dtype=torch.long, flat=self.use_merged_prefill) + image_index_tensor = None if self.model_is_mrope: input_positions = \ make_mrope_positions_tensor_with_pad(input_positions=input_positions, @@ -1785,6 +1792,11 @@ def _prepare_prompt( dtype=torch.long, flat=self.use_merged_prefill) + if seq_group_metadata.multi_modal_data and self.is_mm_optimized and \ + 'InternVLChatModel' in str(type(self.model.model)): + is_image_flatten = ( + input_tokens_tensor == self.image_token_id).flatten() + image_index_tensor = is_image_flatten.nonzero().squeeze(-1) slot_mapping = make_cpu_tensor(slot_mapping, max_len=max_prompt_len, pad=_PAD_SLOT_ID, @@ -1872,6 +1884,8 @@ def _prepare_prompt( input_positions=input_positions, ) multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) + if image_index_tensor is not None: + multi_modal_kwargs['image_index'] = image_index_tensor multi_modal_kwargs = MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device) @@ -3872,6 +3886,12 @@ def try_revert_dummy_output_tokens(): ('pixel_values')in model_input.multi_modal_kwargs)) execute_model_kwargs['attn_metadata'] = attn_metadata + if 'image_index' in model_input.multi_modal_kwargs: + execute_model_kwargs[ + 'image_index'] = model_input.multi_modal_kwargs[ + 'image_index'] + model_input.multi_modal_kwargs.pop('image_index', None) + if not bypass_model_exec: if self.model_is_mrope or self.is_mm_optimized: if ('pixel_values') in execute_model_kwargs and \ From dacac74f7e17f4e4314030571c92ad159b26b4a4 Mon Sep 17 00:00:00 2001 From: Seunghyuk Park Date: Tue, 23 Sep 2025 21:05:17 +0300 Subject: [PATCH 06/20] Add Daniel's mediapipe changes --- vllm/entrypoints/chat_utils.py | 10 +- vllm/model_executor/models/internvl.py | 322 +++++++++++++++++++++++-- vllm/multimodal/image.py | 3 + vllm/multimodal/parse.py | 5 
+ vllm/multimodal/utils.py | 54 ++++- vllm/worker/hpu_model_runner.py | 12 +- 6 files changed, 372 insertions(+), 34 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index f5f45a62ca2f..9a0525c7933a 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -405,8 +405,14 @@ def _resolve_chat_template_content_format( jinja_text = (hf_chat_template if isinstance(hf_chat_template, str) else load_chat_template(chat_template, is_literal=True)) - detected_format = ("string" if jinja_text is None else - _detect_content_format(jinja_text, default="string")) + # The InternVL template has mixed content access patterns that fail with automatic detection. + # Set string format for proper operation if InternVL is used. + model_type = getattr(model_config.hf_config, 'model_type', '') + if model_type == 'internvl_chat' or 'internvl' in model_config.model.lower(): + detected_format = "string" + else: + detected_format = ("string" if jinja_text is None else + _detect_content_format(jinja_text, default="string")) return detected_format diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 92fe14de128f..e4b01fd86b5e 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -19,6 +19,19 @@ from PIL import Image from transformers import BatchEncoding, PretrainedConfig, TensorType +from habana_frameworks.mediapipe import fn +from habana_frameworks.mediapipe.mediapipe import MediaPipe +from habana_frameworks.mediapipe.media_types import dtype as dt +from habana_frameworks.mediapipe.media_types import imgtype as it +from habana_frameworks.mediapipe.media_types import readerOutType as ro +from habana_frameworks.mediapipe.operators.reader_nodes.reader_nodes import media_ext_reader_op_impl +from habana_frameworks.mediapipe.operators.reader_nodes.reader_nodes import media_ext_reader_op_tensor_info +from habana_frameworks.mediapipe.plugins.iterator_pytorch import MediaGenericPytorchIterator +import numpy as np +from queue import Queue +import io +import time + from vllm.config import VllmConfig from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig @@ -295,6 +308,244 @@ def video_to_pixel_values_internvl( pixel_values = torch.stack([transform(image) for image in frames_list]) return pixel_values +# Handle MediaPipe pipe_manager destructor +from habana_frameworks.mediapipe.backend.cal import pipe_manager, cpp_pipe_manager_list + +def _patched_close(self): + """Patched close method that handles None cpp_pipe_manager_list during shutdown""" + try: + # Check if cpp_pipe_manager_list exists and is not None + if cpp_pipe_manager_list is not None and self._pm_ in cpp_pipe_manager_list: + cpp_pipe_manager_list.remove(self._pm_) + except (TypeError, AttributeError): + # Handle case where cpp_pipe_manager_list is None or not iterable + pass + + # Clean up the pipe manager + if self._pm_ is not None: + self._pm_.close() + self._pm_ = None + +pipe_manager.close = _patched_close + +# Queue shared between external reader and mediapipe call +shared_q = Queue() + + +class MediaPytorchIterator(MediaGenericPytorchIterator): + def __init__(self, mediapipe): + super().__init__(mediapipe=mediapipe, device="hpu", fw_type="PYT_FW") + + +class external_reader(media_ext_reader_op_impl): + def __init__(self, params, fw_params): + self.batch_size = fw_params.batch_size + self.max_file = "" + self.num_batches = 1 
+ + def __iter__(self): + return self + + def __len__(self): + return self.num_batches + + def __next__(self): + img_list = shared_q.get() + for i in range(len(img_list)): + # NOTE: this padding is needed because of HW alignmnet requirment + img_list[i] = np.pad(img_list[i], + (0, 64 - len(img_list[i]) % 64), + 'constant') + return img_list + + def get_media_output_type(self): + return ro.BUFFER_LIST + + def get_largest_file(self): + return self.max_file + + def gen_output_info(self): + out_info = [] + o = media_ext_reader_op_tensor_info( + dt.NDT, np.array([self.batch_size], dtype=np.uint32), "") + out_info.append(o) + return out_info + + +class hpuMediaPipe(MediaPipe): + def __init__(self, device, queue_depth, batch_size, + num_threads, op_device, + img_height, img_width): + super( + hpuMediaPipe, + self).__init__( + device, + queue_depth, + batch_size, + num_threads, + self.__class__.__name__) + + mediapipe_seed = int(time.time_ns() % (2**31 - 1)) + + self.input = fn.MediaExtReaderOp(impl=external_reader, + num_outputs=1, + seed=mediapipe_seed, + device=op_device) + self.decode = fn.ImageDecoder( + device="hpu", output_format=it.RGB_I, resize=[img_width, img_height]) + + self.mean_node = fn.MediaConst( + data=np.array([127.5, 127.5, 127.5], dtype=dt.FLOAT32), + shape=[1, 1, 3], + dtype=dt.FLOAT32 + ) + self.std_node = fn.MediaConst( + data=np.array([1/127.5, 1/127.5, 1/127.5], dtype=dt.FLOAT32), + shape=[1, 1, 3], + dtype=dt.FLOAT32 + ) + + self.cmn = fn.CropMirrorNorm(crop_w=img_width, crop_h=img_height, dtype=dt.FLOAT32, device="hpu") + + self.transpose = fn.Transpose( + device="hpu", + tensorDim=4, + permutation=[1, 2, 0, 3] #NCHW + ) + + def definegraph(self): + images = self.input() + images = self.decode(images) + mean = self.mean_node() + std = self.std_node() + images = self.cmn(images, mean, std) + images = self.transpose(images) + + # Return the full processed image - we'll do tiling in Python + return images + +def get_image_info(data): + # Get image info using PIL without decoding + try: + with Image.open(io.BytesIO(data)) as img: + return { + 'format': img.format, + 'size': img.size, + 'mode': img.mode + } + except Exception as e: + raise ValueError(f"Input image bitstream is not in supported format: {str(e)}") + +def preprocess_images( + images, + target_ratios: list[tuple[int, int]], + patch_size=448, + use_thumbnail=False, +): + batch_size = 0 + queue_depth = 0 + num_threads = 1 + + # validate images and create batches + img_size = None + img_sizes = [] + batch_sizes = [] + for img in images: + img_info = get_image_info(img) + if img_info['format'] != 'JPEG' and img_info['mode'] != 'RGB': + raise ValueError(f"HPU media pipeline only supports JPEG images in RGB mode. 
Detected format={img_info['format']}, mode={img_info['mode']}") + if img_size==None: + img_size = img_info['size'] + else: + if img_info['size'] != img_size: + img_sizes.append(img_size) + batch_sizes.append(batch_size) + batch_size = 0 + img_size = img_info['size'] + batch_size += 1 + img_sizes.append(img_size) + batch_sizes.append(batch_size) + + thumbs = None + if use_thumbnail and len(images) > 0: + batch_size = len(images) + pipe = hpuMediaPipe("legacy", queue_depth, batch_size, + num_threads, "cpu", + patch_size, patch_size) + pipe.build() + data_loader = MediaPytorchIterator(pipe) + data_loader = iter(data_loader) + + img_list = np.empty(shape=[batch_size, ], dtype=object) + for i in range(batch_size): + img_list[i] = np.frombuffer(images[i], np.uint8) + + shared_q.put(img_list) + thumbs = next(data_loader)[0] + + shared_q.task_done() + pipe.close() + del pipe + + image_num_patches = torch.zeros(len(images), dtype=torch.int64) + + patches = [] + thumb_idx = 0 + image_num_patches_idx = 0 + for batch_size, img_size in zip(batch_sizes, img_sizes): + # calculate the number of blocks without thumbnail + blocks, target_width, target_height = calculate_internvl_targets( + orig_width=img_size[0], + orig_height=img_size[1], + target_ratios=target_ratios, + image_size=patch_size, + use_thumbnail=False, + ) + + num_patches = blocks + 1 if use_thumbnail and thumbs is not None and blocks > 1 else blocks + image_num_patches[image_num_patches_idx:image_num_patches_idx+batch_size] = num_patches + + pipe = hpuMediaPipe("legacy", queue_depth, batch_size, + num_threads, "cpu", + target_height, target_width) + pipe.build() + data_loader = MediaPytorchIterator(pipe) + data_loader = iter(data_loader) + + img_list = np.empty(shape=[batch_size, ], dtype=object) + for i in range(batch_size): + img_list[i] = np.frombuffer(images[i], np.uint8) + + shared_q.put(img_list) + processed_images = next(data_loader)[0] + + shared_q.task_done() + pipe.close() + del pipe + + # Extract tiles + tiles = [] + H, W = target_height, target_width + for h_idx in range(H // patch_size): + for w_idx in range(W // patch_size): + h_start = h_idx * patch_size + h_end = h_start + patch_size + w_start = w_idx * patch_size + w_end = w_start + patch_size + + tile = processed_images[:, :, h_start:h_end, w_start:w_end] + tiles.append(tile) + + for i in range(batch_size): + for t in tiles: + patches.append(t[i]) + if use_thumbnail and thumbs is not None and len(tiles) > 1: + patches.append(thumbs[thumb_idx]) + thumb_idx += 1 + + patches_flat = torch.stack(patches, dim=0) + return patches_flat, image_num_patches + class BaseInternVLProcessor(ABC): """ @@ -451,25 +702,58 @@ def _preprocess_image( if len(images) == 0: image_inputs = {} else: - pixel_values_lst = self._images_to_pixel_values_lst( - images, - min_dynamic_patch=min_dynamic_patch, - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, - ) - image_inputs: dict[str, NestedTensors] = { - "pixel_values_flat": - torch.cat(pixel_values_lst), - "image_num_patches": - torch.tensor([len(item) for item in pixel_values_lst]), - } + use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", "false").lower() in ("1", "true", "yes") + if use_mediapipe: + # Use HPU media pipeline for image preprocessing + min_num, max_num = self.resolve_min_max_num( + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=False, # Applied in image_to_pixel_values + ) - for pixel_values in pixel_values_lst: - 
num_patches = pixel_values.shape[0] - feature_size = num_patches * self.num_image_token + target_ratios = get_internvl_target_ratios(min_num, max_num) + + pixel_values_flat, image_num_patches = preprocess_images( + images, + target_ratios=target_ratios, + patch_size=self.image_size, + use_thumbnail=self.use_thumbnail, + ) + + image_inputs = { + "pixel_values_flat": pixel_values_flat, + "image_num_patches": image_num_patches, + } + + for i in range(len(images)): + num_patches = image_num_patches[i].item() + feature_size = num_patches * self.num_image_token + + image_repl = self.get_image_repl(feature_size, num_patches) + text = [t.replace('', image_repl.full, 1) for t in text] + + else: + pixel_values_lst = self._images_to_pixel_values_lst( + images, + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + image_inputs: dict[str, NestedTensors] = { + "pixel_values_flat": + torch.cat(pixel_values_lst), + "image_num_patches": + torch.tensor([len(item) for item in pixel_values_lst]), + } + + for pixel_values in pixel_values_lst: + num_patches = pixel_values.shape[0] + feature_size = num_patches * self.num_image_token + + image_repl = self.get_image_repl(feature_size, num_patches) + text = [t.replace('', image_repl.full, 1) for t in text] - image_repl = self.get_image_repl(feature_size, num_patches) - text = [t.replace('', image_repl.full, 1) for t in text] return text, image_inputs def _make_batch_input(self, @@ -483,7 +767,7 @@ def _make_batch_input(self, def __call__( self, text: Optional[Union[str, list[str]]] = None, - images: Optional[Union[Image.Image, list[Image.Image]]] = None, + images: Optional[Union[Image.Image, list[Image.Image], bytes, list[bytes]]] = None, min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, @@ -602,7 +886,7 @@ def _preprocess_video( def __call__( self, text: Optional[Union[str, list[str]]] = None, - images: Optional[Union[Image.Image, list[Image.Image]]] = None, + images: Optional[Union[Image.Image, list[Image.Image], bytes, list[bytes]]] = None, videos: Optional[Union[npt.NDArray, list[npt.NDArray]]] = None, min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index e673632d4366..152d3ee98c01 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -57,6 +57,9 @@ def load_bytes(self, data: bytes) -> Image.Image: def load_base64(self, media_type: str, data: str) -> Image.Image: return self.load_bytes(base64.b64decode(data)) + def load_base64_bytes(self, media_type: str, data: str) -> bytes: + return base64.b64decode(data, validate=True) + def load_file(self, filepath: Path) -> Image.Image: image = Image.open(filepath) image.load() diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index cae62b2235e4..693a9a230fb3 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -7,6 +7,7 @@ from typing import (TYPE_CHECKING, Any, Generic, Literal, NamedTuple, Optional, TypeVar, Union) +import io import numpy as np import torch from typing_extensions import TypeAlias, TypeGuard, assert_never @@ -209,6 +210,9 @@ def get_image_size(self, item_idx: int) -> ImageSize: if isinstance(image, PILImage.Image): return ImageSize(*image.size) + if isinstance(image, (bytes, bytearray)): + with PILImage.open(io.BytesIO(image)) as img: + return ImageSize(*img.size) if isinstance(image, (np.ndarray, torch.Tensor)): 
_, h, w = image.shape return ImageSize(w, h) @@ -407,6 +411,7 @@ def _parse_image_data( return ImageEmbeddingItems(data) if (isinstance(data, PILImage.Image) + or isinstance(data, (bytes, bytearray)) or isinstance(data, (np.ndarray, torch.Tensor)) and data.ndim == 3): data_items = [data] diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 11a25f851546..2a5d4a0c2098 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union from urllib.parse import ParseResult, urlparse +import os import numpy as np import numpy.typing as npt import torch @@ -66,6 +67,7 @@ def _load_data_url( self, url_spec: ParseResult, media_io: MediaIO[_M], + load_type: str = "PIL", ) -> _M: data_spec, data = url_spec.path.split(",", 1) media_type, data_type = data_spec.split(";", 1) @@ -74,7 +76,10 @@ def _load_data_url( msg = "Only base64 data URLs are supported for now." raise NotImplementedError(msg) - return media_io.load_base64(media_type, data) + if load_type == "bytes": + return media_io.load_base64_bytes(media_type, data) + else: + return media_io.load_base64(media_type, data) def _load_file_url( self, @@ -100,6 +105,7 @@ def load_from_url( media_io: MediaIO[_M], *, fetch_timeout: Optional[int] = None, + load_type: str = "bytes", ) -> _M: url_spec = urlparse(url) @@ -107,13 +113,24 @@ def load_from_url( connection = self.connection data = connection.get_bytes(url, timeout=fetch_timeout) - return media_io.load_bytes(data) + if load_type == "bytes": + msg = "Only data URLs are currently supported for raw bytes loading." + raise NotImplementedError(msg) + else: + connection = self.connection + data = connection.get_bytes(url, timeout=fetch_timeout) + + return media_io.load_bytes(data) if url_spec.scheme == "data": - return self._load_data_url(url_spec, media_io) + return self._load_data_url(url_spec, media_io, load_type) if url_spec.scheme == "file": - return self._load_file_url(url_spec, media_io) + if load_type == "bytes": + msg = "Only data URLs are currently supported for raw bytes loading." + raise NotImplementedError(msg) + else: + return self._load_file_url(url_spec, media_io) msg = "The URL must be either a HTTP, data or file URL." raise ValueError(msg) @@ -124,20 +141,29 @@ async def load_from_url_async( media_io: MediaIO[_M], *, fetch_timeout: Optional[int] = None, + load_type: str = "bytes", ) -> _M: url_spec = urlparse(url) if url_spec.scheme.startswith("http"): - connection = self.connection - data = await connection.async_get_bytes(url, timeout=fetch_timeout) + if load_type == "bytes": + msg = "Only data URLs are currently supported for raw bytes loading." + raise NotImplementedError(msg) + else: + connection = self.connection + data = await connection.async_get_bytes(url, timeout=fetch_timeout) - return media_io.load_bytes(data) + return media_io.load_bytes(data) if url_spec.scheme == "data": - return self._load_data_url(url_spec, media_io) + return self._load_data_url(url_spec, media_io, load_type) if url_spec.scheme == "file": - return self._load_file_url(url_spec, media_io) + if load_type == "bytes": + msg = "Only data URLs are currently supported for raw bytes loading." + raise NotImplementedError(msg) + else: + return self._load_file_url(url_spec, media_io) msg = "The URL must be either a HTTP, data or file URL." raise ValueError(msg) @@ -179,17 +205,20 @@ def fetch_image( image_mode: str = "RGB", ) -> Image.Image: """ - Load a PIL image from a HTTP or base64 data URL. 
+ Load a PIL image (or raw bytes) from a HTTP or base64 data URL. By default, the image is converted into RGB format. """ image_io = ImageMediaIO(image_mode=image_mode) + use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", "false").lower() in ("1", "true", "yes") + load_type = "bytes" if use_mediapipe else "PIL" try: return self.load_from_url( image_url, image_io, fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, + load_type=load_type, ) except UnidentifiedImageError as e: # convert to ValueError to be properly caught upstream @@ -202,17 +231,20 @@ async def fetch_image_async( image_mode: str = "RGB", ) -> Image.Image: """ - Asynchronously load a PIL image from a HTTP or base64 data URL. + Asynchronously load a PIL image (or raw bytes) from a HTTP or base64 data URL. By default, the image is converted into RGB format. """ image_io = ImageMediaIO(image_mode=image_mode) + use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", "false").lower() in ("1", "true", "yes") + load_type = "bytes" if use_mediapipe else "PIL" try: return await self.load_from_url_async( image_url, image_io, fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, + load_type=load_type, ) except UnidentifiedImageError as e: # convert to ValueError to be properly caught upstream diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index e514469a942c..012b38d45f47 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1886,8 +1886,16 @@ def _prepare_prompt( multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) if image_index_tensor is not None: multi_modal_kwargs['image_index'] = image_index_tensor - multi_modal_kwargs = MultiModalKwargs.as_kwargs(multi_modal_kwargs, - device=self.device) + + use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", "false").lower() in ("1", "true", "yes") + if use_mediapipe: + # With mediapipe path some tensors will already be on HPU, we only move to HPU if needed + for key in multi_modal_kwargs.keys(): + if hasattr(multi_modal_kwargs[key], "device") and multi_modal_kwargs[key].device != self.device: + multi_modal_kwargs[key] = self.move_to_device(multi_modal_kwargs[key]) + else: + multi_modal_kwargs = MultiModalKwargs.as_kwargs(multi_modal_kwargs, + device=self.device) return PreparePromptMetadata(input_tokens=input_tokens_tensor, input_positions=input_positions, From 65abdfb8f8d154096af9cea4204fb951058d7d15 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Wed, 24 Sep 2025 00:40:40 +0000 Subject: [PATCH 07/20] Call compute_input_embeddings only for prompt to save decode time --- vllm/model_executor/models/internvl.py | 9 +++++---- vllm/worker/hpu_model_runner.py | 6 ++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index e4b01fd86b5e..ea723e87340b 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -1728,10 +1728,11 @@ def forward( # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. 
elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) - input_ids = None + if not is_hpu: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None forward_kwargs = { "input_ids": input_ids, diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 012b38d45f47..ac6f1eed8098 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -739,7 +739,8 @@ def forward(self, *args, **kwargs): if self._rotary_prepare_cos_sin is not None and not self.model_is_mrope: self._rotary_prepare_cos_sin( kwargs['positions'], recompute_cos_sin=self.recompute_cos_sin) - if self.model_is_mrope or self.is_mm_optimized: + if self.model_is_mrope or (self.is_mm_optimized + and kwargs['attn_metadata'].is_prompt): # inputs_embeds was computed on execute_model # now we always want to use the inputs_embeds # even if the prompt is text only @@ -3901,7 +3902,8 @@ def try_revert_dummy_output_tokens(): model_input.multi_modal_kwargs.pop('image_index', None) if not bypass_model_exec: - if self.model_is_mrope or self.is_mm_optimized: + if self.model_is_mrope or (self.is_mm_optimized + and is_prompt): if ('pixel_values') in execute_model_kwargs and \ self.is_mm_optimized: if warmup_mode and not is_pt_profiler_run: From b38c808ef9c0c147b7d9a002e7fb4d2bf6a546c5 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Wed, 24 Sep 2025 01:44:28 +0000 Subject: [PATCH 08/20] Merge PR1974 intervl:cache prompt_tokens for sampling metadata in sampling if do penalties, the prompt_tokens regenerates for each decode, that takes time. instead we can use cache it, and reset when requests set changes --- vllm/model_executor/layers/sampler.py | 15 +++++++-- vllm/model_executor/sampling_metadata.py | 42 ++++++++++++++++-------- vllm/worker/hpu_model_runner.py | 6 ++++ 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 5d18d3b59af4..cc7872f69bc0 100755 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -197,6 +197,9 @@ def __init__(self): # speculative decoding and when prompt embeddings are specified. 
self.include_gpu_probs_tensor = False self.should_modify_greedy_probs_inplace = False + # Add HPU cache class variables + self._prompt_tokens_hpu_cache: Optional[torch.Tensor] = None + self._cached_seq_ids: Optional[set] = None def _init_sampling_tensors( self, @@ -216,8 +219,10 @@ def _init_sampling_tensors( # Initialize new sampling tensors (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p, - top_k_scalar, top_p_scalar) = SamplingTensors.from_sampling_metadata( - sampling_metadata, vocab_size, logits.device, logits.dtype) + top_k_scalar, top_p_scalar, current_seq_ids) = \ + SamplingTensors.from_sampling_metadata( + sampling_metadata, vocab_size, logits.device, logits.dtype, \ + self._prompt_tokens_hpu_cache, self._cached_seq_ids) self._sampling_tensors = sampling_tensors self._do_penalties = do_penalties @@ -227,6 +232,12 @@ def _init_sampling_tensors( self._top_p_scalar = top_p_scalar self._apply_top_k_top_p_opt = ApplyToppTopkScalar(5) + # Check if batch composition changed - if so, invalidate prompt cache + + # After tensors are created, update cache + if self._cached_seq_ids != current_seq_ids: + self._prompt_tokens_hpu_cache = None + self._cached_seq_ids = current_seq_ids def forward( self, diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 5e40449d6552..08244fe0af28 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -16,6 +16,8 @@ make_tensor_with_pad_align) _SAMPLING_EPS = 1e-5 +pin_memory = is_pin_memory_available() +is_hpu = current_platform.is_hpu() @dataclass @@ -286,7 +288,7 @@ def _prepare_seq_groups( if seq_group_metadata.is_prompt: if sampling_params.seed is not None: - if current_platform.is_hpu(): + if is_hpu: import habana_frameworks.torch.hpu.random as htrandom generator = \ htrandom.default_generators[ @@ -420,8 +422,10 @@ def from_sampling_metadata( vocab_size: int, device: torch.device, dtype: torch.dtype, + prompt_tokens_cache: torch.tensor, + past_seq_ids: set, ) -> tuple["SamplingTensors", bool, bool, bool, Optional[int], - Optional[float]]: + Optional[float], Optional[torch.tensor]]: prompt_tokens: list[array] = [] output_tokens: list[array] = [] top_ks: list[int] = [] @@ -434,6 +438,7 @@ def from_sampling_metadata( do_penalties = False do_top_p_top_k = False do_min_p = False + current_seq_ids = set() assert sampling_metadata.seq_groups is not None for seq_group in sampling_metadata.seq_groups: @@ -508,6 +513,9 @@ def from_sampling_metadata( seq_data = seq_group.seq_data[seq_id] prompt_tokens.append(seq_data.prompt_token_ids_array) output_tokens.append(seq_data.output_token_ids_array) + current_seq_ids.update(seq_ids) + if current_seq_ids != past_seq_ids: + prompt_tokens_cache = None top_k_scalar = top_ks[0] if do_top_p_top_k and all( k == top_ks[0] for k in top_ks) else None @@ -527,9 +535,10 @@ def from_sampling_metadata( vocab_size, device, dtype, + prompt_tokens_cache, ) return (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p, - top_k_scalar, top_p_scalar) + top_k_scalar, top_p_scalar, current_seq_ids) @classmethod def from_lists( @@ -546,23 +555,28 @@ def from_lists( vocab_size: int, device: torch.device, dtype: torch.dtype, + prompt_tokens_cache: torch.tensor, ) -> "SamplingTensors": # Note that the performance will be very bad without # pinned memory. 
- pin_memory = is_pin_memory_available() do_penalties = prompt_tokens or output_tokens if do_penalties: - if current_platform.is_hpu(): - prompt_t = make_tensor_with_pad_align( - prompt_tokens, - vocab_size, - device="cpu", - dtype=torch.int64, - pin_memory=pin_memory, - max_len_align=1024, - ) + if is_hpu: + if (prompt_tokens_cache is not None and + prompt_tokens_cache.device == device): + # Reuse cached prompt_tokens already on HPU + prompt_t = prompt_tokens_cache + else: + prompt_t = make_tensor_with_pad_align( + prompt_tokens, + vocab_size, + device="cpu", + dtype=torch.int64, + pin_memory=pin_memory, + max_len_align=1024, + ) output_t = make_tensor_with_pad_align( output_tokens, vocab_size, @@ -647,6 +661,6 @@ def from_lists( non_blocking=True), repetition_penalties=repetition_penalties_t.to(device=device, non_blocking=True), - prompt_tokens=prompt_t.to(device=device, non_blocking=True), + prompt_tokens=prompt_t.to(device=device, non_blocking=True) if prompt_t.device != device else prompt_t, output_tokens=output_t.to(device=device, non_blocking=True), ) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index ac6f1eed8098..dc4ce30cd912 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -4023,6 +4023,12 @@ def try_revert_dummy_output_tokens(): self.cached_step_inputs.append(model_input) if self.do_mark_step: htorch.core.mark_step() + if hasattr(self.model.sampler, '_sampling_tensors') and \ + self.model.sampler._sampling_tensors is not None: + sampling_tensors = self.model.sampler._sampling_tensors + if sampling_tensors.prompt_tokens.numel() > 0: + # Cache the prompt_tokens tensor that's already on HPU + self.model.sampler._prompt_tokens_hpu_cache = sampling_tensors.prompt_tokens if use_delayed_sampling \ and model_input.async_callback is not None: model_input.async_callback() From e684eb59cdfe3862b61d85f9eec653e90c7517e6 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Wed, 24 Sep 2025 16:31:23 +0000 Subject: [PATCH 09/20] Add check to only for do_penalities --- vllm/worker/hpu_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index dc4ce30cd912..c602d25055ec 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -4024,7 +4024,8 @@ def try_revert_dummy_output_tokens(): if self.do_mark_step: htorch.core.mark_step() if hasattr(self.model.sampler, '_sampling_tensors') and \ - self.model.sampler._sampling_tensors is not None: + self.model.sampler._sampling_tensors is not None and \ + self.model.sampler._do_penalties: sampling_tensors = self.model.sampler._sampling_tensors if sampling_tensors.prompt_tokens.numel() > 0: # Cache the prompt_tokens tensor that's already on HPU From 92e8db3583260a5f1801c2878ae9b2ba162ffca9 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Wed, 24 Sep 2025 21:35:03 +0000 Subject: [PATCH 10/20] Fix for merge_multimodal_embeddedings() crash --- vllm/model_executor/models/utils.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 822113b52a8a..0157010b2c1e 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -553,8 +553,18 @@ def merge_multimodal_embeddings( This updates ``inputs_embeds`` in place. 
""" if isinstance(placeholder_token_id, list): - placeholder_token_id = torch.tensor(placeholder_token_id, - device=input_ids.device) + flat_ids = [] + for x in placeholder_token_id: + if torch.is_tensor(x): + flat_ids.extend(int(v) for v in x.reshape(-1).tolist()) + elif isinstance(x, (list, tuple)): + flat_ids.extend(int(v) for v in x) + else: + flat_ids.append(int(x)) + + placeholder_token_id = torch.as_tensor( + flat_ids, device=input_ids.device, dtype=input_ids.dtype + ) return _merge_multimodal_embeddings( inputs_embeds, torch.isin(input_ids, placeholder_token_id), From 22128e5a1053371f9956fa95c766f5652411cc84 Mon Sep 17 00:00:00 2001 From: Seunghyuk Park Date: Fri, 26 Sep 2025 18:55:59 +0300 Subject: [PATCH 11/20] Add mediapipe changes more --- vllm/model_executor/models/internvl.py | 190 ++++++++++++++++--------- 1 file changed, 122 insertions(+), 68 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index ea723e87340b..2717c07b2de9 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -15,6 +15,7 @@ import numpy.typing as npt import torch import torch.nn as nn +import torch.nn.functional as F import torchvision.transforms as T from PIL import Image from transformers import BatchEncoding, PretrainedConfig, TensorType @@ -31,6 +32,8 @@ from queue import Queue import io import time +import atexit +from dataclasses import dataclass from vllm.config import VllmConfig from vllm.model_executor.layers.quantization import QuantizationConfig @@ -353,9 +356,12 @@ def __next__(self): img_list = shared_q.get() for i in range(len(img_list)): # NOTE: this padding is needed because of HW alignmnet requirment - img_list[i] = np.pad(img_list[i], - (0, 64 - len(img_list[i]) % 64), - 'constant') + rem = len(img_list[i]) % 64 + pad = (64 - rem) % 64 + if pad: + img_list[i] = np.pad(img_list[i], + (0, pad), + 'constant') return img_list def get_media_output_type(self): @@ -395,15 +401,16 @@ def __init__(self, device, queue_depth, batch_size, device="hpu", output_format=it.RGB_I, resize=[img_width, img_height]) self.mean_node = fn.MediaConst( - data=np.array([127.5, 127.5, 127.5], dtype=dt.FLOAT32), - shape=[1, 1, 3], - dtype=dt.FLOAT32 - ) + data=np.array([IMAGENET_MEAN[0]*255.0, + IMAGENET_MEAN[1]*255.0, + IMAGENET_MEAN[2]*255.0], dtype=dt.FLOAT32), + shape=[1, 1, 3], dtype=dt.FLOAT32) + self.std_node = fn.MediaConst( - data=np.array([1/127.5, 1/127.5, 1/127.5], dtype=dt.FLOAT32), - shape=[1, 1, 3], - dtype=dt.FLOAT32 - ) + data=np.array([1.0/(IMAGENET_STD[0]*255.0), + 1.0/(IMAGENET_STD[1]*255.0), + 1.0/(IMAGENET_STD[2]*255.0)], dtype=dt.FLOAT32), + shape=[1, 1, 3], dtype=dt.FLOAT32) self.cmn = fn.CropMirrorNorm(crop_w=img_width, crop_h=img_height, dtype=dt.FLOAT32, device="hpu") @@ -424,6 +431,61 @@ def definegraph(self): # Return the full processed image - we'll do tiling in Python return images + +# ----------------------------------------------------------------------------- +# MediaPipe manager (persist pipes/iterators) +# ----------------------------------------------------------------------------- +@dataclass +class _PipeState: + pipe: hpuMediaPipe | None = None + it: MediaGenericPytorchIterator | None = None + bsz: int | None = None + H: int | None = None + W: int | None = None + + +class MediaPipeTiler: + """Owns and reuses MediaPipe pipes/iterators for main path""" + def __init__(self) -> None: + self._main = _PipeState() + + def _rebuild(self, st: _PipeState, *, bsz: int, H: int, W: int) -> 
None: + if st.pipe is not None: + try: + st.pipe.close() + except Exception: + pass + pipe = hpuMediaPipe("legacy", 0, bsz, 1, "cpu", H, W) + pipe.build() + st.pipe, st.it, st.bsz, st.H, st.W = pipe, iter(MediaPytorchIterator(pipe)), bsz, H, W + + def ensure_main(self, *, bsz: int, H: int, W: int) -> tuple[hpuMediaPipe, MediaGenericPytorchIterator]: + st = self._main + if st.pipe is None or st.bsz != bsz or st.H != H or st.W != W: + self._rebuild(st, bsz=bsz, H=H, W=W) + return st.pipe, st.it # type: ignore[return-value] + + def reset_iter(self) -> None: + st = self._main + if st.pipe is not None: + st.it = iter(MediaPytorchIterator(st.pipe)) + + def close_all(self) -> None: + st = self._main + try: + if st.pipe is not None: + st.pipe.close() + except Exception: + pass + finally: + st.pipe = None + st.it = None + st.bsz = st.H = st.W = None + + +_MP = MediaPipeTiler() +atexit.register(_MP.close_all) + def get_image_info(data): # Get image info using PIL without decoding try: @@ -466,31 +528,9 @@ def preprocess_images( img_sizes.append(img_size) batch_sizes.append(batch_size) - thumbs = None - if use_thumbnail and len(images) > 0: - batch_size = len(images) - pipe = hpuMediaPipe("legacy", queue_depth, batch_size, - num_threads, "cpu", - patch_size, patch_size) - pipe.build() - data_loader = MediaPytorchIterator(pipe) - data_loader = iter(data_loader) - - img_list = np.empty(shape=[batch_size, ], dtype=object) - for i in range(batch_size): - img_list[i] = np.frombuffer(images[i], np.uint8) - - shared_q.put(img_list) - thumbs = next(data_loader)[0] - - shared_q.task_done() - pipe.close() - del pipe - image_num_patches = torch.zeros(len(images), dtype=torch.int64) - patches = [] - thumb_idx = 0 + batch_patches = [] image_num_patches_idx = 0 for batch_size, img_size in zip(batch_sizes, img_sizes): # calculate the number of blocks without thumbnail @@ -502,48 +542,62 @@ def preprocess_images( use_thumbnail=False, ) - num_patches = blocks + 1 if use_thumbnail and thumbs is not None and blocks > 1 else blocks - image_num_patches[image_num_patches_idx:image_num_patches_idx+batch_size] = num_patches - - pipe = hpuMediaPipe("legacy", queue_depth, batch_size, - num_threads, "cpu", - target_height, target_width) - pipe.build() - data_loader = MediaPytorchIterator(pipe) - data_loader = iter(data_loader) + # if batch, H, W is changed, create new one + main_pipe, main_iter = _MP.ensure_main(bsz=batch_size, H=target_height, W=target_width) img_list = np.empty(shape=[batch_size, ], dtype=object) for i in range(batch_size): img_list[i] = np.frombuffer(images[i], np.uint8) shared_q.put(img_list) - processed_images = next(data_loader)[0] - - shared_q.task_done() - pipe.close() - del pipe - - # Extract tiles - tiles = [] - H, W = target_height, target_width - for h_idx in range(H // patch_size): - for w_idx in range(W // patch_size): - h_start = h_idx * patch_size - h_end = h_start + patch_size - w_start = w_idx * patch_size - w_end = w_start + patch_size - - tile = processed_images[:, :, h_start:h_end, w_start:w_end] - tiles.append(tile) + try: + processed_images = next(main_iter)[0] + except StopIteration: + _MP.reset_iter() + _, main_iter = _MP.ensure_main(bsz=batch_size, H=target_height, W=target_width) + processed_images = next(main_iter)[0] + finally: + shared_q.task_done() + + # tiling vectorization: [N,C,H,W] -> [N,Ty,Tx,C,ps,ps] -> [N, T, C, ps, ps] + N, C, H, W = processed_images.shape + Ty, Tx = H // patch_size, W // patch_size + T = Ty * Tx + + use_thumb_now = use_thumbnail and (T > 1) + + x = 
processed_images.view(N, C, Ty, patch_size, Tx, patch_size) \ + .permute(0, 2, 4, 1, 3, 5) \ + .contiguous() \ + .view(N, T, C, patch_size, patch_size) # [N,T,C,ps,ps] + + if use_thumb_now: + # [N,3,ps,ps] + thumbs_batch = F.interpolate(processed_images, size=(patch_size, patch_size), + mode="bilinear", align_corners=False) + + num_patches = T + 1 if use_thumb_now else T + image_num_patches[image_num_patches_idx:image_num_patches_idx+batch_size] = num_patches + image_num_patches_idx += batch_size + + # consist tensor batch based (tile + thumbnail) + if use_thumb_now: + out = torch.empty((N, T + 1, C, patch_size, patch_size), + dtype=processed_images.dtype, device=processed_images.device) + out[:, :T] = x + out[:, T] = thumbs_batch + out = out.view(N * (T + 1), C, patch_size, patch_size) # [N*(T+1),C,ps,ps] + else: + # no thumbnail (T==1 or off) + out = x.view(N * T, C, patch_size, patch_size) # [N*T,C,ps,ps] - for i in range(batch_size): - for t in tiles: - patches.append(t[i]) - if use_thumbnail and thumbs is not None and len(tiles) > 1: - patches.append(thumbs[thumb_idx]) - thumb_idx += 1 + batch_patches.append(out) - patches_flat = torch.stack(patches, dim=0) + patches_flat = ( + torch.cat(batch_patches, dim=0) + if batch_patches + else torch.empty((0, 3, patch_size, patch_size), dtype=torch.float32) + ) return patches_flat, image_num_patches From 602d2d29d04e02ac4c2e1736ee14d8c959481cec Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Mon, 29 Sep 2025 13:07:13 -0700 Subject: [PATCH 12/20] Libint/add samplemetatensorcache3 (#1991) Co-authored-by: Libin Tang --- vllm/model_executor/layers/sampler.py | 4 ++- vllm/model_executor/sampling_metadata.py | 35 ++++++++++++++++-------- vllm/worker/hpu_model_runner.py | 10 +++++-- 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index cc7872f69bc0..5c82e31e7c4e 100755 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -199,6 +199,7 @@ def __init__(self): self.should_modify_greedy_probs_inplace = False # Add HPU cache class variables self._prompt_tokens_hpu_cache: Optional[torch.Tensor] = None + self._output_tokens_hpu_cache: Optional[torch.Tensor] = None self._cached_seq_ids: Optional[set] = None def _init_sampling_tensors( @@ -222,7 +223,7 @@ def _init_sampling_tensors( top_k_scalar, top_p_scalar, current_seq_ids) = \ SamplingTensors.from_sampling_metadata( sampling_metadata, vocab_size, logits.device, logits.dtype, \ - self._prompt_tokens_hpu_cache, self._cached_seq_ids) + self._prompt_tokens_hpu_cache, self._output_tokens_hpu_cache, self._cached_seq_ids) self._sampling_tensors = sampling_tensors self._do_penalties = do_penalties @@ -237,6 +238,7 @@ def _init_sampling_tensors( # After tensors are created, update cache if self._cached_seq_ids != current_seq_ids: self._prompt_tokens_hpu_cache = None + self._output_tokens_hpu_cache = None self._cached_seq_ids = current_seq_ids def forward( diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 08244fe0af28..ce3866c81270 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from typing import Optional -import torch +import torch,time from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams, SamplingType @@ -423,6 +423,7 @@ def from_sampling_metadata( device: torch.device, dtype: torch.dtype, 
prompt_tokens_cache: torch.tensor, + output_tokens_cache: torch.tensor, past_seq_ids: set, ) -> tuple["SamplingTensors", bool, bool, bool, Optional[int], Optional[float], Optional[torch.tensor]]: @@ -516,7 +517,7 @@ def from_sampling_metadata( current_seq_ids.update(seq_ids) if current_seq_ids != past_seq_ids: prompt_tokens_cache = None - + output_tokens_cache = None top_k_scalar = top_ks[0] if do_top_p_top_k and all( k == top_ks[0] for k in top_ks) else None top_p_scalar = top_ps[0] if do_top_p_top_k and all( @@ -536,6 +537,7 @@ def from_sampling_metadata( device, dtype, prompt_tokens_cache, + output_tokens_cache, ) return (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p, top_k_scalar, top_p_scalar, current_seq_ids) @@ -556,6 +558,7 @@ def from_lists( device: torch.device, dtype: torch.dtype, prompt_tokens_cache: torch.tensor, + output_tokens_cache: torch.tensor, ) -> "SamplingTensors": # Note that the performance will be very bad without # pinned memory. @@ -568,6 +571,14 @@ def from_lists( prompt_tokens_cache.device == device): # Reuse cached prompt_tokens already on HPU prompt_t = prompt_tokens_cache + # Get the last element from each list + last_elements = [out[-1] for out in output_tokens] + lengths = [len(out)-1 for out in output_tokens] + indices = torch.tensor(lengths, device=device) + rows = torch.arange(output_tokens_cache.shape[0], device=device) + # Convert to a PyTorch tensor with shape [4, 1] + last_elements_t = torch.tensor(last_elements).unsqueeze(1).to(output_tokens_cache.device) + output_t = output_tokens_cache.index_put_((rows, indices), last_elements_t) else: prompt_t = make_tensor_with_pad_align( prompt_tokens, @@ -577,14 +588,14 @@ def from_lists( pin_memory=pin_memory, max_len_align=1024, ) - output_t = make_tensor_with_pad_align( - output_tokens, - vocab_size, - device="cpu", - dtype=torch.int64, - pin_memory=pin_memory, - max_len_align=1024, - ) + output_t = make_tensor_with_pad_align( + output_tokens, + vocab_size, + device="cpu", + dtype=torch.int64, + pin_memory=pin_memory, + max_len_align=1024, + ) else: prompt_t = make_tensor_with_pad( prompt_tokens, @@ -649,7 +660,7 @@ def from_lists( ) # Because the memory is pinned, we can do non-blocking # transfer to device. 
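The incremental update above avoids re-padding every output list on each decode step: each sequence grows by exactly one token, so only that newest token needs to be written into the cached padded tensor, at column len(out) - 1 of its row. A small self-contained illustration of the same write follows; the tensors and names here are examples, not the vLLM helpers.

    # Illustration: update a cached padded output-token tensor in place by
    # writing only the latest token of each sequence.
    import torch

    output_tokens = [[11, 12, 13], [21, 22]]    # per-sequence token histories
    cached = torch.tensor([[11, 12, 0, 0],      # padded tensor built earlier,
                           [21, 0, 0, 0]])      # one decode step behind

    rows = torch.arange(len(output_tokens))
    cols = torch.tensor([len(seq) - 1 for seq in output_tokens])
    latest = torch.tensor([seq[-1] for seq in output_tokens])

    cached.index_put_((rows, cols), latest)     # single scatter, no re-padding
    assert cached.tolist() == [[11, 12, 13, 0], [21, 22, 0, 0]]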
- + output_t=output_t.to(device=device, non_blocking=True) if output_t.device != device else output_t return cls( temperatures=temperatures_t.to(device=device, non_blocking=True), top_ps=top_ps_t.to(device=device, non_blocking=True), @@ -662,5 +673,5 @@ def from_lists( repetition_penalties=repetition_penalties_t.to(device=device, non_blocking=True), prompt_tokens=prompt_t.to(device=device, non_blocking=True) if prompt_t.device != device else prompt_t, - output_tokens=output_t.to(device=device, non_blocking=True), + output_tokens=output_t ) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index c602d25055ec..4c09ac5b1188 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1888,12 +1888,15 @@ def _prepare_prompt( if image_index_tensor is not None: multi_modal_kwargs['image_index'] = image_index_tensor - use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", "false").lower() in ("1", "true", "yes") + use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", + "false").lower() in ("1", "true", "yes") if use_mediapipe: # With mediapipe path some tensors will already be on HPU, we only move to HPU if needed for key in multi_modal_kwargs.keys(): - if hasattr(multi_modal_kwargs[key], "device") and multi_modal_kwargs[key].device != self.device: - multi_modal_kwargs[key] = self.move_to_device(multi_modal_kwargs[key]) + if hasattr(multi_modal_kwargs[key], "device" + ) and multi_modal_kwargs[key].device != self.device: + multi_modal_kwargs[key] = self.move_to_device( + multi_modal_kwargs[key]) else: multi_modal_kwargs = MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device) @@ -4030,6 +4033,7 @@ def try_revert_dummy_output_tokens(): if sampling_tensors.prompt_tokens.numel() > 0: # Cache the prompt_tokens tensor that's already on HPU self.model.sampler._prompt_tokens_hpu_cache = sampling_tensors.prompt_tokens + self.model.sampler._output_tokens_hpu_cache = sampling_tensors.output_tokens if use_delayed_sampling \ and model_input.async_callback is not None: model_input.async_callback() From 8e88b004d72b06116001f3c7d3e543e726692655 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Tue, 30 Sep 2025 08:49:44 -0700 Subject: [PATCH 13/20] add fix for output_token length check --- vllm/model_executor/sampling_metadata.py | 25 +++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index ce3866c81270..16c9292f6bec 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from typing import Optional -import torch,time +import torch from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams, SamplingType @@ -571,14 +571,6 @@ def from_lists( prompt_tokens_cache.device == device): # Reuse cached prompt_tokens already on HPU prompt_t = prompt_tokens_cache - # Get the last element from each list - last_elements = [out[-1] for out in output_tokens] - lengths = [len(out)-1 for out in output_tokens] - indices = torch.tensor(lengths, device=device) - rows = torch.arange(output_tokens_cache.shape[0], device=device) - # Convert to a PyTorch tensor with shape [4, 1] - last_elements_t = torch.tensor(last_elements).unsqueeze(1).to(output_tokens_cache.device) - output_t = output_tokens_cache.index_put_((rows, indices), last_elements_t) else: prompt_t = make_tensor_with_pad_align( prompt_tokens, @@ -588,6 +580,18 @@ def 
from_lists( pin_memory=pin_memory, max_len_align=1024, ) + if (output_tokens_cache is not None and + output_tokens_cache.device == device and + len(output_tokens) > 0 and len(output_tokens_cache[0]) > 0): + # Get the last element from each list + last_elements = [out[-1] for out in output_tokens] + lengths = [len(out)-1 for out in output_tokens] + indices = torch.tensor(lengths, device=device) + rows = torch.arange(output_tokens_cache.shape[0], device=device) + # Convert to a PyTorch tensor with shape [4, 1] + last_elements_t = torch.tensor(last_elements).unsqueeze(1).to(output_tokens_cache.device) + output_t = output_tokens_cache.index_put_((rows, indices), last_elements_t) + else: output_t = make_tensor_with_pad_align( output_tokens, vocab_size, @@ -660,7 +664,6 @@ def from_lists( ) # Because the memory is pinned, we can do non-blocking # transfer to device. - output_t=output_t.to(device=device, non_blocking=True) if output_t.device != device else output_t return cls( temperatures=temperatures_t.to(device=device, non_blocking=True), top_ps=top_ps_t.to(device=device, non_blocking=True), @@ -673,5 +676,5 @@ def from_lists( repetition_penalties=repetition_penalties_t.to(device=device, non_blocking=True), prompt_tokens=prompt_t.to(device=device, non_blocking=True) if prompt_t.device != device else prompt_t, - output_tokens=output_t + output_tokens=output_t.to(device=device, non_blocking=True) if output_t.device != device else output_t ) From eed4b4b68abd78d8013e9057bb357ca05b3f39c0 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Fri, 3 Oct 2025 10:31:40 -0700 Subject: [PATCH 14/20] Small fixes for internvl (cherry-pick 8751709) --- vllm/model_executor/models/internvl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 2717c07b2de9..bb9c574eb5cc 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -514,7 +514,7 @@ def preprocess_images( batch_sizes = [] for img in images: img_info = get_image_info(img) - if img_info['format'] != 'JPEG' and img_info['mode'] != 'RGB': + if not (img_info['format'] == 'JPEG' and img_info['mode'] == 'RGB'): raise ValueError(f"HPU media pipeline only supports JPEG images in RGB mode. 
Detected format={img_info['format']}, mode={img_info['mode']}") if img_size==None: img_size = img_info['size'] @@ -1605,7 +1605,7 @@ def _parse_and_validate_video_input( self, **kwargs: object) -> Optional[InternVLVideoPixelInputs]: pixel_values_flat_video = kwargs.pop("pixel_values_flat_video", None) video_num_patches = kwargs.pop("video_num_patches", None) - video_embeds = kwargs.pop("image_embeds", None) + video_embeds = kwargs.pop("video_embeds", None) if pixel_values_flat_video is None and video_embeds is None: return None From aab0a37d1c7e3fc696e37cbc5e87a43dde382875 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Fri, 3 Oct 2025 13:08:58 -0700 Subject: [PATCH 15/20] Fix pre-commit --- vllm/entrypoints/chat_utils.py | 6 +- vllm/model_executor/layers/sampler.py | 3 +- vllm/model_executor/models/internvl.py | 217 +++++++++++++---------- vllm/model_executor/models/utils.py | 6 +- vllm/model_executor/sampling_metadata.py | 29 +-- vllm/multimodal/parse.py | 5 +- vllm/multimodal/utils.py | 30 +++- vllm/worker/hpu_model_runner.py | 11 +- 8 files changed, 179 insertions(+), 128 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 9a0525c7933a..7d549bb58558 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -405,10 +405,12 @@ def _resolve_chat_template_content_format( jinja_text = (hf_chat_template if isinstance(hf_chat_template, str) else load_chat_template(chat_template, is_literal=True)) - # The InternVL template has mixed content access patterns that fail with automatic detection. + # The InternVL template has mixed content access patterns + # that fail with automatic detection. # Set string format for proper operation if InternVL is used. model_type = getattr(model_config.hf_config, 'model_type', '') - if model_type == 'internvl_chat' or 'internvl' in model_config.model.lower(): + if model_type == 'internvl_chat' or 'internvl' \ + in model_config.model.lower(): detected_format = "string" else: detected_format = ("string" if jinja_text is None else diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 5c82e31e7c4e..dc60aa2bf5e3 100755 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -223,7 +223,8 @@ def _init_sampling_tensors( top_k_scalar, top_p_scalar, current_seq_ids) = \ SamplingTensors.from_sampling_metadata( sampling_metadata, vocab_size, logits.device, logits.dtype, \ - self._prompt_tokens_hpu_cache, self._output_tokens_hpu_cache, self._cached_seq_ids) + self._prompt_tokens_hpu_cache, self._output_tokens_hpu_cache, \ + self._cached_seq_ids) self._sampling_tensors = sampling_tensors self._do_penalties = do_penalties diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index bb9c574eb5cc..fdb88e5bc83c 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -1,6 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import atexit +import contextlib +import io # adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py # -------------------------------------------------------- # InternVL @@ -8,32 +11,33 @@ # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- import os +import time from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence +from 
dataclasses import dataclass +from queue import Queue from typing import Any, Literal, Optional, TypedDict, TypeVar, Union +import numpy as np import numpy.typing as npt import torch import torch.nn as nn import torch.nn.functional as F import torchvision.transforms as T -from PIL import Image -from transformers import BatchEncoding, PretrainedConfig, TensorType - from habana_frameworks.mediapipe import fn -from habana_frameworks.mediapipe.mediapipe import MediaPipe +# Handle MediaPipe pipe_manager destructor +from habana_frameworks.mediapipe.backend.cal import (cpp_pipe_manager_list, + pipe_manager) from habana_frameworks.mediapipe.media_types import dtype as dt from habana_frameworks.mediapipe.media_types import imgtype as it from habana_frameworks.mediapipe.media_types import readerOutType as ro -from habana_frameworks.mediapipe.operators.reader_nodes.reader_nodes import media_ext_reader_op_impl -from habana_frameworks.mediapipe.operators.reader_nodes.reader_nodes import media_ext_reader_op_tensor_info -from habana_frameworks.mediapipe.plugins.iterator_pytorch import MediaGenericPytorchIterator -import numpy as np -from queue import Queue -import io -import time -import atexit -from dataclasses import dataclass +from habana_frameworks.mediapipe.mediapipe import MediaPipe +from habana_frameworks.mediapipe.operators.reader_nodes.reader_nodes import ( + media_ext_reader_op_impl, media_ext_reader_op_tensor_info) +from habana_frameworks.mediapipe.plugins.iterator_pytorch import ( + MediaGenericPytorchIterator) +from PIL import Image +from transformers import BatchEncoding, PretrainedConfig, TensorType from vllm.config import VllmConfig from vllm.model_executor.layers.quantization import QuantizationConfig @@ -311,14 +315,14 @@ def video_to_pixel_values_internvl( pixel_values = torch.stack([transform(image) for image in frames_list]) return pixel_values -# Handle MediaPipe pipe_manager destructor -from habana_frameworks.mediapipe.backend.cal import pipe_manager, cpp_pipe_manager_list def _patched_close(self): - """Patched close method that handles None cpp_pipe_manager_list during shutdown""" + """Patched close method that handles + None cpp_pipe_manager_list during shutdown""" try: # Check if cpp_pipe_manager_list exists and is not None - if cpp_pipe_manager_list is not None and self._pm_ in cpp_pipe_manager_list: + if cpp_pipe_manager_list is not None and \ + self._pm_ in cpp_pipe_manager_list: cpp_pipe_manager_list.remove(self._pm_) except (TypeError, AttributeError): # Handle case where cpp_pipe_manager_list is None or not iterable @@ -329,6 +333,7 @@ def _patched_close(self): self._pm_.close() self._pm_ = None + pipe_manager.close = _patched_close # Queue shared between external reader and mediapipe call @@ -336,11 +341,13 @@ def _patched_close(self): class MediaPytorchIterator(MediaGenericPytorchIterator): + def __init__(self, mediapipe): super().__init__(mediapipe=mediapipe, device="hpu", fw_type="PYT_FW") class external_reader(media_ext_reader_op_impl): + def __init__(self, params, fw_params): self.batch_size = fw_params.batch_size self.max_file = "" @@ -355,13 +362,11 @@ def __len__(self): def __next__(self): img_list = shared_q.get() for i in range(len(img_list)): - # NOTE: this padding is needed because of HW alignmnet requirment + # NOTE: this padding is needed because of HW alignment requirement rem = len(img_list[i]) % 64 pad = (64 - rem) % 64 if pad: - img_list[i] = np.pad(img_list[i], - (0, pad), - 'constant') + img_list[i] = np.pad(img_list[i], (0, pad), 'constant') return 
img_list def get_media_output_type(self): @@ -379,17 +384,11 @@ def gen_output_info(self): class hpuMediaPipe(MediaPipe): - def __init__(self, device, queue_depth, batch_size, - num_threads, op_device, + + def __init__(self, device, queue_depth, batch_size, num_threads, op_device, img_height, img_width): - super( - hpuMediaPipe, - self).__init__( - device, - queue_depth, - batch_size, - num_threads, - self.__class__.__name__) + super().__init__(device, queue_depth, batch_size, num_threads, + self.__class__.__name__) mediapipe_seed = int(time.time_ns() % (2**31 - 1)) @@ -397,27 +396,35 @@ def __init__(self, device, queue_depth, batch_size, num_outputs=1, seed=mediapipe_seed, device=op_device) - self.decode = fn.ImageDecoder( - device="hpu", output_format=it.RGB_I, resize=[img_width, img_height]) - - self.mean_node = fn.MediaConst( - data=np.array([IMAGENET_MEAN[0]*255.0, - IMAGENET_MEAN[1]*255.0, - IMAGENET_MEAN[2]*255.0], dtype=dt.FLOAT32), - shape=[1, 1, 3], dtype=dt.FLOAT32) - - self.std_node = fn.MediaConst( - data=np.array([1.0/(IMAGENET_STD[0]*255.0), - 1.0/(IMAGENET_STD[1]*255.0), - 1.0/(IMAGENET_STD[2]*255.0)], dtype=dt.FLOAT32), - shape=[1, 1, 3], dtype=dt.FLOAT32) - - self.cmn = fn.CropMirrorNorm(crop_w=img_width, crop_h=img_height, dtype=dt.FLOAT32, device="hpu") + self.decode = fn.ImageDecoder(device="hpu", + output_format=it.RGB_I, + resize=[img_width, img_height]) + + self.mean_node = fn.MediaConst(data=np.array([ + IMAGENET_MEAN[0] * 255.0, IMAGENET_MEAN[1] * 255.0, + IMAGENET_MEAN[2] * 255.0 + ], + dtype=dt.FLOAT32), + shape=[1, 1, 3], + dtype=dt.FLOAT32) + + self.std_node = fn.MediaConst(data=np.array([ + 1.0 / (IMAGENET_STD[0] * 255.0), 1.0 / + (IMAGENET_STD[1] * 255.0), 1.0 / (IMAGENET_STD[2] * 255.0) + ], + dtype=dt.FLOAT32), + shape=[1, 1, 3], + dtype=dt.FLOAT32) + + self.cmn = fn.CropMirrorNorm(crop_w=img_width, + crop_h=img_height, + dtype=dt.FLOAT32, + device="hpu") self.transpose = fn.Transpose( device="hpu", tensorDim=4, - permutation=[1, 2, 0, 3] #NCHW + permutation=[1, 2, 0, 3] #NCHW ) def definegraph(self): @@ -437,29 +444,31 @@ def definegraph(self): # ----------------------------------------------------------------------------- @dataclass class _PipeState: - pipe: hpuMediaPipe | None = None - it: MediaGenericPytorchIterator | None = None - bsz: int | None = None - H: int | None = None - W: int | None = None + pipe: hpuMediaPipe | None = None + it: MediaGenericPytorchIterator | None = None + bsz: int | None = None + H: int | None = None + W: int | None = None class MediaPipeTiler: """Owns and reuses MediaPipe pipes/iterators for main path""" + def __init__(self) -> None: - self._main = _PipeState() + self._main = _PipeState() def _rebuild(self, st: _PipeState, *, bsz: int, H: int, W: int) -> None: if st.pipe is not None: - try: + with contextlib.suppress(BaseException): st.pipe.close() - except Exception: - pass pipe = hpuMediaPipe("legacy", 0, bsz, 1, "cpu", H, W) pipe.build() - st.pipe, st.it, st.bsz, st.H, st.W = pipe, iter(MediaPytorchIterator(pipe)), bsz, H, W + st.pipe, st.it, st.bsz, st.H, st.W = pipe, iter( + MediaPytorchIterator(pipe)), bsz, H, W - def ensure_main(self, *, bsz: int, H: int, W: int) -> tuple[hpuMediaPipe, MediaGenericPytorchIterator]: + def ensure_main( + self, *, bsz: int, H: int, + W: int) -> tuple[hpuMediaPipe, MediaGenericPytorchIterator]: st = self._main if st.pipe is None or st.bsz != bsz or st.H != H or st.W != W: self._rebuild(st, bsz=bsz, H=H, W=W) @@ -486,17 +495,17 @@ def close_all(self) -> None: _MP = MediaPipeTiler() 
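_MP above is a module-level singleton, and the atexit registration that follows closes any live pipe at interpreter shutdown. The reuse rule itself, rebuilding the pipe only when the (batch size, height, width) key changes, is shown stripped of the MediaPipe specifics in the sketch below; DummyPipe is a stand-in and not the real hpuMediaPipe.

    # Reuse pattern behind MediaPipeTiler, with a dummy resource standing in
    # for an expensive-to-build media pipe.
    from typing import Optional, Tuple


    class DummyPipe:

        def __init__(self, key: Tuple[int, int, int]) -> None:
            self.key = key          # imagine an expensive build() here

        def close(self) -> None:
            pass


    class ReusablePipe:

        def __init__(self) -> None:
            self._pipe: Optional[DummyPipe] = None
            self._key: Optional[Tuple[int, int, int]] = None

        def ensure(self, bsz: int, h: int, w: int) -> DummyPipe:
            key = (bsz, h, w)
            if self._pipe is None or self._key != key:
                if self._pipe is not None:
                    self._pipe.close()      # tear down the stale pipe first
                self._pipe, self._key = DummyPipe(key), key
            return self._pipe


    pipes = ReusablePipe()
    a = pipes.ensure(8, 448, 448)
    b = pipes.ensure(8, 448, 448)    # same shape: reused, no rebuild
    c = pipes.ensure(4, 448, 448)    # batch changed: rebuilt
    assert a is b and c is not b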
atexit.register(_MP.close_all) + def get_image_info(data): # Get image info using PIL without decoding try: with Image.open(io.BytesIO(data)) as img: - return { - 'format': img.format, - 'size': img.size, - 'mode': img.mode - } + return {'format': img.format, 'size': img.size, 'mode': img.mode} except Exception as e: - raise ValueError(f"Input image bitstream is not in supported format: {str(e)}") + raise ValueError( + f"Input image bitstream is not in supported format: {str(e)}" + ) from None + def preprocess_images( images, @@ -505,8 +514,6 @@ def preprocess_images( use_thumbnail=False, ): batch_size = 0 - queue_depth = 0 - num_threads = 1 # validate images and create batches img_size = None @@ -515,8 +522,11 @@ def preprocess_images( for img in images: img_info = get_image_info(img) if not (img_info['format'] == 'JPEG' and img_info['mode'] == 'RGB'): - raise ValueError(f"HPU media pipeline only supports JPEG images in RGB mode. Detected format={img_info['format']}, mode={img_info['mode']}") - if img_size==None: + raise ValueError( + f"HPU media pipeline only supports JPEG images in RGB mode. \ + Detected format={img_info['format']}, \ + mode={img_info['mode']}") + if img_size is None: img_size = img_info['size'] else: if img_info['size'] != img_size: @@ -543,9 +553,13 @@ def preprocess_images( ) # if batch, H, W is changed, create new one - main_pipe, main_iter = _MP.ensure_main(bsz=batch_size, H=target_height, W=target_width) + main_pipe, main_iter = _MP.ensure_main(bsz=batch_size, + H=target_height, + W=target_width) - img_list = np.empty(shape=[batch_size, ], dtype=object) + img_list = np.empty(shape=[ + batch_size, + ], dtype=object) for i in range(batch_size): img_list[i] = np.frombuffer(images[i], np.uint8) @@ -554,50 +568,58 @@ def preprocess_images( processed_images = next(main_iter)[0] except StopIteration: _MP.reset_iter() - _, main_iter = _MP.ensure_main(bsz=batch_size, H=target_height, W=target_width) + _, main_iter = _MP.ensure_main(bsz=batch_size, + H=target_height, + W=target_width) processed_images = next(main_iter)[0] finally: shared_q.task_done() - # tiling vectorization: [N,C,H,W] -> [N,Ty,Tx,C,ps,ps] -> [N, T, C, ps, ps] + # tiling vectorization: + # [N,C,H,W] -> [N,Ty,Tx,C,ps,ps] -> [N, T, C, ps, ps] N, C, H, W = processed_images.shape Ty, Tx = H // patch_size, W // patch_size T = Ty * Tx use_thumb_now = use_thumbnail and (T > 1) - x = processed_images.view(N, C, Ty, patch_size, Tx, patch_size) \ + x = processed_images.view(N, C, Ty, \ + patch_size, Tx, patch_size) \ .permute(0, 2, 4, 1, 3, 5) \ .contiguous() \ - .view(N, T, C, patch_size, patch_size) # [N,T,C,ps,ps] + .view(N, T, C, \ + patch_size, patch_size) # [N,T,C,ps,ps] if use_thumb_now: # [N,3,ps,ps] - thumbs_batch = F.interpolate(processed_images, size=(patch_size, patch_size), - mode="bilinear", align_corners=False) + thumbs_batch = F.interpolate(processed_images, + size=(patch_size, patch_size), + mode="bilinear", + align_corners=False) num_patches = T + 1 if use_thumb_now else T - image_num_patches[image_num_patches_idx:image_num_patches_idx+batch_size] = num_patches + image_num_patches[image_num_patches_idx:image_num_patches_idx + + batch_size] = num_patches image_num_patches_idx += batch_size # consist tensor batch based (tile + thumbnail) if use_thumb_now: out = torch.empty((N, T + 1, C, patch_size, patch_size), - dtype=processed_images.dtype, device=processed_images.device) + dtype=processed_images.dtype, + device=processed_images.device) out[:, :T] = x - out[:, T] = thumbs_batch - out = out.view(N * 
(T + 1), C, patch_size, patch_size) # [N*(T+1),C,ps,ps] + out[:, T] = thumbs_batch + out = out.view(N * (T + 1), C, patch_size, + patch_size) # [N*(T+1),C,ps,ps] else: # no thumbnail (T==1 or off) - out = x.view(N * T, C, patch_size, patch_size) # [N*T,C,ps,ps] + out = x.view(N * T, C, patch_size, patch_size) # [N*T,C,ps,ps] batch_patches.append(out) - patches_flat = ( - torch.cat(batch_patches, dim=0) - if batch_patches - else torch.empty((0, 3, patch_size, patch_size), dtype=torch.float32) - ) + patches_flat = (torch.cat(batch_patches, dim=0) + if batch_patches else torch.empty( + (0, 3, patch_size, patch_size), dtype=torch.float32)) return patches_flat, image_num_patches @@ -756,7 +778,8 @@ def _preprocess_image( if len(images) == 0: image_inputs = {} else: - use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", "false").lower() in ("1", "true", "yes") + use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", + "false").lower() in ("1", "true", "yes") if use_mediapipe: # Use HPU media pipeline for image preprocessing min_num, max_num = self.resolve_min_max_num( @@ -785,7 +808,9 @@ def _preprocess_image( feature_size = num_patches * self.num_image_token image_repl = self.get_image_repl(feature_size, num_patches) - text = [t.replace('', image_repl.full, 1) for t in text] + text = [ + t.replace('', image_repl.full, 1) for t in text + ] else: pixel_values_lst = self._images_to_pixel_values_lst( @@ -806,7 +831,9 @@ def _preprocess_image( feature_size = num_patches * self.num_image_token image_repl = self.get_image_repl(feature_size, num_patches) - text = [t.replace('', image_repl.full, 1) for t in text] + text = [ + t.replace('', image_repl.full, 1) for t in text + ] return text, image_inputs @@ -821,7 +848,8 @@ def _make_batch_input(self, def __call__( self, text: Optional[Union[str, list[str]]] = None, - images: Optional[Union[Image.Image, list[Image.Image], bytes, list[bytes]]] = None, + images: Optional[Union[Image.Image, list[Image.Image], bytes, + list[bytes]]] = None, min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, @@ -940,7 +968,8 @@ def _preprocess_video( def __call__( self, text: Optional[Union[str, list[str]]] = None, - images: Optional[Union[Image.Image, list[Image.Image], bytes, list[bytes]]] = None, + images: Optional[Union[Image.Image, list[Image.Image], bytes, + list[bytes]]] = None, videos: Optional[Union[npt.NDArray, list[npt.NDArray]]] = None, min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, @@ -1784,8 +1813,8 @@ def forward( elif inputs_embeds is None: if not is_hpu: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, vision_embeddings) input_ids = None forward_kwargs = { diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 0157010b2c1e..0229f3bd6169 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -562,9 +562,9 @@ def merge_multimodal_embeddings( else: flat_ids.append(int(x)) - placeholder_token_id = torch.as_tensor( - flat_ids, device=input_ids.device, dtype=input_ids.dtype - ) + placeholder_token_id = torch.as_tensor(flat_ids, + device=input_ids.device, + dtype=input_ids.dtype) return _merge_multimodal_embeddings( inputs_embeds, torch.isin(input_ids, placeholder_token_id), diff --git a/vllm/model_executor/sampling_metadata.py 
b/vllm/model_executor/sampling_metadata.py index 16c9292f6bec..efef82b261e4 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -567,8 +567,8 @@ def from_lists( if do_penalties: if is_hpu: - if (prompt_tokens_cache is not None and - prompt_tokens_cache.device == device): + if (prompt_tokens_cache is not None + and prompt_tokens_cache.device == device): # Reuse cached prompt_tokens already on HPU prompt_t = prompt_tokens_cache else: @@ -580,17 +580,21 @@ def from_lists( pin_memory=pin_memory, max_len_align=1024, ) - if (output_tokens_cache is not None and - output_tokens_cache.device == device and - len(output_tokens) > 0 and len(output_tokens_cache[0]) > 0): + if (output_tokens_cache is not None + and output_tokens_cache.device == device + and len(output_tokens) > 0 + and len(output_tokens_cache[0]) > 0): # Get the last element from each list last_elements = [out[-1] for out in output_tokens] - lengths = [len(out)-1 for out in output_tokens] + lengths = [len(out) - 1 for out in output_tokens] indices = torch.tensor(lengths, device=device) - rows = torch.arange(output_tokens_cache.shape[0], device=device) + rows = torch.arange(output_tokens_cache.shape[0], + device=device) # Convert to a PyTorch tensor with shape [4, 1] - last_elements_t = torch.tensor(last_elements).unsqueeze(1).to(output_tokens_cache.device) - output_t = output_tokens_cache.index_put_((rows, indices), last_elements_t) + last_elements_t = torch.tensor(last_elements).unsqueeze( + 1).to(output_tokens_cache.device) + output_t = output_tokens_cache.index_put_((rows, indices), + last_elements_t) else: output_t = make_tensor_with_pad_align( output_tokens, @@ -675,6 +679,7 @@ def from_lists( non_blocking=True), repetition_penalties=repetition_penalties_t.to(device=device, non_blocking=True), - prompt_tokens=prompt_t.to(device=device, non_blocking=True) if prompt_t.device != device else prompt_t, - output_tokens=output_t.to(device=device, non_blocking=True) if output_t.device != device else output_t - ) + prompt_tokens=prompt_t.to(device=device, non_blocking=True) + if prompt_t.device != device else prompt_t, + output_tokens=output_t.to(device=device, non_blocking=True) + if output_t.device != device else output_t) diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 693a9a230fb3..53bee48d180c 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import io from abc import ABC, abstractmethod from collections import UserDict from collections.abc import Callable, Iterator, Mapping, Sequence from typing import (TYPE_CHECKING, Any, Generic, Literal, NamedTuple, Optional, TypeVar, Union) -import io import numpy as np import torch from typing_extensions import TypeAlias, TypeGuard, assert_never @@ -410,8 +410,7 @@ def _parse_image_data( if self._is_embeddings(data): return ImageEmbeddingItems(data) - if (isinstance(data, PILImage.Image) - or isinstance(data, (bytes, bytearray)) + if (isinstance(data, (PILImage.Image, bytes, bytearray)) or isinstance(data, (np.ndarray, torch.Tensor)) and data.ndim == 3): data_items = [data] diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 2a5d4a0c2098..2df6d92ee750 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os from 
itertools import groupby from pathlib import Path from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union from urllib.parse import ParseResult, urlparse -import os import numpy as np import numpy.typing as npt import torch @@ -114,7 +114,9 @@ def load_from_url( data = connection.get_bytes(url, timeout=fetch_timeout) if load_type == "bytes": - msg = "Only data URLs are currently supported for raw bytes loading." + msg = "Only data URLs are currently supported \ + for raw bytes loading." + raise NotImplementedError(msg) else: connection = self.connection @@ -127,7 +129,9 @@ def load_from_url( if url_spec.scheme == "file": if load_type == "bytes": - msg = "Only data URLs are currently supported for raw bytes loading." + msg = "Only data URLs are currently supported \ + for raw bytes loading." + raise NotImplementedError(msg) else: return self._load_file_url(url_spec, media_io) @@ -147,11 +151,14 @@ async def load_from_url_async( if url_spec.scheme.startswith("http"): if load_type == "bytes": - msg = "Only data URLs are currently supported for raw bytes loading." + msg = "Only data URLs are currently supported \ + for raw bytes loading." + raise NotImplementedError(msg) else: connection = self.connection - data = await connection.async_get_bytes(url, timeout=fetch_timeout) + data = await connection.async_get_bytes(url, + timeout=fetch_timeout) return media_io.load_bytes(data) @@ -160,7 +167,9 @@ async def load_from_url_async( if url_spec.scheme == "file": if load_type == "bytes": - msg = "Only data URLs are currently supported for raw bytes loading." + msg = "Only data URLs are currently supported \ + for raw bytes loading." + raise NotImplementedError(msg) else: return self._load_file_url(url_spec, media_io) @@ -210,7 +219,8 @@ def fetch_image( By default, the image is converted into RGB format. """ image_io = ImageMediaIO(image_mode=image_mode) - use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", "false").lower() in ("1", "true", "yes") + use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", + "false").lower() in ("1", "true", "yes") load_type = "bytes" if use_mediapipe else "PIL" try: @@ -231,12 +241,14 @@ async def fetch_image_async( image_mode: str = "RGB", ) -> Image.Image: """ - Asynchronously load a PIL image (or raw bytes) from a HTTP or base64 data URL. + Asynchronously load a PIL image (or raw bytes) + from a HTTP or base64 data URL. By default, the image is converted into RGB format. 
""" image_io = ImageMediaIO(image_mode=image_mode) - use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", "false").lower() in ("1", "true", "yes") + use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", + "false").lower() in ("1", "true", "yes") load_type = "bytes" if use_mediapipe else "PIL" try: diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 4c09ac5b1188..2aecded1b3b8 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1891,8 +1891,9 @@ def _prepare_prompt( use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", "false").lower() in ("1", "true", "yes") if use_mediapipe: - # With mediapipe path some tensors will already be on HPU, we only move to HPU if needed - for key in multi_modal_kwargs.keys(): + # With mediapipe path some tensors will already be on HPU, + # we only move to HPU if needed + for key in multi_modal_kwargs: if hasattr(multi_modal_kwargs[key], "device" ) and multi_modal_kwargs[key].device != self.device: multi_modal_kwargs[key] = self.move_to_device( @@ -4032,8 +4033,10 @@ def try_revert_dummy_output_tokens(): sampling_tensors = self.model.sampler._sampling_tensors if sampling_tensors.prompt_tokens.numel() > 0: # Cache the prompt_tokens tensor that's already on HPU - self.model.sampler._prompt_tokens_hpu_cache = sampling_tensors.prompt_tokens - self.model.sampler._output_tokens_hpu_cache = sampling_tensors.output_tokens + self.model.sampler._prompt_tokens_hpu_cache = \ + sampling_tensors.prompt_tokens + self.model.sampler._output_tokens_hpu_cache = \ + sampling_tensors.output_tokens if use_delayed_sampling \ and model_input.async_callback is not None: model_input.async_callback() From bced3c57b0265b5eaa496d327f28a7ea2b8e9d51 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Fri, 3 Oct 2025 14:34:53 -0700 Subject: [PATCH 16/20] Fix pre-commit --- vllm/entrypoints/chat_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 7d549bb58558..c1e12a7fc4f2 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -411,10 +411,10 @@ def _resolve_chat_template_content_format( model_type = getattr(model_config.hf_config, 'model_type', '') if model_type == 'internvl_chat' or 'internvl' \ in model_config.model.lower(): - detected_format = "string" + detected_format:_ChatTemplateContentFormat = "string" else: detected_format = ("string" if jinja_text is None else - _detect_content_format(jinja_text, default="string")) + _detect_content_format(jinja_text, default="string")) return detected_format From 55f0d819cca4a3df7211f3d7f70c5d41d502d580 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Mon, 6 Oct 2025 10:00:32 -0700 Subject: [PATCH 17/20] Fix pre-commit --- vllm/entrypoints/chat_utils.py | 12 +- vllm/model_executor/layers/sampler.py | 4 +- vllm/multimodal/base.py | 4 + vllm/multimodal/utils.py | 180 +++++++++++++++++++------- vllm/worker/hpu_model_runner.py | 10 +- 5 files changed, 153 insertions(+), 57 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index c1e12a7fc4f2..9d99dc97325e 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -411,12 +411,12 @@ def _resolve_chat_template_content_format( model_type = getattr(model_config.hf_config, 'model_type', '') if model_type == 'internvl_chat' or 'internvl' \ in model_config.model.lower(): - detected_format:_ChatTemplateContentFormat = "string" + detected_format = "string" else: 
detected_format = ("string" if jinja_text is None else - _detect_content_format(jinja_text, default="string")) + _detect_content_format(jinja_text, default="string")) - return detected_format + return cast(_ChatTemplateContentFormat, detected_format) @lru_cache @@ -734,7 +734,7 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: ) def parse_image(self, image_url: str) -> None: - image = self._connector.fetch_image(image_url) + image = self._connector.fetch_image(image_url, load_type="PIL") placeholder = self._tracker.add("image", image) self._add_placeholder(placeholder) @@ -785,7 +785,9 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: ) def parse_image(self, image_url: str) -> None: - image_coro = self._connector.fetch_image_async(image_url) + image_coro = self._connector.fetch_image_async( + image_url, load_type="PIL" + ) placeholder = self._tracker.add("image", image_coro) self._add_placeholder(placeholder) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index dc60aa2bf5e3..a40cac6a86b6 100755 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -218,13 +218,15 @@ def _init_sampling_tensors( # have pinned memory. self._sampling_tensors = None + csi = self._cached_seq_ids if self._cached_seq_ids is not None else set( + ) # Initialize new sampling tensors (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p, top_k_scalar, top_p_scalar, current_seq_ids) = \ SamplingTensors.from_sampling_metadata( sampling_metadata, vocab_size, logits.device, logits.dtype, \ self._prompt_tokens_hpu_cache, self._output_tokens_hpu_cache, \ - self._cached_seq_ids) + csi) self._sampling_tensors = sampling_tensors self._do_penalties = do_penalties diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 7188ed14c573..eef9726d3228 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -214,6 +214,10 @@ def load_base64(self, media_type: str, data: str) -> _T: """ raise NotImplementedError + def load_base64_bytes(self, media_type: str, data: str) -> bytes: + """ Optional """ + raise NotImplementedError + @abstractmethod def load_file(self, filepath: Path) -> _T: raise NotImplementedError diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 2df6d92ee750..a9a1e98f1413 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -4,7 +4,8 @@ import os from itertools import groupby from pathlib import Path -from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union +from typing import (TYPE_CHECKING, Any, Literal, Optional, TypeVar, Union, + overload) from urllib.parse import ParseResult, urlparse import numpy as np @@ -63,18 +64,36 @@ def __init__( self.allowed_local_media_path = allowed_local_media_path_ + @overload def _load_data_url( self, url_spec: ParseResult, media_io: MediaIO[_M], - load_type: str = "PIL", + load_type: Literal["PIL"] = "PIL", ) -> _M: + ... + + @overload + def _load_data_url( + self, + url_spec: ParseResult, + media_io: MediaIO[_M], + load_type: Literal["bytes"], + ) -> bytes: + ... + + def _load_data_url( + self, + url_spec: ParseResult, + media_io: MediaIO[_M], + load_type: Literal["PIL", "bytes"] = "PIL", + ) -> Union[_M, bytes]: data_spec, data = url_spec.path.split(",", 1) media_type, data_type = data_spec.split(";", 1) if data_type != "base64": - msg = "Only base64 data URLs are supported for now." 
- raise NotImplementedError(msg) + raise NotImplementedError( + "Only base64 data URLs are supported for now.") if load_type == "bytes": return media_io.load_base64_bytes(media_type, data) @@ -99,45 +118,79 @@ def _load_file_url( return media_io.load_file(filepath) + @overload def load_from_url( self, url: str, media_io: MediaIO[_M], *, - fetch_timeout: Optional[int] = None, - load_type: str = "bytes", + fetch_timeout: Optional[int] = ..., + load_type: Literal["PIL"] = "PIL", ) -> _M: + ... + + @overload + def load_from_url( + self, + url: str, + media_io: MediaIO[_M], + *, + fetch_timeout: Optional[int] = ..., + load_type: Literal["bytes"], + ) -> bytes: + ... + + def load_from_url( + self, + url: str, + media_io: MediaIO[_M], + *, + fetch_timeout: Optional[int] = None, + load_type: Literal["PIL", "bytes"] = "bytes", + ) -> Union[_M, bytes]: url_spec = urlparse(url) if url_spec.scheme.startswith("http"): - connection = self.connection - data = connection.get_bytes(url, timeout=fetch_timeout) - if load_type == "bytes": - msg = "Only data URLs are currently supported \ - for raw bytes loading." - - raise NotImplementedError(msg) - else: - connection = self.connection - data = connection.get_bytes(url, timeout=fetch_timeout) - - return media_io.load_bytes(data) + raise NotImplementedError( + "Only data URLs are currently supported for bytes loading." + ) + data = self.connection.get_bytes(url, timeout=fetch_timeout) + return media_io.load_bytes(data) if url_spec.scheme == "data": return self._load_data_url(url_spec, media_io, load_type) if url_spec.scheme == "file": if load_type == "bytes": - msg = "Only data URLs are currently supported \ - for raw bytes loading." + raise NotImplementedError( + "Only data URLs are currently supported for bytes loading." + ) + return self._load_file_url(url_spec, media_io) - raise NotImplementedError(msg) - else: - return self._load_file_url(url_spec, media_io) + raise ValueError("The URL must be either a HTTP, data or file URL.") - msg = "The URL must be either a HTTP, data or file URL." - raise ValueError(msg) + @overload + async def load_from_url_async( + self, + url: str, + media_io: MediaIO[_M], + *, + fetch_timeout: Optional[int] = ..., + load_type: Literal["PIL"] = "PIL", + ) -> _M: + ... + + @overload + async def load_from_url_async( + self, + url: str, + media_io: MediaIO[_M], + *, + fetch_timeout: Optional[int] = ..., + load_type: Literal["bytes"], + ) -> bytes: + ... async def load_from_url_async( self, @@ -145,37 +198,30 @@ async def load_from_url_async( media_io: MediaIO[_M], *, fetch_timeout: Optional[int] = None, - load_type: str = "bytes", - ) -> _M: + load_type: Literal["PIL", "bytes"] = "bytes", + ) -> Union[_M, bytes]: url_spec = urlparse(url) if url_spec.scheme.startswith("http"): if load_type == "bytes": - msg = "Only data URLs are currently supported \ - for raw bytes loading." - - raise NotImplementedError(msg) - else: - connection = self.connection - data = await connection.async_get_bytes(url, - timeout=fetch_timeout) - - return media_io.load_bytes(data) + raise NotImplementedError( + "Only data URLs are currently supported for bytes loading." + ) + data = await self.connection.async_get_bytes(url, + timeout=fetch_timeout) + return media_io.load_bytes(data) if url_spec.scheme == "data": return self._load_data_url(url_spec, media_io, load_type) if url_spec.scheme == "file": if load_type == "bytes": - msg = "Only data URLs are currently supported \ - for raw bytes loading." 
- - raise NotImplementedError(msg) - else: - return self._load_file_url(url_spec, media_io) + raise NotImplementedError( + "Only data URLs are currently supported for bytes loading." + ) + return self._load_file_url(url_spec, media_io) - msg = "The URL must be either a HTTP, data or file URL." - raise ValueError(msg) + raise ValueError("The URL must be either a HTTP, data or file URL.") def fetch_audio( self, @@ -207,14 +253,35 @@ async def fetch_audio_async( fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, ) + @overload def fetch_image( self, image_url: str, *, image_mode: str = "RGB", + load_type: Literal["PIL"], ) -> Image.Image: + ... + + @overload + def fetch_image( + self, + image_url: str, + *, + image_mode: str = "RGB", + load_type: Literal["bytes"], + ) -> bytes: + ... + + def fetch_image( + self, + image_url: str, + *, + image_mode: str = "RGB", + load_type: Optional[Literal["PIL", "bytes"]] = None, + ) -> Union[Image.Image, bytes]: """ - Load a PIL image (or raw bytes) from a HTTP or base64 data URL. + Load a PIL image or raw bytes from a HTTP/base64 data URL. By default, the image is converted into RGB format. """ @@ -234,12 +301,33 @@ def fetch_image( # convert to ValueError to be properly caught upstream raise ValueError(str(e)) from e + @overload async def fetch_image_async( self, image_url: str, *, image_mode: str = "RGB", + load_type: Literal["PIL"], ) -> Image.Image: + ... + + @overload + async def fetch_image_async( + self, + image_url: str, + *, + image_mode: str = "RGB", + load_type: Literal["bytes"], + ) -> bytes: + ... + + async def fetch_image_async( + self, + image_url: str, + *, + image_mode: str = "RGB", + load_type: Optional[Literal["PIL", "bytes"]] = None, + ) -> Union[Image.Image, bytes]: """ Asynchronously load a PIL image (or raw bytes) from a HTTP or base64 data URL. diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 2aecded1b3b8..af1f83c10c6b 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -3899,11 +3899,11 @@ def try_revert_dummy_output_tokens(): ('pixel_values')in model_input.multi_modal_kwargs)) execute_model_kwargs['attn_metadata'] = attn_metadata - if 'image_index' in model_input.multi_modal_kwargs: - execute_model_kwargs[ - 'image_index'] = model_input.multi_modal_kwargs[ - 'image_index'] - model_input.multi_modal_kwargs.pop('image_index', None) + mmks = model_input.multi_modal_kwargs + if mmks is not None and 'image_index' in mmks: + execute_model_kwargs['image_index'] = mmks[ + 'image_index'] + mmks.pop('image_index', None) if not bypass_model_exec: if self.model_is_mrope or (self.is_mm_optimized From 08c5f9ecc6b175b0d75bbcd2633f874965380da8 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Mon, 6 Oct 2025 13:41:40 -0700 Subject: [PATCH 18/20] GEMMA3:move decode embedding from hpu_model_runner to gemma3 (https://github.com/HabanaAI/vllm-fork/pull/1848) --- vllm/model_executor/models/gemma3_mm.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 24fb4147e680..b267425e8556 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -641,10 +641,7 @@ def forward(self, # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. 
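One note on the fetch helpers reworked above: the typing.overload declarations paired with Literal load_type values give callers a precise return type (an image object for "PIL", raw bytes for "bytes") while keeping a single runtime implementation. A minimal standalone sketch of that pattern is below; MediaLoader is made up for illustration and str stands in for the PIL image type.

    # Minimal sketch of the overload/Literal pattern (illustration only).
    from typing import Literal, Union, overload


    class MediaLoader:

        @overload
        def fetch(self, url: str, *, load_type: Literal["PIL"]) -> str: ...

        @overload
        def fetch(self, url: str, *, load_type: Literal["bytes"]) -> bytes: ...

        def fetch(self, url: str, *,
                  load_type: Literal["PIL", "bytes"] = "PIL"
                  ) -> Union[str, bytes]:
            # One runtime body; the overloads only narrow the return type
            # that static type checkers report at each call site.
            data = b"raw-bytes-for-" + url.encode()
            return data if load_type == "bytes" else f"decoded:{url}"


    loader = MediaLoader()
    img = loader.fetch("http://x/y.jpg", load_type="PIL")    # checked as str
    raw = loader.fetch("http://x/y.jpg", load_type="bytes")  # checked as bytes
    assert isinstance(img, str) and isinstance(raw, bytes)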
- elif inputs_embeds is None: - if is_hpu: - raise AssertionError("hpu_model_runner should be computing \ - inputs_embeds") + elif inputs_embeds is None and not is_hpu: vision_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, From 2d5ad937b56cadf4220e763db4bdf8cf680b22b9 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Mon, 6 Oct 2025 13:54:29 -0700 Subject: [PATCH 19/20] Fix vit_embeds duplication issue when N breakdown > 1 (From 5902ec7) --- vllm/model_executor/models/internvl.py | 29 +++++++++++++++++++------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index fdb88e5bc83c..4fe54f16531c 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -1510,10 +1510,13 @@ def extract_feature(self, pixel_values: torch.Tensor) -> torch.Tensor: else: batch_breakdown = [pixel_values.shape[0]] + total_mb = len(batch_breakdown) + need_copy = total_mb > 1 + start_idx = 0 vit_embeds_minibatches = [] - for i in batch_breakdown: + for mb_idx, i in enumerate(batch_breakdown): end_idx = start_idx + i batch_sliced_pixel_values = \ pixel_values[start_idx:end_idx, ...] @@ -1541,14 +1544,24 @@ def extract_feature(self, pixel_values: torch.Tensor) -> torch.Tensor: vit_embeds_minibatch.shape[-1]) if is_lazy: - vit_embeds_minibatches += [ - self.mlp1(vit_embeds_minibatch, - bypass_hpu_graphs=i - not in self.graphed_multimodal_buckets - and len(self.graphed_multimodal_buckets) > 0) - ] + proj = self.mlp1( + vit_embeds_minibatch, + bypass_hpu_graphs=i + not in self.graphed_multimodal_buckets + and len(self.graphed_multimodal_buckets) > 0) else: - vit_embeds_minibatches += [self.mlp1(vit_embeds_minibatch)] + proj = self.mlp1(vit_embeds_minibatch) + + if need_copy and (mb_idx < total_mb - 1): + proj_safe = torch.empty_like(proj) + proj_safe.copy_(proj) + if is_lazy: + import habana_frameworks.torch as htorch + htorch.core.mark_step() + vit_embeds_minibatches.append(proj_safe) + else: + vit_embeds_minibatches.append(proj) + start_idx = end_idx vit_embeds = torch.cat(vit_embeds_minibatches, dim=0) else: From b96a809439940b95733e97e5e3ec854bfb062da8 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Wed, 8 Oct 2025 15:13:52 -0700 Subject: [PATCH 20/20] Fix review comment --- vllm/worker/hpu_model_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 787c33e5c0cb..442c723cc92b 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -4036,6 +4036,7 @@ def try_revert_dummy_output_tokens(): # Cache the prompt_tokens tensor that's already on HPU self.model.sampler._prompt_tokens_hpu_cache = \ sampling_tensors.prompt_tokens + if sampling_tensors.output_tokens.numel() > 0: self.model.sampler._output_tokens_hpu_cache = \ sampling_tensors.output_tokens if use_delayed_sampling \
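The duplication fix in this patch copies each minibatch's projected embeddings into a fresh tensor (and, in lazy mode, calls htorch.core.mark_step()) before the next minibatch runs. The working assumption is that a graph-wrapped module may return views of internal output buffers that the next replay overwrites, so results must be copied out first. A CPU-only schematic of that failure mode and the fix follows; DummyProjector is a stand-in for the graph-wrapped mlp1, not real HPU graph behavior.

    # Schematic of the copy-before-next-call pattern (CPU stand-in).
    import torch


    class DummyProjector:
        """Stand-in for a module that reuses one output buffer per call."""

        def __init__(self) -> None:
            self._out = torch.empty(2, 4)

        def __call__(self, x: torch.Tensor) -> torch.Tensor:
            self._out.copy_(x * 2.0)
            return self._out              # same storage on every call


    proj = DummyProjector()
    minibatches = [torch.ones(2, 4), torch.full((2, 4), 3.0)]

    unsafe = [proj(mb) for mb in minibatches]       # both entries alias _out
    safe = []
    for mb in minibatches:
        out = proj(mb)
        copied = torch.empty_like(out)
        copied.copy_(out)                           # detach from reused buffer
        # the patch also calls htorch.core.mark_step() here under lazy mode
        safe.append(copied)

    assert torch.equal(unsafe[0], unsafe[1])        # duplicated data (all 6.0)
    assert not torch.equal(safe[0], safe[1])        # per-minibatch data kept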