From df3d4c25653508f90b74030643b2b00684324541 Mon Sep 17 00:00:00 2001 From: Jianhong-Zhang Date: Tue, 14 Oct 2025 17:44:50 -0700 Subject: [PATCH 1/2] Fix cache miss for InternVL --------- Signed-off-by: Jianhong Zhang --- vllm/model_executor/models/internvl.py | 1 + vllm/worker/hpu_model_runner.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index ef5861c5a41d..5e67c0a90146 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -1491,6 +1491,7 @@ def extract_feature(self, pixel_values: torch.Tensor) -> torch.Tensor: end_idx = start_idx + i batch_sliced_pixel_values = \ pixel_values[start_idx:end_idx, ...] + batch_sliced_pixel_values = batch_sliced_pixel_values.contiguous().clone() if is_lazy: vit_embeds_minibatch = \ self.vision_model( diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 9d7b81c7bcde..31b978fb47b4 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -3960,6 +3960,13 @@ def try_revert_dummy_output_tokens(): with self.profiler.record_event('internal', model_event_name, args=profiler_args): + # Clone inputs_embeds early to prevent cache miss failure + if 'InternVLChatModel' in str(type(self.model.model)): + if "inputs_embeds" in execute_model_kwargs: + orig = execute_model_kwargs["inputs_embeds"] + execute_model_kwargs["inputs_embeds"] = orig.contiguous().clone() + if sampling_metadata.selected_token_indices is not None: + sampling_metadata.selected_token_indices = sampling_metadata.selected_token_indices.clone() hidden_states = self.model.forward( **execute_model_kwargs, selected_token_indices=sampling_metadata. From 3b3241af9411f10aed246766bfd63a7659118f25 Mon Sep 17 00:00:00 2001 From: Jianhong-Zhang Date: Tue, 14 Oct 2025 22:13:10 -0700 Subject: [PATCH 2/2] Fix cache miss for Ovis2.5 --------- Signed-off-by: Jianhong Zhang Co-authored-by: Socek, Daniel --- vllm/model_executor/models/ovis2_5.py | 3 +++ vllm/worker/hpu_model_runner.py | 9 ++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py index f2d44cc1a057..f4383adab63a 100644 --- a/vllm/model_executor/models/ovis2_5.py +++ b/vllm/model_executor/models/ovis2_5.py @@ -548,8 +548,11 @@ def _process_image_input( image_patches_flat.to(target_dtype), grid_thws, self.vision_buckets) + visual_embeds = visual_embeds.contiguous().clone() + grid_thws = grid_thws.contiguous().clone() visual_tokens = self.visual_tokenizer(visual_embeds, grid_thws) visual_embeds = self.vte(visual_tokens) # 1:1 numeric eq. + indicator_tokens = indicator_tokens.contiguous().clone() indicator_embeds = self.vte(indicator_tokens) padded_patches_per_image = [ grid[1] * grid[2] // (self.config.vit_config.hidden_stride**2) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 31b978fb47b4..be65c3d4e9c6 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -3961,12 +3961,19 @@ def try_revert_dummy_output_tokens(): model_event_name, args=profiler_args): # Clone inputs_embeds early to prevent cache miss failure - if 'InternVLChatModel' in str(type(self.model.model)): + if 'InternVLChatModel' in str(type(self.model.model)) or 'Ovis2_5' in str(type(self.model.model)): if "inputs_embeds" in execute_model_kwargs: orig = execute_model_kwargs["inputs_embeds"] execute_model_kwargs["inputs_embeds"] = orig.contiguous().clone() if sampling_metadata.selected_token_indices is not None: sampling_metadata.selected_token_indices = sampling_metadata.selected_token_indices.clone() + if 'Ovis2_5' in str(type(self.model.model)): + if "indicator_tokens" in execute_model_kwargs: + execute_model_kwargs.pop('indicator_tokens', None) + if "pixel_values" in execute_model_kwargs: + execute_model_kwargs.pop('pixel_values', None) + if "grids" in execute_model_kwargs: + execute_model_kwargs.pop('grids', None) hidden_states = self.model.forward( **execute_model_kwargs, selected_token_indices=sampling_metadata.