From 36eb2cf9cec7ed62d40f29f28f2af48fef31794f Mon Sep 17 00:00:00 2001 From: Supreet Singh Date: Thu, 18 Sep 2025 05:37:04 -0700 Subject: [PATCH 01/20] Fixes HPU graph run for Gemma3 vision inputs (#1865) Fixes HPU graph issues for gemma3 vision inputs Text warmup to include attn_mask info, so vision+text data can reuse the graph for language model that's warmed up already. Changing slicing to index_select for multimodal bucketing for HPU. Slicing doesn't produce the same hash for the HPU graph with same input shape. Use buckets for the vision tower as well to reduce GC recompile Accuracy bug fix by clone output data of the multimodal-projector. Validated with Muirbench datasets. --- .../configs/Qwen2.5-VL-7B-Instruct.yaml | 2 +- vllm/model_executor/models/gemma3_mm.py | 33 ++++++------- vllm/worker/hpu_model_runner.py | 48 +++++++++++-------- 3 files changed, 44 insertions(+), 39 deletions(-) diff --git a/.jenkins/vision/configs/Qwen2.5-VL-7B-Instruct.yaml b/.jenkins/vision/configs/Qwen2.5-VL-7B-Instruct.yaml index 294b538633e5..840eaa12a922 100644 --- a/.jenkins/vision/configs/Qwen2.5-VL-7B-Instruct.yaml +++ b/.jenkins/vision/configs/Qwen2.5-VL-7B-Instruct.yaml @@ -1,5 +1,5 @@ model_name: "/mnt/weka/data/pytorch/Qwen/Qwen2.5-VL-7B-Instruct/" dtype: "bfloat16" -max_model_len: 32768 +max_model_len: 35840 max_num_seqs: 32 num_prompts: 4 diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index b80a4ab5951c..578e49e7fd78 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -569,11 +569,6 @@ def _process_image_input( pixel_values = image_input["pixel_values"] num_patches = image_input["num_patches"] - image_features = self._image_pixels_to_features( - self.vision_tower, - pixel_values, - ) - if is_hpu: batch_breakdown = greedy_plan(pixel_values.shape[0], \ self.vision_buckets.multimodal_buckets) @@ -582,22 +577,24 @@ def _process_image_input( for i in batch_breakdown: end_idx = start_idx + i - batch_sliced_image_features = \ - image_features[start_idx:end_idx, ...] 
- if is_lazy: - image_embeds_multibatches += \ - [self.multi_modal_projector( - batch_sliced_image_features, - bypass_hpu_graphs=i - not in self.graphed_multimodal_buckets - and len(self.graphed_multimodal_buckets) > 0)] - else: - image_embeds_multibatches += \ - [self.multi_modal_projector( \ - batch_sliced_image_features)] + indices = torch.arange(start_idx, end_idx) + batch_sliced_pixel_values = torch.index_select(pixel_values, + dim=0, + index=indices) + + image_features = self._image_pixels_to_features( + self.vision_tower, + batch_sliced_pixel_values, + ) + image_embeds = self.multi_modal_projector(image_features) + image_embeds_multibatches += [image_embeds.clone()] start_idx = end_idx image_embeds = torch.cat(image_embeds_multibatches, dim=0) else: + image_features = self._image_pixels_to_features( + self.vision_tower, + pixel_values, + ) image_embeds = self.multi_modal_projector(image_features) return [ e.flatten(0, 1) for e in image_embeds.split(num_patches.tolist()) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 3fd9e2c78158..f400db0d7c07 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -373,7 +373,7 @@ def __init__(self, model, vllm_config, is_causal, sampler): if self.is_mm_optimized: if hasattr(self.model, 'vision_tower'): self.model.vision_tower = htorch.hpu.wrap_in_hpu_graph( - self.model.vision_tower, disable_tensor_cache=True) + self.model.vision_tower, disable_tensor_cache=False) if hasattr(self.model, 'multi_modal_projector'): self.model.multi_modal_projector = \ htorch.hpu.wrap_in_hpu_graph( \ @@ -619,13 +619,19 @@ def _update_metadata(self, device, dtype, True) return attn_metadata - def compute_input_embeddings_for_mm_optimized(self, **kwargs): + def compute_input_embeddings_for_mm_optimized(self, warmup_mode, **kwargs): input_ids = kwargs['input_ids'] vision_embeddings = self.model.get_multimodal_embeddings(**kwargs) inputs_embeds = self.model.get_input_embeddings( input_ids, vision_embeddings) - if vision_embeddings is not None: + # TODO: In warmup, we need to warmup the model with dummy image data for + # multimodal model for prompt, here instead of generating a dummy image, + # we are just generating attn_mask for the images and pass with + # attn_metadata, so we can reuse HPU graph without running + # the whole vision tower. 
+ if vision_embeddings is not None or ( + warmup_mode and kwargs['attn_metadata'].is_prompt): input_ids = kwargs['input_ids'] positions = kwargs['positions'] kwargs = self.model.prepare_attn_masks( @@ -634,14 +640,16 @@ def compute_input_embeddings_for_mm_optimized(self, **kwargs): ) kwargs['input_ids'] = input_ids kwargs['positions'] = positions - #input_ids = None kwargs.update({'inputs_embeds': inputs_embeds}) - # done compute the visual tokens + # done compute the visual tokens and others kwargs.pop('pixel_values', None) + kwargs.pop("num_crops", None) + kwargs.pop("graphed_multimodal_buckets", None) return kwargs - def compute_input_embeddings_for_mrope_mm_optimized(self, **kwargs): + def compute_input_embeddings_for_mrope_mm_optimized( + self, warmup_mode, **kwargs): if 'inputs_embeds' in kwargs: return kwargs @@ -680,7 +688,8 @@ def compute_input_embeddings_for_mrope_mm_optimized(self, **kwargs): kwargs.pop('image_grid_thw', None) return kwargs else: - return self.compute_input_embeddings_for_mm_optimized(**kwargs) + return self.compute_input_embeddings_for_mm_optimized( + warmup_mode, **kwargs) def forward(self, *args, **kwargs): kwargs = kwargs.copy() @@ -692,9 +701,9 @@ def forward(self, *args, **kwargs): virtual_engine = kwargs.pop('virtual_engine') input_ids = kwargs['input_ids'] - global_attn_masks = kwargs.get("global_attn_masks") \ + global_attn_masks = kwargs.pop("global_attn_masks") \ if kwargs.get("global_attn_masks") else None - local_attn_masks = kwargs.get("local_attn_masks") \ + local_attn_masks = kwargs.pop("local_attn_masks") \ if kwargs.get("local_attn_masks") else None kwargs['attn_metadata'] = self._update_metadata( @@ -1396,12 +1405,8 @@ def get_model(self) -> torch.nn.Module: return self.model.model return self.model - def _use_graphs(self, img_args=None): - if not img_args: - return not self.enforce_eager - #TODO: We might need to check both language bucket and multimodal bucket - # and return True only it's avialble, or return separately. 
- return (img_args) in self.graphed_multimodal_buckets + def _use_graphs(self): + return not self.enforce_eager def _is_valid_bucket(self, bucket): return bucket[0] * bucket[1] <= self.max_num_batched_tokens @@ -2667,7 +2672,7 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: def create_dummy_multi_modal_seq_group_metadata(self, group_id, img_args, sampling_params, - lora_request): + lora_request, seq_len): assert self.model_is_mrope or self.is_mm_optimized, \ ("Warmup compatible with Qwen2vl/Gemma3 models") if img_args == UNSET_IMG_ARGS: @@ -2712,7 +2717,9 @@ def create_dummy_multi_modal_seq_group_metadata(self, group_id, img_args, } image_token_id = self.get_model().config.image_token_id - prompt_token_ids = [image_token_id] * num_image_tokens + prompt_token_ids_image = [image_token_id] * num_image_tokens + prompt_token_ids = [0] * ( + seq_len - len(prompt_token_ids_image)) + prompt_token_ids_image prompt_token_ids_array = array('l', prompt_token_ids) # noqa: F821 placeholders_by_modality = { 'image': @@ -2756,6 +2763,7 @@ def create_dummy_seq_group_metadata(self, img_args=img_args, sampling_params=sampling_params, lora_request=lora_request, + seq_len=seq_len, ) else: input_len = seq_len @@ -2867,7 +2875,7 @@ def warmup_scenario(self, align_worker=False, is_dummy_run=False) -> None: phase = 'prompt' if is_prompt else 'decode' - use_graphs = is_dummy_run or self._use_graphs(img_args) + use_graphs = is_dummy_run or self._use_graphs() scenario_name = ("warmup_" f"{phase}_" @@ -3664,8 +3672,7 @@ def execute_model( if not warmup_mode: ctx_blocks = seq_len seq_len = 1 - img_args = self._get_img_args_from_model_input(model_input) - use_graphs = self._use_graphs(img_args=img_args) + use_graphs = self._use_graphs() self._check_config(batch_size, seq_len, ctx_blocks, attn_metadata, warmup_mode) lora_mask: torch.Tensor = None @@ -3831,6 +3838,7 @@ def try_revert_dummy_output_tokens(): # hpu graphs, hence turning it to a list execute_model_kwargs = \ self.model.compute_input_embeddings_for_mrope_mm_optimized( + warmup_mode, **execute_model_kwargs ) if warmup_mode and bypass_model_exec: From ee517a2fcef8ecd72c307abf2b528bc6010a5a8d Mon Sep 17 00:00:00 2001 From: Artur Fierka Date: Mon, 22 Sep 2025 14:42:44 +0200 Subject: [PATCH 02/20] Update common.txt (#1956) Add missing modelscope package - `VLLM_USE_MODELSCOPE` env doesn't work without it. --- requirements/common.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/common.txt b/requirements/common.txt index a6a1ffe76196..be513f444a6e 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -48,3 +48,4 @@ opentelemetry-sdk>=1.26.0 # vllm.tracing opentelemetry-api>=1.26.0 # vllm.tracing opentelemetry-exporter-otlp>=1.26.0 # vllm.tracing opentelemetry-semantic-conventions-ai>=0.4.1 # vllm.tracing +modelscope # required to support VLLM_USE_MODELSCOPE env From bb96123af496a8a197b0779a240076f7859f41a7 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Mon, 22 Sep 2025 14:00:13 -0700 Subject: [PATCH 03/20] Merge Libint/intervl_bucket (#1965) ## Essential Elements of an Effective PR Description Checklist - [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)". - [ ] The test plan, such as providing test command. 
- [ ] The test results, such as pasting the results comparison before and after, or e2e results ## Purpose ## Test Plan ## Test Result --------- Signed-off-by: Libin Tang Co-authored-by: Libin Tang Co-authored-by: Libin Tang --- vllm/model_executor/models/gemma3_mm.py | 3 +- vllm/model_executor/models/internvl.py | 91 ++++++++++++++++---- vllm/worker/hpu_model_runner.py | 107 ++++++++++++++++-------- 3 files changed, 148 insertions(+), 53 deletions(-) diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 578e49e7fd78..24fb4147e680 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -577,7 +577,8 @@ def _process_image_input( for i in batch_breakdown: end_idx = start_idx + i - indices = torch.arange(start_idx, end_idx) + indices = torch.arange(start_idx, + end_idx).to(pixel_values.device) batch_sliced_pixel_values = torch.index_select(pixel_values, dim=0, index=indices) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 0c61369c5f51..c2acb9f69ef0 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -7,6 +7,7 @@ # Copyright (c) 2023 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- +import os from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence from typing import Any, Literal, Optional, TypedDict, TypeVar, Union @@ -35,13 +36,15 @@ BaseProcessingInfo, PromptReplacement, PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.transformers_utils.tokenizer import AnyTokenizer from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) -from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - maybe_prefix, merge_multimodal_embeddings) +from .utils import (AutoWeightsLoader, flatten_bn, greedy_plan, + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) IMG_START = '' IMG_END = '' @@ -50,6 +53,9 @@ IMAGENET_MEAN = (0.485, 0.456, 0.406) IMAGENET_STD = (0.229, 0.224, 0.225) +is_hpu = current_platform.is_hpu() +is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '0') == '1' if is_hpu else False + class InternVLImagePixelInputs(TypedDict): type: Literal["pixel_values"] @@ -1062,6 +1068,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.visual_token_mask = None self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) + if is_hpu: + self.graphed_multimodal_buckets = None def _patch_quant_config(self, config: PretrainedConfig, quant_config: QuantizationConfig): @@ -1127,16 +1135,64 @@ def pixel_shuffle(self, x, scale_factor=0.5): return x def extract_feature(self, pixel_values: torch.Tensor) -> torch.Tensor: - vit_embeds = self.vision_model(pixel_values=pixel_values) - vit_embeds = vit_embeds[:, 1:, :] - - h = w = int(vit_embeds.shape[1]**0.5) - vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) - vit_embeds = self.pixel_shuffle(vit_embeds, - scale_factor=self.downsample_ratio) - vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, - vit_embeds.shape[-1]) - vit_embeds = self.mlp1(vit_embeds) + if is_hpu: + if self.vision_buckets.multimodal_buckets: + batch_breakdown = greedy_plan(pixel_values.shape[0], \ + 
self.vision_buckets.multimodal_buckets) + else: + batch_breakdown = [pixel_values.shape[0]] + + start_idx = 0 + vit_embeds_minibatches = [] + + for i in batch_breakdown: + end_idx = start_idx + i + batch_sliced_pixel_values = \ + pixel_values[start_idx:end_idx, ...] + if is_lazy: + vit_embeds_minibatch = \ + self.vision_model( + pixel_values=batch_sliced_pixel_values, + bypass_hpu_graphs=i + not in self.graphed_multimodal_buckets + and len(self.graphed_multimodal_buckets) > 0) + else: + vit_embeds_minibatch = \ + self.vision_model( + pixel_values=batch_sliced_pixel_values) + + vit_embeds_minibatch = vit_embeds_minibatch[:, 1:, :] + + h = w = int(vit_embeds_minibatch.shape[1]**0.5) + vit_embeds_minibatch = vit_embeds_minibatch.reshape( + vit_embeds_minibatch.shape[0], h, w, -1) + vit_embeds_minibatch = self.pixel_shuffle( + vit_embeds_minibatch, scale_factor=self.downsample_ratio) + vit_embeds_minibatch = vit_embeds_minibatch.reshape( + vit_embeds_minibatch.shape[0], -1, + vit_embeds_minibatch.shape[-1]) + + if is_lazy: + vit_embeds_minibatches += [ + self.mlp1(vit_embeds_minibatch, + bypass_hpu_graphs=i + not in self.graphed_multimodal_buckets + and len(self.graphed_multimodal_buckets) > 0) + ] + else: + vit_embeds_minibatches += [self.mlp1(vit_embeds_minibatch)] + start_idx = end_idx + vit_embeds = torch.cat(vit_embeds_minibatches, dim=0) + else: + vit_embeds = self.vision_model(pixel_values=pixel_values) + vit_embeds = vit_embeds[:, 1:, :] + h = w = int(vit_embeds.shape[1]**0.5) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) + vit_embeds = self.pixel_shuffle(vit_embeds, + scale_factor=self.downsample_ratio) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, + vit_embeds.shape[-1]) + vit_embeds = self.mlp1(vit_embeds) return vit_embeds def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: @@ -1180,8 +1236,11 @@ def _parse_and_validate_image_input( image_token_id = kwargs["image_token_id"] assert isinstance(image_token_id, torch.Tensor) - self.img_context_token_id = image_token_id.flatten().unique().item() - + if is_hpu: + self.img_context_token_id = image_token_id.flatten() + else: + self.img_context_token_id = image_token_id.flatten().unique().item( + ) if pixel_values_flat is not None: if not isinstance(pixel_values_flat, (torch.Tensor, list)): raise ValueError("Incorrect type of pixel values. 
" @@ -1306,7 +1365,9 @@ def get_language_model(self) -> torch.nn.Module: def get_multimodal_embeddings( self, **kwargs: object) -> Optional[MultiModalEmbeddings]: - + if is_hpu: + self.graphed_multimodal_buckets = kwargs.pop( + 'graphed_multimodal_buckets', []) modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: return None diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index f400db0d7c07..b49c61bfe2a4 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -112,16 +112,22 @@ class VisionBuckets: This class is used to bucket image tokens ''' - def __init__(self, is_batch_based): - self.is_batch_based = is_batch_based + def __init__(self, model): + self.is_batch_based = True envvar = os.environ.get('VLLM_MULTIMODAL_BUCKETS', "") if envvar == 'None': self.multimodal_buckets = None else: if envvar == "": - if is_batch_based: + if 'InternVLChatModel' in str(type(model)): + multimodal_buckets = list( + range(model.config.min_dynamic_patch, + model.config.max_dynamic_patch + + 2)) #As use_thumbnail is true + elif 'Gemma3ForConditionalGeneration' in str(type(model)): multimodal_buckets = [1, 2, 4, 8] # batch sizes for gemma3 else: + self.is_batch_based = False multimodal_buckets = [ 1600, 3136, 4096, 6400, 7744, 9216, 12544 ] @@ -159,9 +165,11 @@ def __call__(cls, *args, **kwargs): def is_mm_optimized(model): - return 'Gemma3ForConditionalGeneration' in str(type(model.model)) \ - if hasattr(model, 'model') else \ - 'Gemma3ForConditionalGeneration' in str(type(model)) + mm_models = ['Gemma3ForConditionalGeneration', 'InternVLChatModel'] + + return any(m in str(type(model.model)) for m in mm_models) \ + if hasattr(model, 'model') \ + else any(m in str(type(model)) for m in mm_models) def pad_flat_tensor(tensor, desired_size): @@ -345,6 +353,7 @@ def __init__(self, model, vllm_config, is_causal, sampler): model_config = getattr(self.model, "config", None) self.model_is_mrope = uses_mrope(model_config) + self.is_mm_optimized = is_mm_optimized(self.model) text_config = vllm_config.model_config.hf_config.get_text_config() self.interleaved_sliding_window = getattr( @@ -379,6 +388,12 @@ def __init__(self, model, vllm_config, is_causal, sampler): htorch.hpu.wrap_in_hpu_graph( \ self.model.multi_modal_projector, \ disable_tensor_cache=True) + if hasattr(self.model, 'vision_model'): + self.model.vision_model = htorch.hpu.wrap_in_hpu_graph( + self.model.vision_model, disable_tensor_cache=True) + if hasattr(self.model, 'mlp1'): + self.model.mlp1 = htorch.hpu.wrap_in_hpu_graph( + self.model.mlp1, disable_tensor_cache=True) self._rotary_embed_module = self._get_rotary_embedding_module( self.model) @@ -624,7 +639,6 @@ def compute_input_embeddings_for_mm_optimized(self, warmup_mode, **kwargs): vision_embeddings = self.model.get_multimodal_embeddings(**kwargs) inputs_embeds = self.model.get_input_embeddings( input_ids, vision_embeddings) - # TODO: In warmup, we need to warmup the model with dummy image data for # multimodal model for prompt, here instead of generating a dummy image, # we are just generating attn_mask for the images and pass with @@ -632,18 +646,23 @@ def compute_input_embeddings_for_mm_optimized(self, warmup_mode, **kwargs): # the whole vision tower. 
if vision_embeddings is not None or ( warmup_mode and kwargs['attn_metadata'].is_prompt): - input_ids = kwargs['input_ids'] - positions = kwargs['positions'] - kwargs = self.model.prepare_attn_masks( - mask_dtype=self.dtype, - **kwargs, - ) - kwargs['input_ids'] = input_ids - kwargs['positions'] = positions + if hasattr(self.model, 'prepare_attn_masks'): + input_ids = kwargs['input_ids'] + positions = kwargs['positions'] + kwargs = self.model.prepare_attn_masks( + mask_dtype=self.dtype, + **kwargs, + ) + kwargs['input_ids'] = input_ids + kwargs['positions'] = positions + # done compute the visual tokens + kwargs.pop('pixel_values', None) + else: + kwargs.pop('pixel_values_flat', None) + kwargs.pop("image_num_patches", None) + kwargs.pop("image_token_id", None) kwargs.update({'inputs_embeds': inputs_embeds}) - # done compute the visual tokens and others - kwargs.pop('pixel_values', None) kwargs.pop("num_crops", None) kwargs.pop("graphed_multimodal_buckets", None) return kwargs @@ -699,7 +718,6 @@ def forward(self, *args, **kwargs): virtual_engine = 0 if 'virtual_engine' in kwargs: virtual_engine = kwargs.pop('virtual_engine') - input_ids = kwargs['input_ids'] global_attn_masks = kwargs.pop("global_attn_masks") \ if kwargs.get("global_attn_masks") else None @@ -1080,6 +1098,8 @@ def __init__( and not self.lora_config) self.use_delayed_sampling = get_config( ).use_delayed_sampling and can_use_delayed_sampling + self.mm_tokens_per_image = 1 + self.image_token_id = 0 def _set_gc_threshold(self) -> None: """ @@ -1497,10 +1517,16 @@ def move_to_device(self, tensor): non_blocking=True) def add_vision_buckets_to_mrope_mm_optimized(self): - model = self.get_model() - self.is_mm_optimized = is_mm_optimized(model) + self.is_mm_optimized = is_mm_optimized(self.model) if self.model_is_mrope or self.is_mm_optimized: - model.vision_buckets = VisionBuckets(self.is_mm_optimized) + if hasattr(self.model.model.config, 'mm_tokens_per_image'): + self.mm_tokens_per_image = \ + self.model.model.config.mm_tokens_per_image + self.image_token_id = self.model.model.config.image_token_id + elif 'InternVLChatModel' in str(type(self.model.model)): + self.image_token_id = 151667 + self.mm_tokens_per_image = self.model.model.num_image_token + self.model.model.vision_buckets = VisionBuckets(self.model.model) def _prepare_prompt( self, @@ -1631,7 +1657,6 @@ def _prepare_prompt( for idx in range(3): seq_data_mrope_positions[idx] \ .extend(mrope_positions[idx]) - multi_modal_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): @@ -2709,17 +2734,28 @@ def create_dummy_multi_modal_seq_group_metadata(self, group_id, img_args, else: s = self.model.model.config.vision_config.image_size pixel_values = torch.randn([img_args, 3, s, s]) - num_image_tokens = self.model.model.config.mm_tokens_per_image \ - * img_args - multi_modal_data = { - "pixel_values": pixel_values, - "num_crops": torch.zeros([img_args], dtype=torch.int32) - } - image_token_id = self.get_model().config.image_token_id - prompt_token_ids_image = [image_token_id] * num_image_tokens + if 'Gemma3ForConditionalGeneration' in str(type(self.model.model)): + multi_modal_data = { + "pixel_values": pixel_values, + "num_crops": torch.zeros([img_args], dtype=torch.int32), + } + elif 'InternVLChatModel' in str(type(self.model.model)): + multi_modal_data = { + "pixel_values_flat": + pixel_values.to(torch.bfloat16), + "image_num_patches": + torch.tensor([pixel_values.shape[0]], dtype=torch.int32), + "image_token_id": + 
torch.tensor([self.image_token_id], dtype=torch.int64), + } + else: + logger.warning("No support for other models yet") + num_image_tokens = self.mm_tokens_per_image * img_args + prompt_token_ids_image = [self.image_token_id] * num_image_tokens prompt_token_ids = [0] * ( seq_len - len(prompt_token_ids_image)) + prompt_token_ids_image + prompt_token_ids_array = array('l', prompt_token_ids) # noqa: F821 placeholders_by_modality = { 'image': @@ -3188,9 +3224,7 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: if graphs: self.graphed_buckets.add(cfg) if self.is_mm_run(): - img_args = (int(seq_len) // - self.model.model.config.mm_tokens_per_image - if self.is_mm_optimized else int(seq_len)) + img_args = int(seq_len) // self.mm_tokens_per_image self.warmup_scenario( int(bs), int(seq_len), @@ -3539,7 +3573,7 @@ def _get_seq_ids(self, model_input): def _get_img_args_from_model_input(self, model_input): if (not self.model_is_mrope and not self.is_mm_optimized) or \ not model_input.multi_modal_kwargs or \ - 'pixel_values' not in model_input.multi_modal_kwargs: + ('pixel_values') not in model_input.multi_modal_kwargs: return None if self.model_is_mrope: pixel_values_list = model_input.multi_modal_kwargs['pixel_values'] @@ -3816,18 +3850,17 @@ def try_revert_dummy_output_tokens(): 'real_seq_len': model_input.seq_lens, 'real_batch_size': real_batch_size } - #Need to set the window_slide mask at this point to decide if is_prompt: attn_metadata = self.model._update_use_window_sdpa( execute_model_kwargs['attn_metadata'], seq_len, bool(model_input.multi_modal_kwargs and \ - 'pixel_values' in model_input.multi_modal_kwargs)) + ('pixel_values')in model_input.multi_modal_kwargs)) execute_model_kwargs['attn_metadata'] = attn_metadata if not bypass_model_exec: if self.model_is_mrope or self.is_mm_optimized: - if 'pixel_values' in execute_model_kwargs and \ + if ('pixel_values') in execute_model_kwargs and \ self.is_mm_optimized: if warmup_mode and not is_pt_profiler_run: bypass_model_exec = True From 716f3fc2f8368c0ce2d2f03baf68b64190a8635e Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Mon, 22 Sep 2025 15:17:17 -0700 Subject: [PATCH 04/20] Introduce VLLM_WARMUP_WITH_PENALTY for internVL warmup (#1967) Introduce VLLM_WARMUP_WITH_PENALTY to call apply penalty code in sampler during warmup also. https://github.com/HabanaAI/vllm-fork/blob/libint/intervl_bucket/vllm/model_executor/layers/sampler.py#L280 is not called during warmup. And it causes extra graph compile during runtime as it sets to True for real run. 
--- vllm/worker/hpu_model_runner.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index b49c61bfe2a4..eeadd4a0f963 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -2784,11 +2784,15 @@ def create_dummy_seq_group_metadata(self, lora_request=None, img_args=None, temperature=0, + presence_penalty=0.0, + top_p=1.0, ctx=0): if self.is_pooler: sampling_params = None else: - sampling_params = SamplingParams(temperature=temperature) + sampling_params = SamplingParams(temperature=temperature, + presence_penalty=presence_penalty, + top_p=top_p) num_blocks = math.ceil(seq_len / self.block_size) seq_len = max(seq_len, 1) computed_block_nums = None @@ -2945,6 +2949,12 @@ def warmup_scenario(self, ] self.profiler.start('internal', scenario_name) times = num_iters if use_graphs or is_pt_profiler_run else 1 + presence_penalty = 1.0 if os.getenv('VLLM_WARMUP_WITH_PENALTY', + '0') == '1' else 0.0 + top_p = 0.1 if os.getenv('VLLM_WARMUP_WITH_PENALTY', + '0') == '1' else 1.0 + temperature = 1.0 if os.getenv('VLLM_WARMUP_WITH_PENALTY', + '0') == '1' else 0.0 if is_prompt: seqs = [ self.create_dummy_seq_group_metadata( @@ -2955,6 +2965,8 @@ def warmup_scenario(self, if dummy_lora_requests_per_seq else None, img_args=img_args, temperature=temperature, + presence_penalty=presence_penalty, + top_p=top_p, ctx=ctx) for i in range(batch_size) ] else: @@ -2968,6 +2980,8 @@ def warmup_scenario(self, lora_request=dummy_lora_requests_per_seq[i] if dummy_lora_requests_per_seq else None, temperature=temperature, + presence_penalty=presence_penalty, + top_p=top_p, ctx=ctx) for i, b in enumerate(blocks) ] if not is_dummy_run: From 30c226ecfa1b4a319b81edb22eca62fe6a35ae97 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Mon, 22 Sep 2025 15:25:18 -0700 Subject: [PATCH 05/20] Modify merge_multimodal_embeddings to static (#1969) --- vllm/model_executor/models/internvl.py | 18 +++++++++++++++++- vllm/model_executor/models/utils.py | 16 ++++++++++++++++ vllm/worker/hpu_model_runner.py | 24 ++++++++++++++++++++++-- 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index c2acb9f69ef0..92fe14de128f 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -44,7 +44,8 @@ SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, flatten_bn, greedy_plan, init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + merge_multimodal_embeddings, + merge_multimodal_embeddings_static) IMG_START = '' IMG_END = '' @@ -1390,6 +1391,21 @@ def get_multimodal_embeddings( return multimodal_embeddings + def get_input_embeddings_hpu( + self, + input_ids: torch.Tensor, + image_index_tensor: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings_static( + image_index_tensor, + inputs_embeds, + multimodal_embeddings, + ) + return inputs_embeds + def get_input_embeddings( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 96d529254b7a..822113b52a8a 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -433,6 +433,22 @@ def 
merge_multimodal_embeddings_from_map( return inputs_embeds +def merge_multimodal_embeddings_static( + is_multimodal_index: torch.Tensor, + inputs_embeds: torch.Tensor, + multimodal_embeddings: NestedTensors, +) -> torch.Tensor: + assert current_platform.is_hpu(), ("Support HPU only") + flattened = _flatten_embeddings(multimodal_embeddings) + + inputs_embeds_s = inputs_embeds.shape + inputs_embeds = inputs_embeds.view(inputs_embeds_s[0] * inputs_embeds_s[1], + inputs_embeds_s[2]) + inputs_embeds = inputs_embeds.index_copy_(0, is_multimodal_index, + flattened).view(inputs_embeds_s) + return inputs_embeds + + def _merge_multimodal_embeddings( inputs_embeds: torch.Tensor, is_multimodal: torch.Tensor, diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index eeadd4a0f963..e514469a942c 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -637,8 +637,14 @@ def _update_metadata(self, def compute_input_embeddings_for_mm_optimized(self, warmup_mode, **kwargs): input_ids = kwargs['input_ids'] vision_embeddings = self.model.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.model.get_input_embeddings( - input_ids, vision_embeddings) + if 'image_index' in kwargs: + inputs_embeds = self.model.get_input_embeddings_hpu( + input_ids, kwargs['image_index'], vision_embeddings) + kwargs.pop("image_index", None) + else: + inputs_embeds = self.model.get_input_embeddings( + input_ids, vision_embeddings) + # TODO: In warmup, we need to warmup the model with dummy image data for # multimodal model for prompt, here instead of generating a dummy image, # we are just generating attn_mask for the images and pass with @@ -1772,6 +1778,7 @@ def _prepare_prompt( pad=0, dtype=torch.long, flat=self.use_merged_prefill) + image_index_tensor = None if self.model_is_mrope: input_positions = \ make_mrope_positions_tensor_with_pad(input_positions=input_positions, @@ -1785,6 +1792,11 @@ def _prepare_prompt( dtype=torch.long, flat=self.use_merged_prefill) + if seq_group_metadata.multi_modal_data and self.is_mm_optimized and \ + 'InternVLChatModel' in str(type(self.model.model)): + is_image_flatten = ( + input_tokens_tensor == self.image_token_id).flatten() + image_index_tensor = is_image_flatten.nonzero().squeeze(-1) slot_mapping = make_cpu_tensor(slot_mapping, max_len=max_prompt_len, pad=_PAD_SLOT_ID, @@ -1872,6 +1884,8 @@ def _prepare_prompt( input_positions=input_positions, ) multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) + if image_index_tensor is not None: + multi_modal_kwargs['image_index'] = image_index_tensor multi_modal_kwargs = MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device) @@ -3872,6 +3886,12 @@ def try_revert_dummy_output_tokens(): ('pixel_values')in model_input.multi_modal_kwargs)) execute_model_kwargs['attn_metadata'] = attn_metadata + if 'image_index' in model_input.multi_modal_kwargs: + execute_model_kwargs[ + 'image_index'] = model_input.multi_modal_kwargs[ + 'image_index'] + model_input.multi_modal_kwargs.pop('image_index', None) + if not bypass_model_exec: if self.model_is_mrope or self.is_mm_optimized: if ('pixel_values') in execute_model_kwargs and \ From dacac74f7e17f4e4314030571c92ad159b26b4a4 Mon Sep 17 00:00:00 2001 From: Seunghyuk Park Date: Tue, 23 Sep 2025 21:05:17 +0300 Subject: [PATCH 06/20] Add Daniel's mediapipe changes --- vllm/entrypoints/chat_utils.py | 10 +- vllm/model_executor/models/internvl.py | 322 +++++++++++++++++++++++-- vllm/multimodal/image.py | 3 + vllm/multimodal/parse.py | 5 
+ vllm/multimodal/utils.py | 54 ++++- vllm/worker/hpu_model_runner.py | 12 +- 6 files changed, 372 insertions(+), 34 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index f5f45a62ca2f..9a0525c7933a 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -405,8 +405,14 @@ def _resolve_chat_template_content_format( jinja_text = (hf_chat_template if isinstance(hf_chat_template, str) else load_chat_template(chat_template, is_literal=True)) - detected_format = ("string" if jinja_text is None else - _detect_content_format(jinja_text, default="string")) + # The InternVL template has mixed content access patterns that fail with automatic detection. + # Set string format for proper operation if InternVL is used. + model_type = getattr(model_config.hf_config, 'model_type', '') + if model_type == 'internvl_chat' or 'internvl' in model_config.model.lower(): + detected_format = "string" + else: + detected_format = ("string" if jinja_text is None else + _detect_content_format(jinja_text, default="string")) return detected_format diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 92fe14de128f..e4b01fd86b5e 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -19,6 +19,19 @@ from PIL import Image from transformers import BatchEncoding, PretrainedConfig, TensorType +from habana_frameworks.mediapipe import fn +from habana_frameworks.mediapipe.mediapipe import MediaPipe +from habana_frameworks.mediapipe.media_types import dtype as dt +from habana_frameworks.mediapipe.media_types import imgtype as it +from habana_frameworks.mediapipe.media_types import readerOutType as ro +from habana_frameworks.mediapipe.operators.reader_nodes.reader_nodes import media_ext_reader_op_impl +from habana_frameworks.mediapipe.operators.reader_nodes.reader_nodes import media_ext_reader_op_tensor_info +from habana_frameworks.mediapipe.plugins.iterator_pytorch import MediaGenericPytorchIterator +import numpy as np +from queue import Queue +import io +import time + from vllm.config import VllmConfig from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig @@ -295,6 +308,244 @@ def video_to_pixel_values_internvl( pixel_values = torch.stack([transform(image) for image in frames_list]) return pixel_values +# Handle MediaPipe pipe_manager destructor +from habana_frameworks.mediapipe.backend.cal import pipe_manager, cpp_pipe_manager_list + +def _patched_close(self): + """Patched close method that handles None cpp_pipe_manager_list during shutdown""" + try: + # Check if cpp_pipe_manager_list exists and is not None + if cpp_pipe_manager_list is not None and self._pm_ in cpp_pipe_manager_list: + cpp_pipe_manager_list.remove(self._pm_) + except (TypeError, AttributeError): + # Handle case where cpp_pipe_manager_list is None or not iterable + pass + + # Clean up the pipe manager + if self._pm_ is not None: + self._pm_.close() + self._pm_ = None + +pipe_manager.close = _patched_close + +# Queue shared between external reader and mediapipe call +shared_q = Queue() + + +class MediaPytorchIterator(MediaGenericPytorchIterator): + def __init__(self, mediapipe): + super().__init__(mediapipe=mediapipe, device="hpu", fw_type="PYT_FW") + + +class external_reader(media_ext_reader_op_impl): + def __init__(self, params, fw_params): + self.batch_size = fw_params.batch_size + self.max_file = "" + self.num_batches = 1 
+ + def __iter__(self): + return self + + def __len__(self): + return self.num_batches + + def __next__(self): + img_list = shared_q.get() + for i in range(len(img_list)): + # NOTE: this padding is needed because of HW alignmnet requirment + img_list[i] = np.pad(img_list[i], + (0, 64 - len(img_list[i]) % 64), + 'constant') + return img_list + + def get_media_output_type(self): + return ro.BUFFER_LIST + + def get_largest_file(self): + return self.max_file + + def gen_output_info(self): + out_info = [] + o = media_ext_reader_op_tensor_info( + dt.NDT, np.array([self.batch_size], dtype=np.uint32), "") + out_info.append(o) + return out_info + + +class hpuMediaPipe(MediaPipe): + def __init__(self, device, queue_depth, batch_size, + num_threads, op_device, + img_height, img_width): + super( + hpuMediaPipe, + self).__init__( + device, + queue_depth, + batch_size, + num_threads, + self.__class__.__name__) + + mediapipe_seed = int(time.time_ns() % (2**31 - 1)) + + self.input = fn.MediaExtReaderOp(impl=external_reader, + num_outputs=1, + seed=mediapipe_seed, + device=op_device) + self.decode = fn.ImageDecoder( + device="hpu", output_format=it.RGB_I, resize=[img_width, img_height]) + + self.mean_node = fn.MediaConst( + data=np.array([127.5, 127.5, 127.5], dtype=dt.FLOAT32), + shape=[1, 1, 3], + dtype=dt.FLOAT32 + ) + self.std_node = fn.MediaConst( + data=np.array([1/127.5, 1/127.5, 1/127.5], dtype=dt.FLOAT32), + shape=[1, 1, 3], + dtype=dt.FLOAT32 + ) + + self.cmn = fn.CropMirrorNorm(crop_w=img_width, crop_h=img_height, dtype=dt.FLOAT32, device="hpu") + + self.transpose = fn.Transpose( + device="hpu", + tensorDim=4, + permutation=[1, 2, 0, 3] #NCHW + ) + + def definegraph(self): + images = self.input() + images = self.decode(images) + mean = self.mean_node() + std = self.std_node() + images = self.cmn(images, mean, std) + images = self.transpose(images) + + # Return the full processed image - we'll do tiling in Python + return images + +def get_image_info(data): + # Get image info using PIL without decoding + try: + with Image.open(io.BytesIO(data)) as img: + return { + 'format': img.format, + 'size': img.size, + 'mode': img.mode + } + except Exception as e: + raise ValueError(f"Input image bitstream is not in supported format: {str(e)}") + +def preprocess_images( + images, + target_ratios: list[tuple[int, int]], + patch_size=448, + use_thumbnail=False, +): + batch_size = 0 + queue_depth = 0 + num_threads = 1 + + # validate images and create batches + img_size = None + img_sizes = [] + batch_sizes = [] + for img in images: + img_info = get_image_info(img) + if img_info['format'] != 'JPEG' and img_info['mode'] != 'RGB': + raise ValueError(f"HPU media pipeline only supports JPEG images in RGB mode. 
Detected format={img_info['format']}, mode={img_info['mode']}") + if img_size==None: + img_size = img_info['size'] + else: + if img_info['size'] != img_size: + img_sizes.append(img_size) + batch_sizes.append(batch_size) + batch_size = 0 + img_size = img_info['size'] + batch_size += 1 + img_sizes.append(img_size) + batch_sizes.append(batch_size) + + thumbs = None + if use_thumbnail and len(images) > 0: + batch_size = len(images) + pipe = hpuMediaPipe("legacy", queue_depth, batch_size, + num_threads, "cpu", + patch_size, patch_size) + pipe.build() + data_loader = MediaPytorchIterator(pipe) + data_loader = iter(data_loader) + + img_list = np.empty(shape=[batch_size, ], dtype=object) + for i in range(batch_size): + img_list[i] = np.frombuffer(images[i], np.uint8) + + shared_q.put(img_list) + thumbs = next(data_loader)[0] + + shared_q.task_done() + pipe.close() + del pipe + + image_num_patches = torch.zeros(len(images), dtype=torch.int64) + + patches = [] + thumb_idx = 0 + image_num_patches_idx = 0 + for batch_size, img_size in zip(batch_sizes, img_sizes): + # calculate the number of blocks without thumbnail + blocks, target_width, target_height = calculate_internvl_targets( + orig_width=img_size[0], + orig_height=img_size[1], + target_ratios=target_ratios, + image_size=patch_size, + use_thumbnail=False, + ) + + num_patches = blocks + 1 if use_thumbnail and thumbs is not None and blocks > 1 else blocks + image_num_patches[image_num_patches_idx:image_num_patches_idx+batch_size] = num_patches + + pipe = hpuMediaPipe("legacy", queue_depth, batch_size, + num_threads, "cpu", + target_height, target_width) + pipe.build() + data_loader = MediaPytorchIterator(pipe) + data_loader = iter(data_loader) + + img_list = np.empty(shape=[batch_size, ], dtype=object) + for i in range(batch_size): + img_list[i] = np.frombuffer(images[i], np.uint8) + + shared_q.put(img_list) + processed_images = next(data_loader)[0] + + shared_q.task_done() + pipe.close() + del pipe + + # Extract tiles + tiles = [] + H, W = target_height, target_width + for h_idx in range(H // patch_size): + for w_idx in range(W // patch_size): + h_start = h_idx * patch_size + h_end = h_start + patch_size + w_start = w_idx * patch_size + w_end = w_start + patch_size + + tile = processed_images[:, :, h_start:h_end, w_start:w_end] + tiles.append(tile) + + for i in range(batch_size): + for t in tiles: + patches.append(t[i]) + if use_thumbnail and thumbs is not None and len(tiles) > 1: + patches.append(thumbs[thumb_idx]) + thumb_idx += 1 + + patches_flat = torch.stack(patches, dim=0) + return patches_flat, image_num_patches + class BaseInternVLProcessor(ABC): """ @@ -451,25 +702,58 @@ def _preprocess_image( if len(images) == 0: image_inputs = {} else: - pixel_values_lst = self._images_to_pixel_values_lst( - images, - min_dynamic_patch=min_dynamic_patch, - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, - ) - image_inputs: dict[str, NestedTensors] = { - "pixel_values_flat": - torch.cat(pixel_values_lst), - "image_num_patches": - torch.tensor([len(item) for item in pixel_values_lst]), - } + use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", "false").lower() in ("1", "true", "yes") + if use_mediapipe: + # Use HPU media pipeline for image preprocessing + min_num, max_num = self.resolve_min_max_num( + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=False, # Applied in image_to_pixel_values + ) - for pixel_values in pixel_values_lst: - 
num_patches = pixel_values.shape[0] - feature_size = num_patches * self.num_image_token + target_ratios = get_internvl_target_ratios(min_num, max_num) + + pixel_values_flat, image_num_patches = preprocess_images( + images, + target_ratios=target_ratios, + patch_size=self.image_size, + use_thumbnail=self.use_thumbnail, + ) + + image_inputs = { + "pixel_values_flat": pixel_values_flat, + "image_num_patches": image_num_patches, + } + + for i in range(len(images)): + num_patches = image_num_patches[i].item() + feature_size = num_patches * self.num_image_token + + image_repl = self.get_image_repl(feature_size, num_patches) + text = [t.replace('', image_repl.full, 1) for t in text] + + else: + pixel_values_lst = self._images_to_pixel_values_lst( + images, + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + image_inputs: dict[str, NestedTensors] = { + "pixel_values_flat": + torch.cat(pixel_values_lst), + "image_num_patches": + torch.tensor([len(item) for item in pixel_values_lst]), + } + + for pixel_values in pixel_values_lst: + num_patches = pixel_values.shape[0] + feature_size = num_patches * self.num_image_token + + image_repl = self.get_image_repl(feature_size, num_patches) + text = [t.replace('', image_repl.full, 1) for t in text] - image_repl = self.get_image_repl(feature_size, num_patches) - text = [t.replace('', image_repl.full, 1) for t in text] return text, image_inputs def _make_batch_input(self, @@ -483,7 +767,7 @@ def _make_batch_input(self, def __call__( self, text: Optional[Union[str, list[str]]] = None, - images: Optional[Union[Image.Image, list[Image.Image]]] = None, + images: Optional[Union[Image.Image, list[Image.Image], bytes, list[bytes]]] = None, min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, @@ -602,7 +886,7 @@ def _preprocess_video( def __call__( self, text: Optional[Union[str, list[str]]] = None, - images: Optional[Union[Image.Image, list[Image.Image]]] = None, + images: Optional[Union[Image.Image, list[Image.Image], bytes, list[bytes]]] = None, videos: Optional[Union[npt.NDArray, list[npt.NDArray]]] = None, min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index e673632d4366..152d3ee98c01 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -57,6 +57,9 @@ def load_bytes(self, data: bytes) -> Image.Image: def load_base64(self, media_type: str, data: str) -> Image.Image: return self.load_bytes(base64.b64decode(data)) + def load_base64_bytes(self, media_type: str, data: str) -> bytes: + return base64.b64decode(data, validate=True) + def load_file(self, filepath: Path) -> Image.Image: image = Image.open(filepath) image.load() diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index cae62b2235e4..693a9a230fb3 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -7,6 +7,7 @@ from typing import (TYPE_CHECKING, Any, Generic, Literal, NamedTuple, Optional, TypeVar, Union) +import io import numpy as np import torch from typing_extensions import TypeAlias, TypeGuard, assert_never @@ -209,6 +210,9 @@ def get_image_size(self, item_idx: int) -> ImageSize: if isinstance(image, PILImage.Image): return ImageSize(*image.size) + if isinstance(image, (bytes, bytearray)): + with PILImage.open(io.BytesIO(image)) as img: + return ImageSize(*img.size) if isinstance(image, (np.ndarray, torch.Tensor)): 
_, h, w = image.shape return ImageSize(w, h) @@ -407,6 +411,7 @@ def _parse_image_data( return ImageEmbeddingItems(data) if (isinstance(data, PILImage.Image) + or isinstance(data, (bytes, bytearray)) or isinstance(data, (np.ndarray, torch.Tensor)) and data.ndim == 3): data_items = [data] diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 11a25f851546..2a5d4a0c2098 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union from urllib.parse import ParseResult, urlparse +import os import numpy as np import numpy.typing as npt import torch @@ -66,6 +67,7 @@ def _load_data_url( self, url_spec: ParseResult, media_io: MediaIO[_M], + load_type: str = "PIL", ) -> _M: data_spec, data = url_spec.path.split(",", 1) media_type, data_type = data_spec.split(";", 1) @@ -74,7 +76,10 @@ def _load_data_url( msg = "Only base64 data URLs are supported for now." raise NotImplementedError(msg) - return media_io.load_base64(media_type, data) + if load_type == "bytes": + return media_io.load_base64_bytes(media_type, data) + else: + return media_io.load_base64(media_type, data) def _load_file_url( self, @@ -100,6 +105,7 @@ def load_from_url( media_io: MediaIO[_M], *, fetch_timeout: Optional[int] = None, + load_type: str = "bytes", ) -> _M: url_spec = urlparse(url) @@ -107,13 +113,24 @@ def load_from_url( connection = self.connection data = connection.get_bytes(url, timeout=fetch_timeout) - return media_io.load_bytes(data) + if load_type == "bytes": + msg = "Only data URLs are currently supported for raw bytes loading." + raise NotImplementedError(msg) + else: + connection = self.connection + data = connection.get_bytes(url, timeout=fetch_timeout) + + return media_io.load_bytes(data) if url_spec.scheme == "data": - return self._load_data_url(url_spec, media_io) + return self._load_data_url(url_spec, media_io, load_type) if url_spec.scheme == "file": - return self._load_file_url(url_spec, media_io) + if load_type == "bytes": + msg = "Only data URLs are currently supported for raw bytes loading." + raise NotImplementedError(msg) + else: + return self._load_file_url(url_spec, media_io) msg = "The URL must be either a HTTP, data or file URL." raise ValueError(msg) @@ -124,20 +141,29 @@ async def load_from_url_async( media_io: MediaIO[_M], *, fetch_timeout: Optional[int] = None, + load_type: str = "bytes", ) -> _M: url_spec = urlparse(url) if url_spec.scheme.startswith("http"): - connection = self.connection - data = await connection.async_get_bytes(url, timeout=fetch_timeout) + if load_type == "bytes": + msg = "Only data URLs are currently supported for raw bytes loading." + raise NotImplementedError(msg) + else: + connection = self.connection + data = await connection.async_get_bytes(url, timeout=fetch_timeout) - return media_io.load_bytes(data) + return media_io.load_bytes(data) if url_spec.scheme == "data": - return self._load_data_url(url_spec, media_io) + return self._load_data_url(url_spec, media_io, load_type) if url_spec.scheme == "file": - return self._load_file_url(url_spec, media_io) + if load_type == "bytes": + msg = "Only data URLs are currently supported for raw bytes loading." + raise NotImplementedError(msg) + else: + return self._load_file_url(url_spec, media_io) msg = "The URL must be either a HTTP, data or file URL." raise ValueError(msg) @@ -179,17 +205,20 @@ def fetch_image( image_mode: str = "RGB", ) -> Image.Image: """ - Load a PIL image from a HTTP or base64 data URL. 
+ Load a PIL image (or raw bytes) from a HTTP or base64 data URL. By default, the image is converted into RGB format. """ image_io = ImageMediaIO(image_mode=image_mode) + use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", "false").lower() in ("1", "true", "yes") + load_type = "bytes" if use_mediapipe else "PIL" try: return self.load_from_url( image_url, image_io, fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, + load_type=load_type, ) except UnidentifiedImageError as e: # convert to ValueError to be properly caught upstream @@ -202,17 +231,20 @@ async def fetch_image_async( image_mode: str = "RGB", ) -> Image.Image: """ - Asynchronously load a PIL image from a HTTP or base64 data URL. + Asynchronously load a PIL image (or raw bytes) from a HTTP or base64 data URL. By default, the image is converted into RGB format. """ image_io = ImageMediaIO(image_mode=image_mode) + use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", "false").lower() in ("1", "true", "yes") + load_type = "bytes" if use_mediapipe else "PIL" try: return await self.load_from_url_async( image_url, image_io, fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, + load_type=load_type, ) except UnidentifiedImageError as e: # convert to ValueError to be properly caught upstream diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index e514469a942c..012b38d45f47 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1886,8 +1886,16 @@ def _prepare_prompt( multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) if image_index_tensor is not None: multi_modal_kwargs['image_index'] = image_index_tensor - multi_modal_kwargs = MultiModalKwargs.as_kwargs(multi_modal_kwargs, - device=self.device) + + use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", "false").lower() in ("1", "true", "yes") + if use_mediapipe: + # With mediapipe path some tensors will already be on HPU, we only move to HPU if needed + for key in multi_modal_kwargs.keys(): + if hasattr(multi_modal_kwargs[key], "device") and multi_modal_kwargs[key].device != self.device: + multi_modal_kwargs[key] = self.move_to_device(multi_modal_kwargs[key]) + else: + multi_modal_kwargs = MultiModalKwargs.as_kwargs(multi_modal_kwargs, + device=self.device) return PreparePromptMetadata(input_tokens=input_tokens_tensor, input_positions=input_positions, From 65abdfb8f8d154096af9cea4204fb951058d7d15 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Wed, 24 Sep 2025 00:40:40 +0000 Subject: [PATCH 07/20] Call compute_input_embeddings only for prompt to save decode time --- vllm/model_executor/models/internvl.py | 9 +++++---- vllm/worker/hpu_model_runner.py | 6 ++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index e4b01fd86b5e..ea723e87340b 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -1728,10 +1728,11 @@ def forward( # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. 
elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) - input_ids = None + if not is_hpu: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None forward_kwargs = { "input_ids": input_ids, diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 012b38d45f47..ac6f1eed8098 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -739,7 +739,8 @@ def forward(self, *args, **kwargs): if self._rotary_prepare_cos_sin is not None and not self.model_is_mrope: self._rotary_prepare_cos_sin( kwargs['positions'], recompute_cos_sin=self.recompute_cos_sin) - if self.model_is_mrope or self.is_mm_optimized: + if self.model_is_mrope or (self.is_mm_optimized + and kwargs['attn_metadata'].is_prompt): # inputs_embeds was computed on execute_model # now we always want to use the inputs_embeds # even if the prompt is text only @@ -3901,7 +3902,8 @@ def try_revert_dummy_output_tokens(): model_input.multi_modal_kwargs.pop('image_index', None) if not bypass_model_exec: - if self.model_is_mrope or self.is_mm_optimized: + if self.model_is_mrope or (self.is_mm_optimized + and is_prompt): if ('pixel_values') in execute_model_kwargs and \ self.is_mm_optimized: if warmup_mode and not is_pt_profiler_run: From b38c808ef9c0c147b7d9a002e7fb4d2bf6a546c5 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Wed, 24 Sep 2025 01:44:28 +0000 Subject: [PATCH 08/20] Merge PR1974 intervl:cache prompt_tokens for sampling metadata in sampling if do penalties, the prompt_tokens regenerates for each decode, that takes time. instead we can use cache it, and reset when requests set changes --- vllm/model_executor/layers/sampler.py | 15 +++++++-- vllm/model_executor/sampling_metadata.py | 42 ++++++++++++++++-------- vllm/worker/hpu_model_runner.py | 6 ++++ 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 5d18d3b59af4..cc7872f69bc0 100755 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -197,6 +197,9 @@ def __init__(self): # speculative decoding and when prompt embeddings are specified. 
self.include_gpu_probs_tensor = False self.should_modify_greedy_probs_inplace = False + # Add HPU cache class variables + self._prompt_tokens_hpu_cache: Optional[torch.Tensor] = None + self._cached_seq_ids: Optional[set] = None def _init_sampling_tensors( self, @@ -216,8 +219,10 @@ def _init_sampling_tensors( # Initialize new sampling tensors (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p, - top_k_scalar, top_p_scalar) = SamplingTensors.from_sampling_metadata( - sampling_metadata, vocab_size, logits.device, logits.dtype) + top_k_scalar, top_p_scalar, current_seq_ids) = \ + SamplingTensors.from_sampling_metadata( + sampling_metadata, vocab_size, logits.device, logits.dtype, \ + self._prompt_tokens_hpu_cache, self._cached_seq_ids) self._sampling_tensors = sampling_tensors self._do_penalties = do_penalties @@ -227,6 +232,12 @@ def _init_sampling_tensors( self._top_p_scalar = top_p_scalar self._apply_top_k_top_p_opt = ApplyToppTopkScalar(5) + # Check if batch composition changed - if so, invalidate prompt cache + + # After tensors are created, update cache + if self._cached_seq_ids != current_seq_ids: + self._prompt_tokens_hpu_cache = None + self._cached_seq_ids = current_seq_ids def forward( self, diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 5e40449d6552..08244fe0af28 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -16,6 +16,8 @@ make_tensor_with_pad_align) _SAMPLING_EPS = 1e-5 +pin_memory = is_pin_memory_available() +is_hpu = current_platform.is_hpu() @dataclass @@ -286,7 +288,7 @@ def _prepare_seq_groups( if seq_group_metadata.is_prompt: if sampling_params.seed is not None: - if current_platform.is_hpu(): + if is_hpu: import habana_frameworks.torch.hpu.random as htrandom generator = \ htrandom.default_generators[ @@ -420,8 +422,10 @@ def from_sampling_metadata( vocab_size: int, device: torch.device, dtype: torch.dtype, + prompt_tokens_cache: torch.tensor, + past_seq_ids: set, ) -> tuple["SamplingTensors", bool, bool, bool, Optional[int], - Optional[float]]: + Optional[float], Optional[torch.tensor]]: prompt_tokens: list[array] = [] output_tokens: list[array] = [] top_ks: list[int] = [] @@ -434,6 +438,7 @@ def from_sampling_metadata( do_penalties = False do_top_p_top_k = False do_min_p = False + current_seq_ids = set() assert sampling_metadata.seq_groups is not None for seq_group in sampling_metadata.seq_groups: @@ -508,6 +513,9 @@ def from_sampling_metadata( seq_data = seq_group.seq_data[seq_id] prompt_tokens.append(seq_data.prompt_token_ids_array) output_tokens.append(seq_data.output_token_ids_array) + current_seq_ids.update(seq_ids) + if current_seq_ids != past_seq_ids: + prompt_tokens_cache = None top_k_scalar = top_ks[0] if do_top_p_top_k and all( k == top_ks[0] for k in top_ks) else None @@ -527,9 +535,10 @@ def from_sampling_metadata( vocab_size, device, dtype, + prompt_tokens_cache, ) return (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p, - top_k_scalar, top_p_scalar) + top_k_scalar, top_p_scalar, current_seq_ids) @classmethod def from_lists( @@ -546,23 +555,28 @@ def from_lists( vocab_size: int, device: torch.device, dtype: torch.dtype, + prompt_tokens_cache: torch.tensor, ) -> "SamplingTensors": # Note that the performance will be very bad without # pinned memory. 
- pin_memory = is_pin_memory_available() do_penalties = prompt_tokens or output_tokens if do_penalties: - if current_platform.is_hpu(): - prompt_t = make_tensor_with_pad_align( - prompt_tokens, - vocab_size, - device="cpu", - dtype=torch.int64, - pin_memory=pin_memory, - max_len_align=1024, - ) + if is_hpu: + if (prompt_tokens_cache is not None and + prompt_tokens_cache.device == device): + # Reuse cached prompt_tokens already on HPU + prompt_t = prompt_tokens_cache + else: + prompt_t = make_tensor_with_pad_align( + prompt_tokens, + vocab_size, + device="cpu", + dtype=torch.int64, + pin_memory=pin_memory, + max_len_align=1024, + ) output_t = make_tensor_with_pad_align( output_tokens, vocab_size, @@ -647,6 +661,6 @@ def from_lists( non_blocking=True), repetition_penalties=repetition_penalties_t.to(device=device, non_blocking=True), - prompt_tokens=prompt_t.to(device=device, non_blocking=True), + prompt_tokens=prompt_t.to(device=device, non_blocking=True) if prompt_t.device != device else prompt_t, output_tokens=output_t.to(device=device, non_blocking=True), ) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index ac6f1eed8098..dc4ce30cd912 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -4023,6 +4023,12 @@ def try_revert_dummy_output_tokens(): self.cached_step_inputs.append(model_input) if self.do_mark_step: htorch.core.mark_step() + if hasattr(self.model.sampler, '_sampling_tensors') and \ + self.model.sampler._sampling_tensors is not None: + sampling_tensors = self.model.sampler._sampling_tensors + if sampling_tensors.prompt_tokens.numel() > 0: + # Cache the prompt_tokens tensor that's already on HPU + self.model.sampler._prompt_tokens_hpu_cache = sampling_tensors.prompt_tokens if use_delayed_sampling \ and model_input.async_callback is not None: model_input.async_callback() From e684eb59cdfe3862b61d85f9eec653e90c7517e6 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Wed, 24 Sep 2025 16:31:23 +0000 Subject: [PATCH 09/20] Add check to only for do_penalities --- vllm/worker/hpu_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index dc4ce30cd912..c602d25055ec 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -4024,7 +4024,8 @@ def try_revert_dummy_output_tokens(): if self.do_mark_step: htorch.core.mark_step() if hasattr(self.model.sampler, '_sampling_tensors') and \ - self.model.sampler._sampling_tensors is not None: + self.model.sampler._sampling_tensors is not None and \ + self.model.sampler._do_penalties: sampling_tensors = self.model.sampler._sampling_tensors if sampling_tensors.prompt_tokens.numel() > 0: # Cache the prompt_tokens tensor that's already on HPU From 92e8db3583260a5f1801c2878ae9b2ba162ffca9 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Wed, 24 Sep 2025 21:35:03 +0000 Subject: [PATCH 10/20] Fix for merge_multimodal_embeddedings() crash --- vllm/model_executor/models/utils.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 822113b52a8a..0157010b2c1e 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -553,8 +553,18 @@ def merge_multimodal_embeddings( This updates ``inputs_embeds`` in place. 
""" if isinstance(placeholder_token_id, list): - placeholder_token_id = torch.tensor(placeholder_token_id, - device=input_ids.device) + flat_ids = [] + for x in placeholder_token_id: + if torch.is_tensor(x): + flat_ids.extend(int(v) for v in x.reshape(-1).tolist()) + elif isinstance(x, (list, tuple)): + flat_ids.extend(int(v) for v in x) + else: + flat_ids.append(int(x)) + + placeholder_token_id = torch.as_tensor( + flat_ids, device=input_ids.device, dtype=input_ids.dtype + ) return _merge_multimodal_embeddings( inputs_embeds, torch.isin(input_ids, placeholder_token_id), From 22128e5a1053371f9956fa95c766f5652411cc84 Mon Sep 17 00:00:00 2001 From: Seunghyuk Park Date: Fri, 26 Sep 2025 18:55:59 +0300 Subject: [PATCH 11/20] Add mediapipe changes more --- vllm/model_executor/models/internvl.py | 190 ++++++++++++++++--------- 1 file changed, 122 insertions(+), 68 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index ea723e87340b..2717c07b2de9 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -15,6 +15,7 @@ import numpy.typing as npt import torch import torch.nn as nn +import torch.nn.functional as F import torchvision.transforms as T from PIL import Image from transformers import BatchEncoding, PretrainedConfig, TensorType @@ -31,6 +32,8 @@ from queue import Queue import io import time +import atexit +from dataclasses import dataclass from vllm.config import VllmConfig from vllm.model_executor.layers.quantization import QuantizationConfig @@ -353,9 +356,12 @@ def __next__(self): img_list = shared_q.get() for i in range(len(img_list)): # NOTE: this padding is needed because of HW alignmnet requirment - img_list[i] = np.pad(img_list[i], - (0, 64 - len(img_list[i]) % 64), - 'constant') + rem = len(img_list[i]) % 64 + pad = (64 - rem) % 64 + if pad: + img_list[i] = np.pad(img_list[i], + (0, pad), + 'constant') return img_list def get_media_output_type(self): @@ -395,15 +401,16 @@ def __init__(self, device, queue_depth, batch_size, device="hpu", output_format=it.RGB_I, resize=[img_width, img_height]) self.mean_node = fn.MediaConst( - data=np.array([127.5, 127.5, 127.5], dtype=dt.FLOAT32), - shape=[1, 1, 3], - dtype=dt.FLOAT32 - ) + data=np.array([IMAGENET_MEAN[0]*255.0, + IMAGENET_MEAN[1]*255.0, + IMAGENET_MEAN[2]*255.0], dtype=dt.FLOAT32), + shape=[1, 1, 3], dtype=dt.FLOAT32) + self.std_node = fn.MediaConst( - data=np.array([1/127.5, 1/127.5, 1/127.5], dtype=dt.FLOAT32), - shape=[1, 1, 3], - dtype=dt.FLOAT32 - ) + data=np.array([1.0/(IMAGENET_STD[0]*255.0), + 1.0/(IMAGENET_STD[1]*255.0), + 1.0/(IMAGENET_STD[2]*255.0)], dtype=dt.FLOAT32), + shape=[1, 1, 3], dtype=dt.FLOAT32) self.cmn = fn.CropMirrorNorm(crop_w=img_width, crop_h=img_height, dtype=dt.FLOAT32, device="hpu") @@ -424,6 +431,61 @@ def definegraph(self): # Return the full processed image - we'll do tiling in Python return images + +# ----------------------------------------------------------------------------- +# MediaPipe manager (persist pipes/iterators) +# ----------------------------------------------------------------------------- +@dataclass +class _PipeState: + pipe: hpuMediaPipe | None = None + it: MediaGenericPytorchIterator | None = None + bsz: int | None = None + H: int | None = None + W: int | None = None + + +class MediaPipeTiler: + """Owns and reuses MediaPipe pipes/iterators for main path""" + def __init__(self) -> None: + self._main = _PipeState() + + def _rebuild(self, st: _PipeState, *, bsz: int, H: int, W: int) -> 
None: + if st.pipe is not None: + try: + st.pipe.close() + except Exception: + pass + pipe = hpuMediaPipe("legacy", 0, bsz, 1, "cpu", H, W) + pipe.build() + st.pipe, st.it, st.bsz, st.H, st.W = pipe, iter(MediaPytorchIterator(pipe)), bsz, H, W + + def ensure_main(self, *, bsz: int, H: int, W: int) -> tuple[hpuMediaPipe, MediaGenericPytorchIterator]: + st = self._main + if st.pipe is None or st.bsz != bsz or st.H != H or st.W != W: + self._rebuild(st, bsz=bsz, H=H, W=W) + return st.pipe, st.it # type: ignore[return-value] + + def reset_iter(self) -> None: + st = self._main + if st.pipe is not None: + st.it = iter(MediaPytorchIterator(st.pipe)) + + def close_all(self) -> None: + st = self._main + try: + if st.pipe is not None: + st.pipe.close() + except Exception: + pass + finally: + st.pipe = None + st.it = None + st.bsz = st.H = st.W = None + + +_MP = MediaPipeTiler() +atexit.register(_MP.close_all) + def get_image_info(data): # Get image info using PIL without decoding try: @@ -466,31 +528,9 @@ def preprocess_images( img_sizes.append(img_size) batch_sizes.append(batch_size) - thumbs = None - if use_thumbnail and len(images) > 0: - batch_size = len(images) - pipe = hpuMediaPipe("legacy", queue_depth, batch_size, - num_threads, "cpu", - patch_size, patch_size) - pipe.build() - data_loader = MediaPytorchIterator(pipe) - data_loader = iter(data_loader) - - img_list = np.empty(shape=[batch_size, ], dtype=object) - for i in range(batch_size): - img_list[i] = np.frombuffer(images[i], np.uint8) - - shared_q.put(img_list) - thumbs = next(data_loader)[0] - - shared_q.task_done() - pipe.close() - del pipe - image_num_patches = torch.zeros(len(images), dtype=torch.int64) - patches = [] - thumb_idx = 0 + batch_patches = [] image_num_patches_idx = 0 for batch_size, img_size in zip(batch_sizes, img_sizes): # calculate the number of blocks without thumbnail @@ -502,48 +542,62 @@ def preprocess_images( use_thumbnail=False, ) - num_patches = blocks + 1 if use_thumbnail and thumbs is not None and blocks > 1 else blocks - image_num_patches[image_num_patches_idx:image_num_patches_idx+batch_size] = num_patches - - pipe = hpuMediaPipe("legacy", queue_depth, batch_size, - num_threads, "cpu", - target_height, target_width) - pipe.build() - data_loader = MediaPytorchIterator(pipe) - data_loader = iter(data_loader) + # if batch, H, W is changed, create new one + main_pipe, main_iter = _MP.ensure_main(bsz=batch_size, H=target_height, W=target_width) img_list = np.empty(shape=[batch_size, ], dtype=object) for i in range(batch_size): img_list[i] = np.frombuffer(images[i], np.uint8) shared_q.put(img_list) - processed_images = next(data_loader)[0] - - shared_q.task_done() - pipe.close() - del pipe - - # Extract tiles - tiles = [] - H, W = target_height, target_width - for h_idx in range(H // patch_size): - for w_idx in range(W // patch_size): - h_start = h_idx * patch_size - h_end = h_start + patch_size - w_start = w_idx * patch_size - w_end = w_start + patch_size - - tile = processed_images[:, :, h_start:h_end, w_start:w_end] - tiles.append(tile) + try: + processed_images = next(main_iter)[0] + except StopIteration: + _MP.reset_iter() + _, main_iter = _MP.ensure_main(bsz=batch_size, H=target_height, W=target_width) + processed_images = next(main_iter)[0] + finally: + shared_q.task_done() + + # tiling vectorization: [N,C,H,W] -> [N,Ty,Tx,C,ps,ps] -> [N, T, C, ps, ps] + N, C, H, W = processed_images.shape + Ty, Tx = H // patch_size, W // patch_size + T = Ty * Tx + + use_thumb_now = use_thumbnail and (T > 1) + + x = 
processed_images.view(N, C, Ty, patch_size, Tx, patch_size) \ + .permute(0, 2, 4, 1, 3, 5) \ + .contiguous() \ + .view(N, T, C, patch_size, patch_size) # [N,T,C,ps,ps] + + if use_thumb_now: + # [N,3,ps,ps] + thumbs_batch = F.interpolate(processed_images, size=(patch_size, patch_size), + mode="bilinear", align_corners=False) + + num_patches = T + 1 if use_thumb_now else T + image_num_patches[image_num_patches_idx:image_num_patches_idx+batch_size] = num_patches + image_num_patches_idx += batch_size + + # consist tensor batch based (tile + thumbnail) + if use_thumb_now: + out = torch.empty((N, T + 1, C, patch_size, patch_size), + dtype=processed_images.dtype, device=processed_images.device) + out[:, :T] = x + out[:, T] = thumbs_batch + out = out.view(N * (T + 1), C, patch_size, patch_size) # [N*(T+1),C,ps,ps] + else: + # no thumbnail (T==1 or off) + out = x.view(N * T, C, patch_size, patch_size) # [N*T,C,ps,ps] - for i in range(batch_size): - for t in tiles: - patches.append(t[i]) - if use_thumbnail and thumbs is not None and len(tiles) > 1: - patches.append(thumbs[thumb_idx]) - thumb_idx += 1 + batch_patches.append(out) - patches_flat = torch.stack(patches, dim=0) + patches_flat = ( + torch.cat(batch_patches, dim=0) + if batch_patches + else torch.empty((0, 3, patch_size, patch_size), dtype=torch.float32) + ) return patches_flat, image_num_patches From 602d2d29d04e02ac4c2e1736ee14d8c959481cec Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Mon, 29 Sep 2025 13:07:13 -0700 Subject: [PATCH 12/20] Libint/add samplemetatensorcache3 (#1991) Co-authored-by: Libin Tang --- vllm/model_executor/layers/sampler.py | 4 ++- vllm/model_executor/sampling_metadata.py | 35 ++++++++++++++++-------- vllm/worker/hpu_model_runner.py | 10 +++++-- 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index cc7872f69bc0..5c82e31e7c4e 100755 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -199,6 +199,7 @@ def __init__(self): self.should_modify_greedy_probs_inplace = False # Add HPU cache class variables self._prompt_tokens_hpu_cache: Optional[torch.Tensor] = None + self._output_tokens_hpu_cache: Optional[torch.Tensor] = None self._cached_seq_ids: Optional[set] = None def _init_sampling_tensors( @@ -222,7 +223,7 @@ def _init_sampling_tensors( top_k_scalar, top_p_scalar, current_seq_ids) = \ SamplingTensors.from_sampling_metadata( sampling_metadata, vocab_size, logits.device, logits.dtype, \ - self._prompt_tokens_hpu_cache, self._cached_seq_ids) + self._prompt_tokens_hpu_cache, self._output_tokens_hpu_cache, self._cached_seq_ids) self._sampling_tensors = sampling_tensors self._do_penalties = do_penalties @@ -237,6 +238,7 @@ def _init_sampling_tensors( # After tensors are created, update cache if self._cached_seq_ids != current_seq_ids: self._prompt_tokens_hpu_cache = None + self._output_tokens_hpu_cache = None self._cached_seq_ids = current_seq_ids def forward( diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 08244fe0af28..ce3866c81270 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from typing import Optional -import torch +import torch,time from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams, SamplingType @@ -423,6 +423,7 @@ def from_sampling_metadata( device: torch.device, dtype: torch.dtype, 
prompt_tokens_cache: torch.tensor, + output_tokens_cache: torch.tensor, past_seq_ids: set, ) -> tuple["SamplingTensors", bool, bool, bool, Optional[int], Optional[float], Optional[torch.tensor]]: @@ -516,7 +517,7 @@ def from_sampling_metadata( current_seq_ids.update(seq_ids) if current_seq_ids != past_seq_ids: prompt_tokens_cache = None - + output_tokens_cache = None top_k_scalar = top_ks[0] if do_top_p_top_k and all( k == top_ks[0] for k in top_ks) else None top_p_scalar = top_ps[0] if do_top_p_top_k and all( @@ -536,6 +537,7 @@ def from_sampling_metadata( device, dtype, prompt_tokens_cache, + output_tokens_cache, ) return (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p, top_k_scalar, top_p_scalar, current_seq_ids) @@ -556,6 +558,7 @@ def from_lists( device: torch.device, dtype: torch.dtype, prompt_tokens_cache: torch.tensor, + output_tokens_cache: torch.tensor, ) -> "SamplingTensors": # Note that the performance will be very bad without # pinned memory. @@ -568,6 +571,14 @@ def from_lists( prompt_tokens_cache.device == device): # Reuse cached prompt_tokens already on HPU prompt_t = prompt_tokens_cache + # Get the last element from each list + last_elements = [out[-1] for out in output_tokens] + lengths = [len(out)-1 for out in output_tokens] + indices = torch.tensor(lengths, device=device) + rows = torch.arange(output_tokens_cache.shape[0], device=device) + # Convert to a PyTorch tensor with shape [4, 1] + last_elements_t = torch.tensor(last_elements).unsqueeze(1).to(output_tokens_cache.device) + output_t = output_tokens_cache.index_put_((rows, indices), last_elements_t) else: prompt_t = make_tensor_with_pad_align( prompt_tokens, @@ -577,14 +588,14 @@ def from_lists( pin_memory=pin_memory, max_len_align=1024, ) - output_t = make_tensor_with_pad_align( - output_tokens, - vocab_size, - device="cpu", - dtype=torch.int64, - pin_memory=pin_memory, - max_len_align=1024, - ) + output_t = make_tensor_with_pad_align( + output_tokens, + vocab_size, + device="cpu", + dtype=torch.int64, + pin_memory=pin_memory, + max_len_align=1024, + ) else: prompt_t = make_tensor_with_pad( prompt_tokens, @@ -649,7 +660,7 @@ def from_lists( ) # Because the memory is pinned, we can do non-blocking # transfer to device. 
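The incremental update above avoids re-padding every output list on each decode step: each sequence grows by exactly one token, so only that newest token needs to be written into the cached padded tensor, at column len(out) - 1 of its row. A small self-contained illustration of the same write follows; the tensors and names here are examples, not the vLLM helpers.

    # Illustration: update a cached padded output-token tensor in place by
    # writing only the latest token of each sequence.
    import torch

    output_tokens = [[11, 12, 13], [21, 22]]    # per-sequence token histories
    cached = torch.tensor([[11, 12, 0, 0],      # padded tensor built earlier,
                           [21, 0, 0, 0]])      # one decode step behind

    rows = torch.arange(len(output_tokens))
    cols = torch.tensor([len(seq) - 1 for seq in output_tokens])
    latest = torch.tensor([seq[-1] for seq in output_tokens])

    cached.index_put_((rows, cols), latest)     # single scatter, no re-padding
    assert cached.tolist() == [[11, 12, 13, 0], [21, 22, 0, 0]]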
- + output_t=output_t.to(device=device, non_blocking=True) if output_t.device != device else output_t return cls( temperatures=temperatures_t.to(device=device, non_blocking=True), top_ps=top_ps_t.to(device=device, non_blocking=True), @@ -662,5 +673,5 @@ def from_lists( repetition_penalties=repetition_penalties_t.to(device=device, non_blocking=True), prompt_tokens=prompt_t.to(device=device, non_blocking=True) if prompt_t.device != device else prompt_t, - output_tokens=output_t.to(device=device, non_blocking=True), + output_tokens=output_t ) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index c602d25055ec..4c09ac5b1188 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1888,12 +1888,15 @@ def _prepare_prompt( if image_index_tensor is not None: multi_modal_kwargs['image_index'] = image_index_tensor - use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", "false").lower() in ("1", "true", "yes") + use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", + "false").lower() in ("1", "true", "yes") if use_mediapipe: # With mediapipe path some tensors will already be on HPU, we only move to HPU if needed for key in multi_modal_kwargs.keys(): - if hasattr(multi_modal_kwargs[key], "device") and multi_modal_kwargs[key].device != self.device: - multi_modal_kwargs[key] = self.move_to_device(multi_modal_kwargs[key]) + if hasattr(multi_modal_kwargs[key], "device" + ) and multi_modal_kwargs[key].device != self.device: + multi_modal_kwargs[key] = self.move_to_device( + multi_modal_kwargs[key]) else: multi_modal_kwargs = MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device) @@ -4030,6 +4033,7 @@ def try_revert_dummy_output_tokens(): if sampling_tensors.prompt_tokens.numel() > 0: # Cache the prompt_tokens tensor that's already on HPU self.model.sampler._prompt_tokens_hpu_cache = sampling_tensors.prompt_tokens + self.model.sampler._output_tokens_hpu_cache = sampling_tensors.output_tokens if use_delayed_sampling \ and model_input.async_callback is not None: model_input.async_callback() From 8e88b004d72b06116001f3c7d3e543e726692655 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Tue, 30 Sep 2025 08:49:44 -0700 Subject: [PATCH 13/20] add fix for output_token length check --- vllm/model_executor/sampling_metadata.py | 25 +++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index ce3866c81270..16c9292f6bec 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from typing import Optional -import torch,time +import torch from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams, SamplingType @@ -571,14 +571,6 @@ def from_lists( prompt_tokens_cache.device == device): # Reuse cached prompt_tokens already on HPU prompt_t = prompt_tokens_cache - # Get the last element from each list - last_elements = [out[-1] for out in output_tokens] - lengths = [len(out)-1 for out in output_tokens] - indices = torch.tensor(lengths, device=device) - rows = torch.arange(output_tokens_cache.shape[0], device=device) - # Convert to a PyTorch tensor with shape [4, 1] - last_elements_t = torch.tensor(last_elements).unsqueeze(1).to(output_tokens_cache.device) - output_t = output_tokens_cache.index_put_((rows, indices), last_elements_t) else: prompt_t = make_tensor_with_pad_align( prompt_tokens, @@ -588,6 +580,18 @@ def 
from_lists( pin_memory=pin_memory, max_len_align=1024, ) + if (output_tokens_cache is not None and + output_tokens_cache.device == device and + len(output_tokens) > 0 and len(output_tokens_cache[0]) > 0): + # Get the last element from each list + last_elements = [out[-1] for out in output_tokens] + lengths = [len(out)-1 for out in output_tokens] + indices = torch.tensor(lengths, device=device) + rows = torch.arange(output_tokens_cache.shape[0], device=device) + # Convert to a PyTorch tensor with shape [4, 1] + last_elements_t = torch.tensor(last_elements).unsqueeze(1).to(output_tokens_cache.device) + output_t = output_tokens_cache.index_put_((rows, indices), last_elements_t) + else: output_t = make_tensor_with_pad_align( output_tokens, vocab_size, @@ -660,7 +664,6 @@ def from_lists( ) # Because the memory is pinned, we can do non-blocking # transfer to device. - output_t=output_t.to(device=device, non_blocking=True) if output_t.device != device else output_t return cls( temperatures=temperatures_t.to(device=device, non_blocking=True), top_ps=top_ps_t.to(device=device, non_blocking=True), @@ -673,5 +676,5 @@ def from_lists( repetition_penalties=repetition_penalties_t.to(device=device, non_blocking=True), prompt_tokens=prompt_t.to(device=device, non_blocking=True) if prompt_t.device != device else prompt_t, - output_tokens=output_t + output_tokens=output_t.to(device=device, non_blocking=True) if output_t.device != device else output_t ) From eed4b4b68abd78d8013e9057bb357ca05b3f39c0 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Fri, 3 Oct 2025 10:31:40 -0700 Subject: [PATCH 14/20] Small fixes for internvl (cherry-pick 8751709) --- vllm/model_executor/models/internvl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 2717c07b2de9..bb9c574eb5cc 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -514,7 +514,7 @@ def preprocess_images( batch_sizes = [] for img in images: img_info = get_image_info(img) - if img_info['format'] != 'JPEG' and img_info['mode'] != 'RGB': + if not (img_info['format'] == 'JPEG' and img_info['mode'] == 'RGB'): raise ValueError(f"HPU media pipeline only supports JPEG images in RGB mode. 
Detected format={img_info['format']}, mode={img_info['mode']}") if img_size==None: img_size = img_info['size'] @@ -1605,7 +1605,7 @@ def _parse_and_validate_video_input( self, **kwargs: object) -> Optional[InternVLVideoPixelInputs]: pixel_values_flat_video = kwargs.pop("pixel_values_flat_video", None) video_num_patches = kwargs.pop("video_num_patches", None) - video_embeds = kwargs.pop("image_embeds", None) + video_embeds = kwargs.pop("video_embeds", None) if pixel_values_flat_video is None and video_embeds is None: return None From aab0a37d1c7e3fc696e37cbc5e87a43dde382875 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Fri, 3 Oct 2025 13:08:58 -0700 Subject: [PATCH 15/20] Fix pre-commit --- vllm/entrypoints/chat_utils.py | 6 +- vllm/model_executor/layers/sampler.py | 3 +- vllm/model_executor/models/internvl.py | 217 +++++++++++++---------- vllm/model_executor/models/utils.py | 6 +- vllm/model_executor/sampling_metadata.py | 29 +-- vllm/multimodal/parse.py | 5 +- vllm/multimodal/utils.py | 30 +++- vllm/worker/hpu_model_runner.py | 11 +- 8 files changed, 179 insertions(+), 128 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 9a0525c7933a..7d549bb58558 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -405,10 +405,12 @@ def _resolve_chat_template_content_format( jinja_text = (hf_chat_template if isinstance(hf_chat_template, str) else load_chat_template(chat_template, is_literal=True)) - # The InternVL template has mixed content access patterns that fail with automatic detection. + # The InternVL template has mixed content access patterns + # that fail with automatic detection. # Set string format for proper operation if InternVL is used. model_type = getattr(model_config.hf_config, 'model_type', '') - if model_type == 'internvl_chat' or 'internvl' in model_config.model.lower(): + if model_type == 'internvl_chat' or 'internvl' \ + in model_config.model.lower(): detected_format = "string" else: detected_format = ("string" if jinja_text is None else diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 5c82e31e7c4e..dc60aa2bf5e3 100755 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -223,7 +223,8 @@ def _init_sampling_tensors( top_k_scalar, top_p_scalar, current_seq_ids) = \ SamplingTensors.from_sampling_metadata( sampling_metadata, vocab_size, logits.device, logits.dtype, \ - self._prompt_tokens_hpu_cache, self._output_tokens_hpu_cache, self._cached_seq_ids) + self._prompt_tokens_hpu_cache, self._output_tokens_hpu_cache, \ + self._cached_seq_ids) self._sampling_tensors = sampling_tensors self._do_penalties = do_penalties diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index bb9c574eb5cc..fdb88e5bc83c 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -1,6 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import atexit +import contextlib +import io # adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py # -------------------------------------------------------- # InternVL @@ -8,32 +11,33 @@ # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- import os +import time from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence +from 
dataclasses import dataclass +from queue import Queue from typing import Any, Literal, Optional, TypedDict, TypeVar, Union +import numpy as np import numpy.typing as npt import torch import torch.nn as nn import torch.nn.functional as F import torchvision.transforms as T -from PIL import Image -from transformers import BatchEncoding, PretrainedConfig, TensorType - from habana_frameworks.mediapipe import fn -from habana_frameworks.mediapipe.mediapipe import MediaPipe +# Handle MediaPipe pipe_manager destructor +from habana_frameworks.mediapipe.backend.cal import (cpp_pipe_manager_list, + pipe_manager) from habana_frameworks.mediapipe.media_types import dtype as dt from habana_frameworks.mediapipe.media_types import imgtype as it from habana_frameworks.mediapipe.media_types import readerOutType as ro -from habana_frameworks.mediapipe.operators.reader_nodes.reader_nodes import media_ext_reader_op_impl -from habana_frameworks.mediapipe.operators.reader_nodes.reader_nodes import media_ext_reader_op_tensor_info -from habana_frameworks.mediapipe.plugins.iterator_pytorch import MediaGenericPytorchIterator -import numpy as np -from queue import Queue -import io -import time -import atexit -from dataclasses import dataclass +from habana_frameworks.mediapipe.mediapipe import MediaPipe +from habana_frameworks.mediapipe.operators.reader_nodes.reader_nodes import ( + media_ext_reader_op_impl, media_ext_reader_op_tensor_info) +from habana_frameworks.mediapipe.plugins.iterator_pytorch import ( + MediaGenericPytorchIterator) +from PIL import Image +from transformers import BatchEncoding, PretrainedConfig, TensorType from vllm.config import VllmConfig from vllm.model_executor.layers.quantization import QuantizationConfig @@ -311,14 +315,14 @@ def video_to_pixel_values_internvl( pixel_values = torch.stack([transform(image) for image in frames_list]) return pixel_values -# Handle MediaPipe pipe_manager destructor -from habana_frameworks.mediapipe.backend.cal import pipe_manager, cpp_pipe_manager_list def _patched_close(self): - """Patched close method that handles None cpp_pipe_manager_list during shutdown""" + """Patched close method that handles + None cpp_pipe_manager_list during shutdown""" try: # Check if cpp_pipe_manager_list exists and is not None - if cpp_pipe_manager_list is not None and self._pm_ in cpp_pipe_manager_list: + if cpp_pipe_manager_list is not None and \ + self._pm_ in cpp_pipe_manager_list: cpp_pipe_manager_list.remove(self._pm_) except (TypeError, AttributeError): # Handle case where cpp_pipe_manager_list is None or not iterable @@ -329,6 +333,7 @@ def _patched_close(self): self._pm_.close() self._pm_ = None + pipe_manager.close = _patched_close # Queue shared between external reader and mediapipe call @@ -336,11 +341,13 @@ def _patched_close(self): class MediaPytorchIterator(MediaGenericPytorchIterator): + def __init__(self, mediapipe): super().__init__(mediapipe=mediapipe, device="hpu", fw_type="PYT_FW") class external_reader(media_ext_reader_op_impl): + def __init__(self, params, fw_params): self.batch_size = fw_params.batch_size self.max_file = "" @@ -355,13 +362,11 @@ def __len__(self): def __next__(self): img_list = shared_q.get() for i in range(len(img_list)): - # NOTE: this padding is needed because of HW alignmnet requirment + # NOTE: this padding is needed because of HW alignment requirement rem = len(img_list[i]) % 64 pad = (64 - rem) % 64 if pad: - img_list[i] = np.pad(img_list[i], - (0, pad), - 'constant') + img_list[i] = np.pad(img_list[i], (0, pad), 'constant') return 
img_list def get_media_output_type(self): @@ -379,17 +384,11 @@ def gen_output_info(self): class hpuMediaPipe(MediaPipe): - def __init__(self, device, queue_depth, batch_size, - num_threads, op_device, + + def __init__(self, device, queue_depth, batch_size, num_threads, op_device, img_height, img_width): - super( - hpuMediaPipe, - self).__init__( - device, - queue_depth, - batch_size, - num_threads, - self.__class__.__name__) + super().__init__(device, queue_depth, batch_size, num_threads, + self.__class__.__name__) mediapipe_seed = int(time.time_ns() % (2**31 - 1)) @@ -397,27 +396,35 @@ def __init__(self, device, queue_depth, batch_size, num_outputs=1, seed=mediapipe_seed, device=op_device) - self.decode = fn.ImageDecoder( - device="hpu", output_format=it.RGB_I, resize=[img_width, img_height]) - - self.mean_node = fn.MediaConst( - data=np.array([IMAGENET_MEAN[0]*255.0, - IMAGENET_MEAN[1]*255.0, - IMAGENET_MEAN[2]*255.0], dtype=dt.FLOAT32), - shape=[1, 1, 3], dtype=dt.FLOAT32) - - self.std_node = fn.MediaConst( - data=np.array([1.0/(IMAGENET_STD[0]*255.0), - 1.0/(IMAGENET_STD[1]*255.0), - 1.0/(IMAGENET_STD[2]*255.0)], dtype=dt.FLOAT32), - shape=[1, 1, 3], dtype=dt.FLOAT32) - - self.cmn = fn.CropMirrorNorm(crop_w=img_width, crop_h=img_height, dtype=dt.FLOAT32, device="hpu") + self.decode = fn.ImageDecoder(device="hpu", + output_format=it.RGB_I, + resize=[img_width, img_height]) + + self.mean_node = fn.MediaConst(data=np.array([ + IMAGENET_MEAN[0] * 255.0, IMAGENET_MEAN[1] * 255.0, + IMAGENET_MEAN[2] * 255.0 + ], + dtype=dt.FLOAT32), + shape=[1, 1, 3], + dtype=dt.FLOAT32) + + self.std_node = fn.MediaConst(data=np.array([ + 1.0 / (IMAGENET_STD[0] * 255.0), 1.0 / + (IMAGENET_STD[1] * 255.0), 1.0 / (IMAGENET_STD[2] * 255.0) + ], + dtype=dt.FLOAT32), + shape=[1, 1, 3], + dtype=dt.FLOAT32) + + self.cmn = fn.CropMirrorNorm(crop_w=img_width, + crop_h=img_height, + dtype=dt.FLOAT32, + device="hpu") self.transpose = fn.Transpose( device="hpu", tensorDim=4, - permutation=[1, 2, 0, 3] #NCHW + permutation=[1, 2, 0, 3] #NCHW ) def definegraph(self): @@ -437,29 +444,31 @@ def definegraph(self): # ----------------------------------------------------------------------------- @dataclass class _PipeState: - pipe: hpuMediaPipe | None = None - it: MediaGenericPytorchIterator | None = None - bsz: int | None = None - H: int | None = None - W: int | None = None + pipe: hpuMediaPipe | None = None + it: MediaGenericPytorchIterator | None = None + bsz: int | None = None + H: int | None = None + W: int | None = None class MediaPipeTiler: """Owns and reuses MediaPipe pipes/iterators for main path""" + def __init__(self) -> None: - self._main = _PipeState() + self._main = _PipeState() def _rebuild(self, st: _PipeState, *, bsz: int, H: int, W: int) -> None: if st.pipe is not None: - try: + with contextlib.suppress(BaseException): st.pipe.close() - except Exception: - pass pipe = hpuMediaPipe("legacy", 0, bsz, 1, "cpu", H, W) pipe.build() - st.pipe, st.it, st.bsz, st.H, st.W = pipe, iter(MediaPytorchIterator(pipe)), bsz, H, W + st.pipe, st.it, st.bsz, st.H, st.W = pipe, iter( + MediaPytorchIterator(pipe)), bsz, H, W - def ensure_main(self, *, bsz: int, H: int, W: int) -> tuple[hpuMediaPipe, MediaGenericPytorchIterator]: + def ensure_main( + self, *, bsz: int, H: int, + W: int) -> tuple[hpuMediaPipe, MediaGenericPytorchIterator]: st = self._main if st.pipe is None or st.bsz != bsz or st.H != H or st.W != W: self._rebuild(st, bsz=bsz, H=H, W=W) @@ -486,17 +495,17 @@ def close_all(self) -> None: _MP = MediaPipeTiler() 
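_MP above is a module-level singleton, and the atexit registration that follows closes any live pipe at interpreter shutdown. The reuse rule itself, rebuilding the pipe only when the (batch size, height, width) key changes, is shown stripped of the MediaPipe specifics in the sketch below; DummyPipe is a stand-in and not the real hpuMediaPipe.

    # Reuse pattern behind MediaPipeTiler, with a dummy resource standing in
    # for an expensive-to-build media pipe.
    from typing import Optional, Tuple


    class DummyPipe:

        def __init__(self, key: Tuple[int, int, int]) -> None:
            self.key = key          # imagine an expensive build() here

        def close(self) -> None:
            pass


    class ReusablePipe:

        def __init__(self) -> None:
            self._pipe: Optional[DummyPipe] = None
            self._key: Optional[Tuple[int, int, int]] = None

        def ensure(self, bsz: int, h: int, w: int) -> DummyPipe:
            key = (bsz, h, w)
            if self._pipe is None or self._key != key:
                if self._pipe is not None:
                    self._pipe.close()      # tear down the stale pipe first
                self._pipe, self._key = DummyPipe(key), key
            return self._pipe


    pipes = ReusablePipe()
    a = pipes.ensure(8, 448, 448)
    b = pipes.ensure(8, 448, 448)    # same shape: reused, no rebuild
    c = pipes.ensure(4, 448, 448)    # batch changed: rebuilt
    assert a is b and c is not b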
atexit.register(_MP.close_all) + def get_image_info(data): # Get image info using PIL without decoding try: with Image.open(io.BytesIO(data)) as img: - return { - 'format': img.format, - 'size': img.size, - 'mode': img.mode - } + return {'format': img.format, 'size': img.size, 'mode': img.mode} except Exception as e: - raise ValueError(f"Input image bitstream is not in supported format: {str(e)}") + raise ValueError( + f"Input image bitstream is not in supported format: {str(e)}" + ) from None + def preprocess_images( images, @@ -505,8 +514,6 @@ def preprocess_images( use_thumbnail=False, ): batch_size = 0 - queue_depth = 0 - num_threads = 1 # validate images and create batches img_size = None @@ -515,8 +522,11 @@ def preprocess_images( for img in images: img_info = get_image_info(img) if not (img_info['format'] == 'JPEG' and img_info['mode'] == 'RGB'): - raise ValueError(f"HPU media pipeline only supports JPEG images in RGB mode. Detected format={img_info['format']}, mode={img_info['mode']}") - if img_size==None: + raise ValueError( + f"HPU media pipeline only supports JPEG images in RGB mode. \ + Detected format={img_info['format']}, \ + mode={img_info['mode']}") + if img_size is None: img_size = img_info['size'] else: if img_info['size'] != img_size: @@ -543,9 +553,13 @@ def preprocess_images( ) # if batch, H, W is changed, create new one - main_pipe, main_iter = _MP.ensure_main(bsz=batch_size, H=target_height, W=target_width) + main_pipe, main_iter = _MP.ensure_main(bsz=batch_size, + H=target_height, + W=target_width) - img_list = np.empty(shape=[batch_size, ], dtype=object) + img_list = np.empty(shape=[ + batch_size, + ], dtype=object) for i in range(batch_size): img_list[i] = np.frombuffer(images[i], np.uint8) @@ -554,50 +568,58 @@ def preprocess_images( processed_images = next(main_iter)[0] except StopIteration: _MP.reset_iter() - _, main_iter = _MP.ensure_main(bsz=batch_size, H=target_height, W=target_width) + _, main_iter = _MP.ensure_main(bsz=batch_size, + H=target_height, + W=target_width) processed_images = next(main_iter)[0] finally: shared_q.task_done() - # tiling vectorization: [N,C,H,W] -> [N,Ty,Tx,C,ps,ps] -> [N, T, C, ps, ps] + # tiling vectorization: + # [N,C,H,W] -> [N,Ty,Tx,C,ps,ps] -> [N, T, C, ps, ps] N, C, H, W = processed_images.shape Ty, Tx = H // patch_size, W // patch_size T = Ty * Tx use_thumb_now = use_thumbnail and (T > 1) - x = processed_images.view(N, C, Ty, patch_size, Tx, patch_size) \ + x = processed_images.view(N, C, Ty, \ + patch_size, Tx, patch_size) \ .permute(0, 2, 4, 1, 3, 5) \ .contiguous() \ - .view(N, T, C, patch_size, patch_size) # [N,T,C,ps,ps] + .view(N, T, C, \ + patch_size, patch_size) # [N,T,C,ps,ps] if use_thumb_now: # [N,3,ps,ps] - thumbs_batch = F.interpolate(processed_images, size=(patch_size, patch_size), - mode="bilinear", align_corners=False) + thumbs_batch = F.interpolate(processed_images, + size=(patch_size, patch_size), + mode="bilinear", + align_corners=False) num_patches = T + 1 if use_thumb_now else T - image_num_patches[image_num_patches_idx:image_num_patches_idx+batch_size] = num_patches + image_num_patches[image_num_patches_idx:image_num_patches_idx + + batch_size] = num_patches image_num_patches_idx += batch_size # consist tensor batch based (tile + thumbnail) if use_thumb_now: out = torch.empty((N, T + 1, C, patch_size, patch_size), - dtype=processed_images.dtype, device=processed_images.device) + dtype=processed_images.dtype, + device=processed_images.device) out[:, :T] = x - out[:, T] = thumbs_batch - out = out.view(N * 
(T + 1), C, patch_size, patch_size) # [N*(T+1),C,ps,ps] + out[:, T] = thumbs_batch + out = out.view(N * (T + 1), C, patch_size, + patch_size) # [N*(T+1),C,ps,ps] else: # no thumbnail (T==1 or off) - out = x.view(N * T, C, patch_size, patch_size) # [N*T,C,ps,ps] + out = x.view(N * T, C, patch_size, patch_size) # [N*T,C,ps,ps] batch_patches.append(out) - patches_flat = ( - torch.cat(batch_patches, dim=0) - if batch_patches - else torch.empty((0, 3, patch_size, patch_size), dtype=torch.float32) - ) + patches_flat = (torch.cat(batch_patches, dim=0) + if batch_patches else torch.empty( + (0, 3, patch_size, patch_size), dtype=torch.float32)) return patches_flat, image_num_patches @@ -756,7 +778,8 @@ def _preprocess_image( if len(images) == 0: image_inputs = {} else: - use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", "false").lower() in ("1", "true", "yes") + use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", + "false").lower() in ("1", "true", "yes") if use_mediapipe: # Use HPU media pipeline for image preprocessing min_num, max_num = self.resolve_min_max_num( @@ -785,7 +808,9 @@ def _preprocess_image( feature_size = num_patches * self.num_image_token image_repl = self.get_image_repl(feature_size, num_patches) - text = [t.replace('', image_repl.full, 1) for t in text] + text = [ + t.replace('', image_repl.full, 1) for t in text + ] else: pixel_values_lst = self._images_to_pixel_values_lst( @@ -806,7 +831,9 @@ def _preprocess_image( feature_size = num_patches * self.num_image_token image_repl = self.get_image_repl(feature_size, num_patches) - text = [t.replace('', image_repl.full, 1) for t in text] + text = [ + t.replace('', image_repl.full, 1) for t in text + ] return text, image_inputs @@ -821,7 +848,8 @@ def _make_batch_input(self, def __call__( self, text: Optional[Union[str, list[str]]] = None, - images: Optional[Union[Image.Image, list[Image.Image], bytes, list[bytes]]] = None, + images: Optional[Union[Image.Image, list[Image.Image], bytes, + list[bytes]]] = None, min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, @@ -940,7 +968,8 @@ def _preprocess_video( def __call__( self, text: Optional[Union[str, list[str]]] = None, - images: Optional[Union[Image.Image, list[Image.Image], bytes, list[bytes]]] = None, + images: Optional[Union[Image.Image, list[Image.Image], bytes, + list[bytes]]] = None, videos: Optional[Union[npt.NDArray, list[npt.NDArray]]] = None, min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, @@ -1784,8 +1813,8 @@ def forward( elif inputs_embeds is None: if not is_hpu: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, vision_embeddings) input_ids = None forward_kwargs = { diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 0157010b2c1e..0229f3bd6169 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -562,9 +562,9 @@ def merge_multimodal_embeddings( else: flat_ids.append(int(x)) - placeholder_token_id = torch.as_tensor( - flat_ids, device=input_ids.device, dtype=input_ids.dtype - ) + placeholder_token_id = torch.as_tensor(flat_ids, + device=input_ids.device, + dtype=input_ids.dtype) return _merge_multimodal_embeddings( inputs_embeds, torch.isin(input_ids, placeholder_token_id), diff --git a/vllm/model_executor/sampling_metadata.py 
b/vllm/model_executor/sampling_metadata.py index 16c9292f6bec..efef82b261e4 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -567,8 +567,8 @@ def from_lists( if do_penalties: if is_hpu: - if (prompt_tokens_cache is not None and - prompt_tokens_cache.device == device): + if (prompt_tokens_cache is not None + and prompt_tokens_cache.device == device): # Reuse cached prompt_tokens already on HPU prompt_t = prompt_tokens_cache else: @@ -580,17 +580,21 @@ def from_lists( pin_memory=pin_memory, max_len_align=1024, ) - if (output_tokens_cache is not None and - output_tokens_cache.device == device and - len(output_tokens) > 0 and len(output_tokens_cache[0]) > 0): + if (output_tokens_cache is not None + and output_tokens_cache.device == device + and len(output_tokens) > 0 + and len(output_tokens_cache[0]) > 0): # Get the last element from each list last_elements = [out[-1] for out in output_tokens] - lengths = [len(out)-1 for out in output_tokens] + lengths = [len(out) - 1 for out in output_tokens] indices = torch.tensor(lengths, device=device) - rows = torch.arange(output_tokens_cache.shape[0], device=device) + rows = torch.arange(output_tokens_cache.shape[0], + device=device) # Convert to a PyTorch tensor with shape [4, 1] - last_elements_t = torch.tensor(last_elements).unsqueeze(1).to(output_tokens_cache.device) - output_t = output_tokens_cache.index_put_((rows, indices), last_elements_t) + last_elements_t = torch.tensor(last_elements).unsqueeze( + 1).to(output_tokens_cache.device) + output_t = output_tokens_cache.index_put_((rows, indices), + last_elements_t) else: output_t = make_tensor_with_pad_align( output_tokens, @@ -675,6 +679,7 @@ def from_lists( non_blocking=True), repetition_penalties=repetition_penalties_t.to(device=device, non_blocking=True), - prompt_tokens=prompt_t.to(device=device, non_blocking=True) if prompt_t.device != device else prompt_t, - output_tokens=output_t.to(device=device, non_blocking=True) if output_t.device != device else output_t - ) + prompt_tokens=prompt_t.to(device=device, non_blocking=True) + if prompt_t.device != device else prompt_t, + output_tokens=output_t.to(device=device, non_blocking=True) + if output_t.device != device else output_t) diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 693a9a230fb3..53bee48d180c 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import io from abc import ABC, abstractmethod from collections import UserDict from collections.abc import Callable, Iterator, Mapping, Sequence from typing import (TYPE_CHECKING, Any, Generic, Literal, NamedTuple, Optional, TypeVar, Union) -import io import numpy as np import torch from typing_extensions import TypeAlias, TypeGuard, assert_never @@ -410,8 +410,7 @@ def _parse_image_data( if self._is_embeddings(data): return ImageEmbeddingItems(data) - if (isinstance(data, PILImage.Image) - or isinstance(data, (bytes, bytearray)) + if (isinstance(data, (PILImage.Image, bytes, bytearray)) or isinstance(data, (np.ndarray, torch.Tensor)) and data.ndim == 3): data_items = [data] diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 2a5d4a0c2098..2df6d92ee750 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os from 
itertools import groupby from pathlib import Path from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union from urllib.parse import ParseResult, urlparse -import os import numpy as np import numpy.typing as npt import torch @@ -114,7 +114,9 @@ def load_from_url( data = connection.get_bytes(url, timeout=fetch_timeout) if load_type == "bytes": - msg = "Only data URLs are currently supported for raw bytes loading." + msg = "Only data URLs are currently supported \ + for raw bytes loading." + raise NotImplementedError(msg) else: connection = self.connection @@ -127,7 +129,9 @@ def load_from_url( if url_spec.scheme == "file": if load_type == "bytes": - msg = "Only data URLs are currently supported for raw bytes loading." + msg = "Only data URLs are currently supported \ + for raw bytes loading." + raise NotImplementedError(msg) else: return self._load_file_url(url_spec, media_io) @@ -147,11 +151,14 @@ async def load_from_url_async( if url_spec.scheme.startswith("http"): if load_type == "bytes": - msg = "Only data URLs are currently supported for raw bytes loading." + msg = "Only data URLs are currently supported \ + for raw bytes loading." + raise NotImplementedError(msg) else: connection = self.connection - data = await connection.async_get_bytes(url, timeout=fetch_timeout) + data = await connection.async_get_bytes(url, + timeout=fetch_timeout) return media_io.load_bytes(data) @@ -160,7 +167,9 @@ async def load_from_url_async( if url_spec.scheme == "file": if load_type == "bytes": - msg = "Only data URLs are currently supported for raw bytes loading." + msg = "Only data URLs are currently supported \ + for raw bytes loading." + raise NotImplementedError(msg) else: return self._load_file_url(url_spec, media_io) @@ -210,7 +219,8 @@ def fetch_image( By default, the image is converted into RGB format. """ image_io = ImageMediaIO(image_mode=image_mode) - use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", "false").lower() in ("1", "true", "yes") + use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", + "false").lower() in ("1", "true", "yes") load_type = "bytes" if use_mediapipe else "PIL" try: @@ -231,12 +241,14 @@ async def fetch_image_async( image_mode: str = "RGB", ) -> Image.Image: """ - Asynchronously load a PIL image (or raw bytes) from a HTTP or base64 data URL. + Asynchronously load a PIL image (or raw bytes) + from a HTTP or base64 data URL. By default, the image is converted into RGB format. 
""" image_io = ImageMediaIO(image_mode=image_mode) - use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", "false").lower() in ("1", "true", "yes") + use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", + "false").lower() in ("1", "true", "yes") load_type = "bytes" if use_mediapipe else "PIL" try: diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 4c09ac5b1188..2aecded1b3b8 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1891,8 +1891,9 @@ def _prepare_prompt( use_mediapipe = os.getenv("VLLM_USE_MEDIA_PIPELINE", "false").lower() in ("1", "true", "yes") if use_mediapipe: - # With mediapipe path some tensors will already be on HPU, we only move to HPU if needed - for key in multi_modal_kwargs.keys(): + # With mediapipe path some tensors will already be on HPU, + # we only move to HPU if needed + for key in multi_modal_kwargs: if hasattr(multi_modal_kwargs[key], "device" ) and multi_modal_kwargs[key].device != self.device: multi_modal_kwargs[key] = self.move_to_device( @@ -4032,8 +4033,10 @@ def try_revert_dummy_output_tokens(): sampling_tensors = self.model.sampler._sampling_tensors if sampling_tensors.prompt_tokens.numel() > 0: # Cache the prompt_tokens tensor that's already on HPU - self.model.sampler._prompt_tokens_hpu_cache = sampling_tensors.prompt_tokens - self.model.sampler._output_tokens_hpu_cache = sampling_tensors.output_tokens + self.model.sampler._prompt_tokens_hpu_cache = \ + sampling_tensors.prompt_tokens + self.model.sampler._output_tokens_hpu_cache = \ + sampling_tensors.output_tokens if use_delayed_sampling \ and model_input.async_callback is not None: model_input.async_callback() From bced3c57b0265b5eaa496d327f28a7ea2b8e9d51 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Fri, 3 Oct 2025 14:34:53 -0700 Subject: [PATCH 16/20] Fix pre-commit --- vllm/entrypoints/chat_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 7d549bb58558..c1e12a7fc4f2 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -411,10 +411,10 @@ def _resolve_chat_template_content_format( model_type = getattr(model_config.hf_config, 'model_type', '') if model_type == 'internvl_chat' or 'internvl' \ in model_config.model.lower(): - detected_format = "string" + detected_format:_ChatTemplateContentFormat = "string" else: detected_format = ("string" if jinja_text is None else - _detect_content_format(jinja_text, default="string")) + _detect_content_format(jinja_text, default="string")) return detected_format From 55f0d819cca4a3df7211f3d7f70c5d41d502d580 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Mon, 6 Oct 2025 10:00:32 -0700 Subject: [PATCH 17/20] Fix pre-commit --- vllm/entrypoints/chat_utils.py | 12 +- vllm/model_executor/layers/sampler.py | 4 +- vllm/multimodal/base.py | 4 + vllm/multimodal/utils.py | 180 +++++++++++++++++++------- vllm/worker/hpu_model_runner.py | 10 +- 5 files changed, 153 insertions(+), 57 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index c1e12a7fc4f2..9d99dc97325e 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -411,12 +411,12 @@ def _resolve_chat_template_content_format( model_type = getattr(model_config.hf_config, 'model_type', '') if model_type == 'internvl_chat' or 'internvl' \ in model_config.model.lower(): - detected_format:_ChatTemplateContentFormat = "string" + detected_format = "string" else: 
detected_format = ("string" if jinja_text is None else - _detect_content_format(jinja_text, default="string")) + _detect_content_format(jinja_text, default="string")) - return detected_format + return cast(_ChatTemplateContentFormat, detected_format) @lru_cache @@ -734,7 +734,7 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: ) def parse_image(self, image_url: str) -> None: - image = self._connector.fetch_image(image_url) + image = self._connector.fetch_image(image_url, load_type="PIL") placeholder = self._tracker.add("image", image) self._add_placeholder(placeholder) @@ -785,7 +785,9 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: ) def parse_image(self, image_url: str) -> None: - image_coro = self._connector.fetch_image_async(image_url) + image_coro = self._connector.fetch_image_async( + image_url, load_type="PIL" + ) placeholder = self._tracker.add("image", image_coro) self._add_placeholder(placeholder) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index dc60aa2bf5e3..a40cac6a86b6 100755 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -218,13 +218,15 @@ def _init_sampling_tensors( # have pinned memory. self._sampling_tensors = None + csi = self._cached_seq_ids if self._cached_seq_ids is not None else set( + ) # Initialize new sampling tensors (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p, top_k_scalar, top_p_scalar, current_seq_ids) = \ SamplingTensors.from_sampling_metadata( sampling_metadata, vocab_size, logits.device, logits.dtype, \ self._prompt_tokens_hpu_cache, self._output_tokens_hpu_cache, \ - self._cached_seq_ids) + csi) self._sampling_tensors = sampling_tensors self._do_penalties = do_penalties diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 7188ed14c573..eef9726d3228 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -214,6 +214,10 @@ def load_base64(self, media_type: str, data: str) -> _T: """ raise NotImplementedError + def load_base64_bytes(self, media_type: str, data: str) -> bytes: + """ Optional """ + raise NotImplementedError + @abstractmethod def load_file(self, filepath: Path) -> _T: raise NotImplementedError diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 2df6d92ee750..a9a1e98f1413 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -4,7 +4,8 @@ import os from itertools import groupby from pathlib import Path -from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union +from typing import (TYPE_CHECKING, Any, Literal, Optional, TypeVar, Union, + overload) from urllib.parse import ParseResult, urlparse import numpy as np @@ -63,18 +64,36 @@ def __init__( self.allowed_local_media_path = allowed_local_media_path_ + @overload def _load_data_url( self, url_spec: ParseResult, media_io: MediaIO[_M], - load_type: str = "PIL", + load_type: Literal["PIL"] = "PIL", ) -> _M: + ... + + @overload + def _load_data_url( + self, + url_spec: ParseResult, + media_io: MediaIO[_M], + load_type: Literal["bytes"], + ) -> bytes: + ... + + def _load_data_url( + self, + url_spec: ParseResult, + media_io: MediaIO[_M], + load_type: Literal["PIL", "bytes"] = "PIL", + ) -> Union[_M, bytes]: data_spec, data = url_spec.path.split(",", 1) media_type, data_type = data_spec.split(";", 1) if data_type != "base64": - msg = "Only base64 data URLs are supported for now." 
- raise NotImplementedError(msg) + raise NotImplementedError( + "Only base64 data URLs are supported for now.") if load_type == "bytes": return media_io.load_base64_bytes(media_type, data) @@ -99,45 +118,79 @@ def _load_file_url( return media_io.load_file(filepath) + @overload def load_from_url( self, url: str, media_io: MediaIO[_M], *, - fetch_timeout: Optional[int] = None, - load_type: str = "bytes", + fetch_timeout: Optional[int] = ..., + load_type: Literal["PIL"] = "PIL", ) -> _M: + ... + + @overload + def load_from_url( + self, + url: str, + media_io: MediaIO[_M], + *, + fetch_timeout: Optional[int] = ..., + load_type: Literal["bytes"], + ) -> bytes: + ... + + def load_from_url( + self, + url: str, + media_io: MediaIO[_M], + *, + fetch_timeout: Optional[int] = None, + load_type: Literal["PIL", "bytes"] = "bytes", + ) -> Union[_M, bytes]: url_spec = urlparse(url) if url_spec.scheme.startswith("http"): - connection = self.connection - data = connection.get_bytes(url, timeout=fetch_timeout) - if load_type == "bytes": - msg = "Only data URLs are currently supported \ - for raw bytes loading." - - raise NotImplementedError(msg) - else: - connection = self.connection - data = connection.get_bytes(url, timeout=fetch_timeout) - - return media_io.load_bytes(data) + raise NotImplementedError( + "Only data URLs are currently supported for bytes loading." + ) + data = self.connection.get_bytes(url, timeout=fetch_timeout) + return media_io.load_bytes(data) if url_spec.scheme == "data": return self._load_data_url(url_spec, media_io, load_type) if url_spec.scheme == "file": if load_type == "bytes": - msg = "Only data URLs are currently supported \ - for raw bytes loading." + raise NotImplementedError( + "Only data URLs are currently supported for bytes loading." + ) + return self._load_file_url(url_spec, media_io) - raise NotImplementedError(msg) - else: - return self._load_file_url(url_spec, media_io) + raise ValueError("The URL must be either a HTTP, data or file URL.") - msg = "The URL must be either a HTTP, data or file URL." - raise ValueError(msg) + @overload + async def load_from_url_async( + self, + url: str, + media_io: MediaIO[_M], + *, + fetch_timeout: Optional[int] = ..., + load_type: Literal["PIL"] = "PIL", + ) -> _M: + ... + + @overload + async def load_from_url_async( + self, + url: str, + media_io: MediaIO[_M], + *, + fetch_timeout: Optional[int] = ..., + load_type: Literal["bytes"], + ) -> bytes: + ... async def load_from_url_async( self, @@ -145,37 +198,30 @@ async def load_from_url_async( media_io: MediaIO[_M], *, fetch_timeout: Optional[int] = None, - load_type: str = "bytes", - ) -> _M: + load_type: Literal["PIL", "bytes"] = "bytes", + ) -> Union[_M, bytes]: url_spec = urlparse(url) if url_spec.scheme.startswith("http"): if load_type == "bytes": - msg = "Only data URLs are currently supported \ - for raw bytes loading." - - raise NotImplementedError(msg) - else: - connection = self.connection - data = await connection.async_get_bytes(url, - timeout=fetch_timeout) - - return media_io.load_bytes(data) + raise NotImplementedError( + "Only data URLs are currently supported for bytes loading." + ) + data = await self.connection.async_get_bytes(url, + timeout=fetch_timeout) + return media_io.load_bytes(data) if url_spec.scheme == "data": return self._load_data_url(url_spec, media_io, load_type) if url_spec.scheme == "file": if load_type == "bytes": - msg = "Only data URLs are currently supported \ - for raw bytes loading." 
- - raise NotImplementedError(msg) - else: - return self._load_file_url(url_spec, media_io) + raise NotImplementedError( + "Only data URLs are currently supported for bytes loading." + ) + return self._load_file_url(url_spec, media_io) - msg = "The URL must be either a HTTP, data or file URL." - raise ValueError(msg) + raise ValueError("The URL must be either a HTTP, data or file URL.") def fetch_audio( self, @@ -207,14 +253,35 @@ async def fetch_audio_async( fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, ) + @overload def fetch_image( self, image_url: str, *, image_mode: str = "RGB", + load_type: Literal["PIL"], ) -> Image.Image: + ... + + @overload + def fetch_image( + self, + image_url: str, + *, + image_mode: str = "RGB", + load_type: Literal["bytes"], + ) -> bytes: + ... + + def fetch_image( + self, + image_url: str, + *, + image_mode: str = "RGB", + load_type: Optional[Literal["PIL", "bytes"]] = None, + ) -> Union[Image.Image, bytes]: """ - Load a PIL image (or raw bytes) from a HTTP or base64 data URL. + Load a PIL image or raw bytes from a HTTP/base64 data URL. By default, the image is converted into RGB format. """ @@ -234,12 +301,33 @@ def fetch_image( # convert to ValueError to be properly caught upstream raise ValueError(str(e)) from e + @overload async def fetch_image_async( self, image_url: str, *, image_mode: str = "RGB", + load_type: Literal["PIL"], ) -> Image.Image: + ... + + @overload + async def fetch_image_async( + self, + image_url: str, + *, + image_mode: str = "RGB", + load_type: Literal["bytes"], + ) -> bytes: + ... + + async def fetch_image_async( + self, + image_url: str, + *, + image_mode: str = "RGB", + load_type: Optional[Literal["PIL", "bytes"]] = None, + ) -> Union[Image.Image, bytes]: """ Asynchronously load a PIL image (or raw bytes) from a HTTP or base64 data URL. diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 2aecded1b3b8..af1f83c10c6b 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -3899,11 +3899,11 @@ def try_revert_dummy_output_tokens(): ('pixel_values')in model_input.multi_modal_kwargs)) execute_model_kwargs['attn_metadata'] = attn_metadata - if 'image_index' in model_input.multi_modal_kwargs: - execute_model_kwargs[ - 'image_index'] = model_input.multi_modal_kwargs[ - 'image_index'] - model_input.multi_modal_kwargs.pop('image_index', None) + mmks = model_input.multi_modal_kwargs + if mmks is not None and 'image_index' in mmks: + execute_model_kwargs['image_index'] = mmks[ + 'image_index'] + mmks.pop('image_index', None) if not bypass_model_exec: if self.model_is_mrope or (self.is_mm_optimized From 08c5f9ecc6b175b0d75bbcd2633f874965380da8 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Mon, 6 Oct 2025 13:41:40 -0700 Subject: [PATCH 18/20] GEMMA3:move decode embedding from hpu_model_runner to gemma3 (https://github.com/HabanaAI/vllm-fork/pull/1848) --- vllm/model_executor/models/gemma3_mm.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 24fb4147e680..b267425e8556 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -641,10 +641,7 @@ def forward(self, # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. 
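One note on the fetch helpers reworked above: the typing.overload declarations paired with Literal load_type values give callers a precise return type (an image object for "PIL", raw bytes for "bytes") while keeping a single runtime implementation. A minimal standalone sketch of that pattern is below; MediaLoader is made up for illustration and str stands in for the PIL image type.

    # Minimal sketch of the overload/Literal pattern (illustration only).
    from typing import Literal, Union, overload


    class MediaLoader:

        @overload
        def fetch(self, url: str, *, load_type: Literal["PIL"]) -> str: ...

        @overload
        def fetch(self, url: str, *, load_type: Literal["bytes"]) -> bytes: ...

        def fetch(self, url: str, *,
                  load_type: Literal["PIL", "bytes"] = "PIL"
                  ) -> Union[str, bytes]:
            # One runtime body; the overloads only narrow the return type
            # that static type checkers report at each call site.
            data = b"raw-bytes-for-" + url.encode()
            return data if load_type == "bytes" else f"decoded:{url}"


    loader = MediaLoader()
    img = loader.fetch("http://x/y.jpg", load_type="PIL")    # checked as str
    raw = loader.fetch("http://x/y.jpg", load_type="bytes")  # checked as bytes
    assert isinstance(img, str) and isinstance(raw, bytes)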
- elif inputs_embeds is None: - if is_hpu: - raise AssertionError("hpu_model_runner should be computing \ - inputs_embeds") + elif inputs_embeds is None and not is_hpu: vision_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, From 2d5ad937b56cadf4220e763db4bdf8cf680b22b9 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Mon, 6 Oct 2025 13:54:29 -0700 Subject: [PATCH 19/20] Fix vit_embeds duplication issue when N breakdown > 1 (From 5902ec7) --- vllm/model_executor/models/internvl.py | 29 +++++++++++++++++++------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index fdb88e5bc83c..4fe54f16531c 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -1510,10 +1510,13 @@ def extract_feature(self, pixel_values: torch.Tensor) -> torch.Tensor: else: batch_breakdown = [pixel_values.shape[0]] + total_mb = len(batch_breakdown) + need_copy = total_mb > 1 + start_idx = 0 vit_embeds_minibatches = [] - for i in batch_breakdown: + for mb_idx, i in enumerate(batch_breakdown): end_idx = start_idx + i batch_sliced_pixel_values = \ pixel_values[start_idx:end_idx, ...] @@ -1541,14 +1544,24 @@ def extract_feature(self, pixel_values: torch.Tensor) -> torch.Tensor: vit_embeds_minibatch.shape[-1]) if is_lazy: - vit_embeds_minibatches += [ - self.mlp1(vit_embeds_minibatch, - bypass_hpu_graphs=i - not in self.graphed_multimodal_buckets - and len(self.graphed_multimodal_buckets) > 0) - ] + proj = self.mlp1( + vit_embeds_minibatch, + bypass_hpu_graphs=i + not in self.graphed_multimodal_buckets + and len(self.graphed_multimodal_buckets) > 0) else: - vit_embeds_minibatches += [self.mlp1(vit_embeds_minibatch)] + proj = self.mlp1(vit_embeds_minibatch) + + if need_copy and (mb_idx < total_mb - 1): + proj_safe = torch.empty_like(proj) + proj_safe.copy_(proj) + if is_lazy: + import habana_frameworks.torch as htorch + htorch.core.mark_step() + vit_embeds_minibatches.append(proj_safe) + else: + vit_embeds_minibatches.append(proj) + start_idx = end_idx vit_embeds = torch.cat(vit_embeds_minibatches, dim=0) else: From b96a809439940b95733e97e5e3ec854bfb062da8 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Wed, 8 Oct 2025 15:13:52 -0700 Subject: [PATCH 20/20] Fix review comment --- vllm/worker/hpu_model_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 787c33e5c0cb..442c723cc92b 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -4036,6 +4036,7 @@ def try_revert_dummy_output_tokens(): # Cache the prompt_tokens tensor that's already on HPU self.model.sampler._prompt_tokens_hpu_cache = \ sampling_tensors.prompt_tokens + if sampling_tensors.output_tokens.numel() > 0: self.model.sampler._output_tokens_hpu_cache = \ sampling_tensors.output_tokens if use_delayed_sampling \
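The duplication fix in this patch copies each minibatch's projected embeddings into a fresh tensor (and, in lazy mode, calls htorch.core.mark_step()) before the next minibatch runs. The working assumption is that a graph-wrapped module may return views of internal output buffers that the next replay overwrites, so results must be copied out first. A CPU-only schematic of that failure mode and the fix follows; DummyProjector is a stand-in for the graph-wrapped mlp1, not real HPU graph behavior.

    # Schematic of the copy-before-next-call pattern (CPU stand-in).
    import torch


    class DummyProjector:
        """Stand-in for a module that reuses one output buffer per call."""

        def __init__(self) -> None:
            self._out = torch.empty(2, 4)

        def __call__(self, x: torch.Tensor) -> torch.Tensor:
            self._out.copy_(x * 2.0)
            return self._out              # same storage on every call


    proj = DummyProjector()
    minibatches = [torch.ones(2, 4), torch.full((2, 4), 3.0)]

    unsafe = [proj(mb) for mb in minibatches]       # both entries alias _out
    safe = []
    for mb in minibatches:
        out = proj(mb)
        copied = torch.empty_like(out)
        copied.copy_(out)                           # detach from reused buffer
        # the patch also calls htorch.core.mark_step() here under lazy mode
        safe.append(copied)

    assert torch.equal(unsafe[0], unsafe[1])        # duplicated data (all 6.0)
    assert not torch.equal(safe[0], safe[1])        # per-minibatch data kept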