From d04ec3fc3811d1c93dceb09db2c8ef9b01dd3aec Mon Sep 17 00:00:00 2001
From: Christopher Manteuffel
Date: Fri, 5 Sep 2025 09:54:01 -0700
Subject: [PATCH 1/4] Initial import of internvl testing

---
 .../lm-eval-harness/configs/internvl3_5-14b.yaml     | 12 ++++++++++++
 .jenkins/lm-eval-harness/configs/models-internvl.txt |  1 +
 .jenkins/test_config.yaml                            | 10 ++++++++++
 .jenkins/vision/configs/internvl3_5-14b.yaml         |  7 +++++++
 .jenkins/vision/configs/models-internvl.txt          |  1 +
 .jenkins/vision/test_enc_dec_model.py                |  2 ++
 6 files changed, 33 insertions(+)
 create mode 100644 .jenkins/lm-eval-harness/configs/internvl3_5-14b.yaml
 create mode 100644 .jenkins/lm-eval-harness/configs/models-internvl.txt
 create mode 100644 .jenkins/vision/configs/internvl3_5-14b.yaml
 create mode 100644 .jenkins/vision/configs/models-internvl.txt

diff --git a/.jenkins/lm-eval-harness/configs/internvl3_5-14b.yaml b/.jenkins/lm-eval-harness/configs/internvl3_5-14b.yaml
new file mode 100644
index 000000000000..140534221721
--- /dev/null
+++ b/.jenkins/lm-eval-harness/configs/internvl3_5-14b.yaml
@@ -0,0 +1,12 @@
+model_name: "/path/to/internvl/3/5/14b"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.700
+  - name: "exact_match,flexible-extract"
+    value: 0.700
+limit: 256
+num_fewshot: 8
+dtype: "bfloat16"
+trust_remote_code: True
diff --git a/.jenkins/lm-eval-harness/configs/models-internvl.txt b/.jenkins/lm-eval-harness/configs/models-internvl.txt
new file mode 100644
index 000000000000..80225e5a6265
--- /dev/null
+++ b/.jenkins/lm-eval-harness/configs/models-internvl.txt
@@ -0,0 +1 @@
+internvl3_5-14b.yaml
\ No newline at end of file
diff --git a/.jenkins/test_config.yaml b/.jenkins/test_config.yaml
index 0ac488e1b150..14a5ece192f0 100644
--- a/.jenkins/test_config.yaml
+++ b/.jenkins/test_config.yaml
@@ -42,6 +42,11 @@ stages:
         command: >-
           export PT_HPU_LAZY_MODE=1 &&
           cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-gemma.txt -t 1
+      - name: v0_gsm8k_g3_internvl_3_5_tp1
+        flavor: g3.s
+        command: >-
+          export PT_HPU_LAZY_MODE=1 && export VLLM_SKIP_WARMUP=true &&
+          cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-internvl.txt -t 1
   - name: test_gsm8k_small_models_apc
     steps:
       - name: gsm8k_small_g3_tp1_apc
@@ -230,6 +235,11 @@ stages:
           cd .jenkins/vision &&
           PT_HPU_LAZY_MODE=1
           bash run-tests.sh -c configs/models-gemma.txt -t 1
+      - name: multimodal_internvl_g3_tp1_ep
+        flavor: g3.s
+        command: >-
+          cd .jenkins/vision &&
+          PT_HPU_LAZY_MODE=1 PT_HPUGRAPH_DISABLE_TENSOR_CACHE=0 bash run-tests.sh -c configs/models-internvl.txt -t 1
   - name: tests_int4_quantization
     steps:
       - name: test_awq
diff --git a/.jenkins/vision/configs/internvl3_5-14b.yaml b/.jenkins/vision/configs/internvl3_5-14b.yaml
new file mode 100644
index 000000000000..aa1f204723e7
--- /dev/null
+++ b/.jenkins/vision/configs/internvl3_5-14b.yaml
@@ -0,0 +1,7 @@
+model_name: "/path/to/internvl/3/5/14b"
+dtype: "bfloat16"
+max_model_len: 16384
+max_num_seqs: 32
+num_prompts: 4
+limit_mm_per_prompt_image: 5
+trust_remote_code: True
diff --git a/.jenkins/vision/configs/models-internvl.txt b/.jenkins/vision/configs/models-internvl.txt
new file mode 100644
index 000000000000..111eef9a1c5a
--- /dev/null
+++ b/.jenkins/vision/configs/models-internvl.txt
@@ -0,0 +1 @@
+internvl3_5-14b.yaml
diff --git a/.jenkins/vision/test_enc_dec_model.py b/.jenkins/vision/test_enc_dec_model.py
index a1571c64f41a..b023691b3b8d 100644
--- a/.jenkins/vision/test_enc_dec_model.py
+++ b/.jenkins/vision/test_enc_dec_model.py
@@ -22,6 +22,7 @@ def fail_on_exit():
 
 
 def launch_enc_dec_model(config, question, images):
+    trust_remote_code = config.get('trust_remote_code', False)
     model_name = config.get('model_name')
     dtype = config.get('dtype', 'bfloat16')
     max_num_seqs = config.get('max_num_seqs', 128)
@@ -41,6 +42,7 @@ def launch_enc_dec_model(config, question, images):
         enable_expert_parallel=enable_expert_parallel,
         enforce_eager=enforce_eager,
         limit_mm_per_prompt={"image": limit_mm_per_prompt_image},
+        trust_remote_code=trust_remote_code,
     )
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 

From 64360a2fad883902ab07677f5a4ce37b748a3e59 Mon Sep 17 00:00:00 2001
From: Christopher Manteuffel
Date: Wed, 17 Sep 2025 11:28:57 -0700
Subject: [PATCH 2/4] Update path to model on weka

---
 .jenkins/lm-eval-harness/configs/internvl3_5-14b.yaml | 2 +-
 .jenkins/vision/configs/internvl3_5-14b.yaml          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.jenkins/lm-eval-harness/configs/internvl3_5-14b.yaml b/.jenkins/lm-eval-harness/configs/internvl3_5-14b.yaml
index 140534221721..b5dc1ced2c93 100644
--- a/.jenkins/lm-eval-harness/configs/internvl3_5-14b.yaml
+++ b/.jenkins/lm-eval-harness/configs/internvl3_5-14b.yaml
@@ -1,4 +1,4 @@
-model_name: "/path/to/internvl/3/5/14b"
+model_name: "/mnt/weka/data/llm/opengvlab/internvl3-14b"
 tasks:
 - name: "gsm8k"
   metrics:
diff --git a/.jenkins/vision/configs/internvl3_5-14b.yaml b/.jenkins/vision/configs/internvl3_5-14b.yaml
index aa1f204723e7..ddf7c92a1be6 100644
--- a/.jenkins/vision/configs/internvl3_5-14b.yaml
+++ b/.jenkins/vision/configs/internvl3_5-14b.yaml
@@ -1,4 +1,4 @@
-model_name: "/path/to/internvl/3/5/14b"
+model_name: "/mnt/weka/data/llm/opengvlab/internvl3-14b"
 dtype: "bfloat16"
 max_model_len: 16384
 max_num_seqs: 32

From 36eb2cf9cec7ed62d40f29f28f2af48fef31794f Mon Sep 17 00:00:00 2001
From: Supreet Singh
Date: Thu, 18 Sep 2025 05:37:04 -0700
Subject: [PATCH 3/4] Fixes HPU graph run for Gemma3 vision inputs (#1865)

Fixes HPU graph issues for Gemma3 vision inputs:

- Text warmup now includes the attn_mask info, so vision+text data can
  reuse the language-model graph that has already been warmed up.
- Change slicing to index_select for multimodal bucketing on HPU; slicing
  does not produce the same hash for the HPU graph with the same input shape.
- Use buckets for the vision tower as well, to reduce GC recompiles.
- Fix an accuracy bug by cloning the output of the multi-modal projector.

Validated with MuirBench datasets.
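For illustration, a minimal standalone sketch of the slicing-vs-index_select
point above; the tensors, bucket plan, and projector here are placeholders,
not the real gemma3_mm.py objects:

    import torch

    pixel_values = torch.randn(7, 16)    # 7 dummy "images", feature dim 16
    projector = torch.nn.Linear(16, 8)   # stand-in for the multi-modal projector
    batch_breakdown = [4, 2, 1]          # bucket plan covering all 7 inputs

    image_embeds_multibatches, start_idx = [], 0
    for bucket in batch_breakdown:
        end_idx = start_idx + bucket
        # index_select (rather than pixel_values[start_idx:end_idx]) keeps the
        # selected batch hashing to the same HPU graph for a given bucket size.
        indices = torch.arange(start_idx, end_idx)
        batch = torch.index_select(pixel_values, dim=0, index=indices)
        # clone the projector output, mirroring the accuracy fix above
        image_embeds_multibatches.append(projector(batch).clone())
        start_idx = end_idx

    image_embeds = torch.cat(image_embeds_multibatches, dim=0)
    print(image_embeds.shape)            # torch.Size([7, 8])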
---
 .../configs/Qwen2.5-VL-7B-Instruct.yaml |  2 +-
 vllm/model_executor/models/gemma3_mm.py | 33 ++++++-------
 vllm/worker/hpu_model_runner.py         | 48 +++++++++++--------
 3 files changed, 44 insertions(+), 39 deletions(-)

diff --git a/.jenkins/vision/configs/Qwen2.5-VL-7B-Instruct.yaml b/.jenkins/vision/configs/Qwen2.5-VL-7B-Instruct.yaml
index 294b538633e5..840eaa12a922 100644
--- a/.jenkins/vision/configs/Qwen2.5-VL-7B-Instruct.yaml
+++ b/.jenkins/vision/configs/Qwen2.5-VL-7B-Instruct.yaml
@@ -1,5 +1,5 @@
 model_name: "/mnt/weka/data/pytorch/Qwen/Qwen2.5-VL-7B-Instruct/"
 dtype: "bfloat16"
-max_model_len: 32768
+max_model_len: 35840
 max_num_seqs: 32
 num_prompts: 4
diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index b80a4ab5951c..578e49e7fd78 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -569,11 +569,6 @@ def _process_image_input(
         pixel_values = image_input["pixel_values"]
         num_patches = image_input["num_patches"]
 
-        image_features = self._image_pixels_to_features(
-            self.vision_tower,
-            pixel_values,
-        )
-
         if is_hpu:
             batch_breakdown = greedy_plan(pixel_values.shape[0], \
                         self.vision_buckets.multimodal_buckets)
@@ -582,22 +577,24 @@ def _process_image_input(
 
             for i in batch_breakdown:
                 end_idx = start_idx + i
-                batch_sliced_image_features = \
-                    image_features[start_idx:end_idx, ...]
-                if is_lazy:
-                    image_embeds_multibatches += \
-                        [self.multi_modal_projector(
-                            batch_sliced_image_features,
-                            bypass_hpu_graphs=i
-                            not in self.graphed_multimodal_buckets
-                            and len(self.graphed_multimodal_buckets) > 0)]
-                else:
-                    image_embeds_multibatches += \
-                        [self.multi_modal_projector( \
-                            batch_sliced_image_features)]
+                indices = torch.arange(start_idx, end_idx)
+                batch_sliced_pixel_values = torch.index_select(pixel_values,
+                                                               dim=0,
+                                                               index=indices)
+
+                image_features = self._image_pixels_to_features(
+                    self.vision_tower,
+                    batch_sliced_pixel_values,
+                )
+                image_embeds = self.multi_modal_projector(image_features)
+                image_embeds_multibatches += [image_embeds.clone()]
                 start_idx = end_idx
             image_embeds = torch.cat(image_embeds_multibatches, dim=0)
         else:
+            image_features = self._image_pixels_to_features(
+                self.vision_tower,
+                pixel_values,
+            )
             image_embeds = self.multi_modal_projector(image_features)
         return [
             e.flatten(0, 1) for e in image_embeds.split(num_patches.tolist())
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 3fd9e2c78158..f400db0d7c07 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -373,7 +373,7 @@ def __init__(self, model, vllm_config, is_causal, sampler):
         if self.is_mm_optimized:
             if hasattr(self.model, 'vision_tower'):
                 self.model.vision_tower = htorch.hpu.wrap_in_hpu_graph(
-                    self.model.vision_tower, disable_tensor_cache=True)
+                    self.model.vision_tower, disable_tensor_cache=False)
             if hasattr(self.model, 'multi_modal_projector'):
                 self.model.multi_modal_projector = \
                     htorch.hpu.wrap_in_hpu_graph( \
@@ -619,13 +619,19 @@ def _update_metadata(self,
                                                       device, dtype, True)
         return attn_metadata
 
-    def compute_input_embeddings_for_mm_optimized(self, **kwargs):
+    def compute_input_embeddings_for_mm_optimized(self, warmup_mode, **kwargs):
         input_ids = kwargs['input_ids']
         vision_embeddings = self.model.get_multimodal_embeddings(**kwargs)
         inputs_embeds = self.model.get_input_embeddings(
            input_ids, vision_embeddings)
 
-        if vision_embeddings is not None:
+        # TODO: In warmup, we need to warmup the model with dummy image data for
+        # multimodal model for prompt, here instead of generating a dummy image,
+        # we are just generating attn_mask for the images and pass with
+        # attn_metadata, so we can reuse HPU graph without running
+        # the whole vision tower.
+        if vision_embeddings is not None or (
+                warmup_mode and kwargs['attn_metadata'].is_prompt):
             input_ids = kwargs['input_ids']
             positions = kwargs['positions']
             kwargs = self.model.prepare_attn_masks(
@@ -634,14 +640,16 @@ def compute_input_embeddings_for_mm_optimized(self, **kwargs):
             )
             kwargs['input_ids'] = input_ids
             kwargs['positions'] = positions
-            #input_ids = None
 
         kwargs.update({'inputs_embeds': inputs_embeds})
-        # done compute the visual tokens
+        # done compute the visual tokens and others
         kwargs.pop('pixel_values', None)
+        kwargs.pop("num_crops", None)
+        kwargs.pop("graphed_multimodal_buckets", None)
         return kwargs
 
-    def compute_input_embeddings_for_mrope_mm_optimized(self, **kwargs):
+    def compute_input_embeddings_for_mrope_mm_optimized(
+            self, warmup_mode, **kwargs):
         if 'inputs_embeds' in kwargs:
             return kwargs
 
@@ -680,7 +688,8 @@ def compute_input_embeddings_for_mrope_mm_optimized(self, **kwargs):
             kwargs.pop('image_grid_thw', None)
             return kwargs
         else:
-            return self.compute_input_embeddings_for_mm_optimized(**kwargs)
+            return self.compute_input_embeddings_for_mm_optimized(
+                warmup_mode, **kwargs)
 
     def forward(self, *args, **kwargs):
         kwargs = kwargs.copy()
@@ -692,9 +701,9 @@ def forward(self, *args, **kwargs):
         virtual_engine = kwargs.pop('virtual_engine')
 
         input_ids = kwargs['input_ids']
-        global_attn_masks = kwargs.get("global_attn_masks") \
+        global_attn_masks = kwargs.pop("global_attn_masks") \
             if kwargs.get("global_attn_masks") else None
-        local_attn_masks = kwargs.get("local_attn_masks") \
+        local_attn_masks = kwargs.pop("local_attn_masks") \
            if kwargs.get("local_attn_masks") else None
 
         kwargs['attn_metadata'] = self._update_metadata(
@@ -1396,12 +1405,8 @@ def get_model(self) -> torch.nn.Module:
             return self.model.model
         return self.model
 
-    def _use_graphs(self, img_args=None):
-        if not img_args:
-            return not self.enforce_eager
-        #TODO: We might need to check both language bucket and multimodal bucket
-        # and return True only it's avialble, or return separately.
-        return (img_args) in self.graphed_multimodal_buckets
+    def _use_graphs(self):
+        return not self.enforce_eager
 
     def _is_valid_bucket(self, bucket):
         return bucket[0] * bucket[1] <= self.max_num_batched_tokens
@@ -2667,7 +2672,7 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object:
 
     def create_dummy_multi_modal_seq_group_metadata(self, group_id, img_args,
                                                     sampling_params,
-                                                    lora_request):
+                                                    lora_request, seq_len):
         assert self.model_is_mrope or self.is_mm_optimized, \
             ("Warmup compatible with Qwen2vl/Gemma3 models")
         if img_args == UNSET_IMG_ARGS:
@@ -2712,7 +2717,9 @@ def create_dummy_multi_modal_seq_group_metadata(self, group_id, img_args,
         }
 
         image_token_id = self.get_model().config.image_token_id
-        prompt_token_ids = [image_token_id] * num_image_tokens
+        prompt_token_ids_image = [image_token_id] * num_image_tokens
+        prompt_token_ids = [0] * (
+            seq_len - len(prompt_token_ids_image)) + prompt_token_ids_image
         prompt_token_ids_array = array('l', prompt_token_ids)  # noqa: F821
         placeholders_by_modality = {
             'image':
@@ -2756,6 +2763,7 @@ def create_dummy_seq_group_metadata(self,
                 img_args=img_args,
                 sampling_params=sampling_params,
                 lora_request=lora_request,
+                seq_len=seq_len,
             )
         else:
             input_len = seq_len
@@ -2867,7 +2875,7 @@ def warmup_scenario(self,
                         align_worker=False,
                         is_dummy_run=False) -> None:
         phase = 'prompt' if is_prompt else 'decode'
-        use_graphs = is_dummy_run or self._use_graphs(img_args)
+        use_graphs = is_dummy_run or self._use_graphs()
 
         scenario_name = ("warmup_"
                         f"{phase}_"
@@ -3664,8 +3672,7 @@ def execute_model(
             if not warmup_mode:
                 ctx_blocks = seq_len
             seq_len = 1
-        img_args = self._get_img_args_from_model_input(model_input)
-        use_graphs = self._use_graphs(img_args=img_args)
+        use_graphs = self._use_graphs()
         self._check_config(batch_size, seq_len, ctx_blocks, attn_metadata,
                            warmup_mode)
         lora_mask: torch.Tensor = None
@@ -3831,6 +3838,7 @@ def try_revert_dummy_output_tokens():
                 # hpu graphs, hence turning it to a list
                 execute_model_kwargs = \
                     self.model.compute_input_embeddings_for_mrope_mm_optimized(
+                        warmup_mode,
                         **execute_model_kwargs
                     )
                 if warmup_mode and bypass_model_exec:

From ee517a2fcef8ecd72c307abf2b528bc6010a5a8d Mon Sep 17 00:00:00 2001
From: Artur Fierka
Date: Mon, 22 Sep 2025 14:42:44 +0200
Subject: [PATCH 4/4] Update common.txt (#1956)

Add the missing modelscope package: the `VLLM_USE_MODELSCOPE` env var
doesn't work without it.

---
 requirements/common.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements/common.txt b/requirements/common.txt
index a6a1ffe76196..be513f444a6e 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -48,3 +48,4 @@ opentelemetry-sdk>=1.26.0 # vllm.tracing
 opentelemetry-api>=1.26.0 # vllm.tracing
 opentelemetry-exporter-otlp>=1.26.0 # vllm.tracing
 opentelemetry-semantic-conventions-ai>=0.4.1 # vllm.tracing
+modelscope # required to support VLLM_USE_MODELSCOPE env
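
For context, a minimal sketch of how the new dependency is exercised once it
is installed; the model ID below is illustrative and not part of this CI
setup:

    import os

    # With the modelscope package installed, vLLM downloads weights from
    # ModelScope instead of the Hugging Face Hub when this env var is set.
    os.environ["VLLM_USE_MODELSCOPE"] = "true"

    from vllm import LLM

    llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")   # illustrative model ID
    print(llm.generate("Hello")[0].outputs[0].text)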