From 999068b81763b5fd5e3ff0a17ad50cd9b33ce028 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Wed, 5 Nov 2025 09:21:36 +0000 Subject: [PATCH 01/11] Continuous Batching for VLMs Signed-off-by: Asmita Goswami --- .../models/gemma3/modeling_gemma3.py | 119 +++++++++++------ .../models/llava/modeling_llava.py | 123 ++++++++++++------ 2 files changed, 162 insertions(+), 80 deletions(-) diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py index 398259d8b..234dff860 100644 --- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py +++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py @@ -610,6 +610,7 @@ def forward( image_idx, past_key_values, comp_ctx_lengths: Optional[List[int]] = None, + batch_index: Optional[torch.LongTensor] = None, ): inputs_embeds = self.model.get_input_embeddings()(input_ids) B, N, C = inputs_embeds.shape @@ -625,6 +626,7 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, comp_ctx_lengths=comp_ctx_lengths, + batch_index=batch_index, use_cache=True, ) image_idx = (indices1.max() + 1).unsqueeze(0).unsqueeze(0) @@ -684,6 +686,9 @@ def get_specializations( comp_ctx_lengths_prefill: Optional[List[int]] = None, comp_ctx_lengths_decode: Optional[List[int]] = None, kv_offload: bool = False, + continuous_batching: bool = False, + kv_cache_batch_size: Optional[int] = None, + full_batch_size: Optional[int] = None, **compiler_options, ): prefill_seq_len = prefill_seq_len if prefill_seq_len else 32 @@ -707,50 +712,74 @@ def get_specializations( lang = [] for i in range(0, len(comp_ctx_lengths_prefill)): - lang.append( - { - "batch_size": batch_size, - "seq_len": prefill_seq_len, - "ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_prefill[i], - "sliding_window": self.language_model.config.sliding_window, - "img_size": img_size, - "mm_tokens_per_image": mm_tokens_per_image, - } - ) - - for i in range(0, len(comp_ctx_lengths_decode)): - lang.append( - { - "batch_size": batch_size, - "seq_len": "1", - "ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_decode[i], - "sliding_window": self.language_model.config.sliding_window, - "img_size": img_size, - "mm_tokens_per_image": mm_tokens_per_image, - } - ) - - else: - lang = [ - { - "batch_size": batch_size, + lang_prefill = { + "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, + "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "sliding_window": self.language_model.config.sliding_window, "img_size": img_size, "mm_tokens_per_image": mm_tokens_per_image, - }, - { - "batch_size": batch_size, + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + lang.append(lang_prefill) + + for i in range(0, len(comp_ctx_lengths_decode)): + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, + "comp_ctx_lengths": comp_ctx_lengths_decode[i], "sliding_window": self.language_model.config.sliding_window, "img_size": img_size, "mm_tokens_per_image": mm_tokens_per_image, - }, - ] + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size + lang.append(lang_decode) + + else: + lang_prefill = { + 
"batch_size": 1 if continuous_batching else batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "sliding_window": self.language_model.config.sliding_window, + "img_size": img_size, + "mm_tokens_per_image": mm_tokens_per_image, + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, + "seq_len": "1", + "ctx_len": ctx_len, + "sliding_window": self.language_model.config.sliding_window, + "img_size": img_size, + "mm_tokens_per_image": mm_tokens_per_image, + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size + lang = [] + lang.append(lang_prefill) + lang.append(lang_decode) specializations = {} @@ -759,19 +788,23 @@ def get_specializations( specializations["lang"] = lang return specializations, compiler_options else: + lang[0].pop("vision_size") + lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False): + def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): # Define dynamic axes vision_dynamic_axes = {} lang_dynamic_axes = {} lang_dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"} lang_dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"} - lang_dynamic_axes["vision_embeds"] = {0: "batch_size", 1: "mm_tokens_per_image"} + lang_dynamic_axes["vision_embeds"] = {0: "vision_batch_size", 1: "mm_tokens_per_image"} + if continuous_batching: + lang_dynamic_axes["batch_index"] = {0: "batch_size"} vision_dynamic_axes["pixel_values"] = {0: "batch_size", 2: "img_size", 3: "img_size"} - pkv_dynamic_axes = {0: "batch_size", 2: "ctx_len"} - pkv_dynamic_sliding_axes = {0: "batch_size", 2: "sliding_window"} + pkv_dynamic_axes = {0: "full_batch_size" if continuous_batching else "batch_size", 2: "ctx_len"} + pkv_dynamic_sliding_axes = {0: "full_batch_size" if continuous_batching else "batch_size", 2: "sliding_window"} layer_switch = ( self.language_model.config.sliding_window_pattern if hasattr(self.language_model.config, "sliding_window_pattern") @@ -837,7 +870,7 @@ def get_dummy_pkv_cache(self, config, batch_size, seq_len): past_key_values.append(pkv) return past_key_values - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False): + def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", 896) else: @@ -876,15 +909,21 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) + + bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS + # Add data for KV lang_inputs["past_key_values"] = self.get_dummy_pkv_cache( config=self.language_model.config, - batch_size=constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, + batch_size=fbs if continuous_batching else bs, 
seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) if comp_ctx_lengths is not None: lang_inputs["comp_ctx_lengths"] = torch.randint(0, 100, (40,), dtype=torch.long) + if continuous_batching: + lang_inputs["batch_index"] = torch.arange(bs).view(bs, 1) inputs = {} if kv_offload: diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index dc6653db0..775d47768 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -18,6 +18,7 @@ from QEfficient.utils.logging_utils import logger BS = 1 +FBS = 4 NUM_CHANNEL = 3 SEQ_LEN = 592 CTX_LEN = 1024 @@ -61,6 +62,7 @@ def forward( image_idx, past_key_values, comp_ctx_lengths: Optional[List[int]] = None, + batch_index: Optional[torch.LongTensor] = None, ): inputs_embeds = self.model.get_input_embeddings()(input_ids) vision_embeds = vision_embeds.to(inputs_embeds.device, inputs_embeds.dtype) @@ -76,6 +78,7 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, comp_ctx_lengths=comp_ctx_lengths, + batch_index=batch_index, return_dict=True, ) @@ -140,7 +143,7 @@ def forward( image_idx = torch.where(image_idx < next_image_idx, next_image_idx, image_idx) return logits, pixel_values, image_idx, outputs.past_key_values - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, **kwargs): + def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False, **kwargs): num_layers = self.config.text_config.num_hidden_layers num_key_value_heads = self.config.text_config.num_key_value_heads head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads @@ -165,8 +168,8 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl for i in range(num_layers): lang_inputs["past_key_values"].append( ( - torch.zeros(BS, num_key_value_heads, CTX_LEN, head_dim), - torch.zeros(BS, num_key_value_heads, CTX_LEN, head_dim), + torch.zeros(FBS if continuous_batching else BS, num_key_value_heads, CTX_LEN, head_dim), + torch.zeros(FBS if continuous_batching else BS, num_key_value_heads, CTX_LEN, head_dim), ) ) lang_inputs["position_ids"] = torch.full(lang_inputs["position_ids"].shape, CTX_LEN - 1) @@ -174,6 +177,8 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl if comp_ctx_lengths is not None: lang_inputs["comp_ctx_lengths"] = torch.randint(0, 100, (40,), dtype=torch.long) + if continuous_batching: + lang_inputs["batch_index"] = torch.arange(BS).view(BS, 1) inputs = {} if kv_offload: @@ -193,6 +198,9 @@ def get_specializations( comp_ctx_lengths_prefill: Optional[List[int]] = None, comp_ctx_lengths_decode: Optional[List[int]] = None, kv_offload: bool = False, + continuous_batching: bool = False, + kv_cache_batch_size: Optional[int] = None, + full_batch_size: Optional[int] = None, **compiler_options, ): max_num_images = compiler_options.pop("max_num_images", 1) @@ -218,49 +226,74 @@ def get_specializations( lang = [] for i in range(0, len(comp_ctx_lengths_prefill)): - lang.append( - { - "batch_size": batch_size, - "seq_len": prefill_seq_len, - "ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_prefill[i], - "max_num_images": max_num_images, - "img_size": img_size, - "vision_size": vision_size, - } - ) - - for i in range(0, len(comp_ctx_lengths_decode)): - lang.append( - { - "batch_size": batch_size, - "seq_len": "1", - 
"ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_decode[i], - "max_num_images": max_num_images, - "img_size": img_size, - "vision_size": vision_size, - } - ) - else: - lang = [ - { - "batch_size": batch_size, + lang_prefill = { + "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, + "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "max_num_images": max_num_images, "img_size": img_size, "vision_size": vision_size, - }, - { - "batch_size": batch_size, + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + lang.append(lang_prefill) + + for i in range(0, len(comp_ctx_lengths_decode)): + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size,, "seq_len": "1", "ctx_len": ctx_len, + "comp_ctx_lengths": comp_ctx_lengths_decode[i], "max_num_images": max_num_images, "img_size": img_size, "vision_size": vision_size, - }, - ] + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size + lang.append(lang_decode) + else: + lang_prefill = { + "batch_size": 1 if continuous_batching else batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "max_num_images": max_num_images, + "img_size": img_size, + "vision_size": vision_size, + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, + "seq_len": "1", + "ctx_len": ctx_len, + "max_num_images": max_num_images, + "img_size": img_size, + "vision_size": vision_size, + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size + + lang = [] + lang.append(lang_prefill) + lang.append(lang_decode) specializations = {} @@ -269,9 +302,11 @@ def get_specializations( specializations["lang"] = lang return specializations, compiler_options else: + lang[0].pop("vision_size") + lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False): + def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): # Define dynamic axes num_layers = self.config.text_config.num_hidden_layers @@ -281,11 +316,19 @@ def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv lang_dynamic_axes = { "input_ids": {0: "batch_size", 1: "seq_len"}, "position_ids": {0: "batch_size", 1: "seq_len"}, - "vision_embeds": {0: "batch_size", 1: "vision_size"}, + "vision_embeds": {0: "vision_batch_size", 1: "vision_size"}, } + if continuous_batching: + lang_dynamic_axes["batch_index"] = {0: "batch_size"} for i in range(num_layers): - lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"} - lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"} + lang_dynamic_axes[f"past_key.{i}"] = { + 0: "full_batch_size" if continuous_batching else "batch_size", + 2: 
"ctx_len", + } + lang_dynamic_axes[f"past_value.{i}"] = { + 0: "full_batch_size" if continuous_batching else "batch_size", + 2: "ctx_len", + } if comp_ctx_lengths is not None: lang_dynamic_axes["comp_ctx_lengths"] = {0: "comp_ctx_lengths"} From 1220cf99f07c1203730161f4d8afc91242077736 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Mon, 10 Nov 2025 12:30:54 +0000 Subject: [PATCH 02/11] Added CB support for InternVL Signed-off-by: Asmita Goswami --- QEfficient/generation/embedding_handler.py | 85 ++++++++++++- QEfficient/generation/vlm_generation.py | 8 ++ .../models/internvl/modeling_internvl.py | 113 ++++++++++++------ .../transformers/models/modeling_auto.py | 5 + examples/internvl_CB_example.py | 98 +++++++++++++++ 5 files changed, 268 insertions(+), 41 deletions(-) create mode 100644 examples/internvl_CB_example.py diff --git a/QEfficient/generation/embedding_handler.py b/QEfficient/generation/embedding_handler.py index 76da7afc2..f18e84179 100644 --- a/QEfficient/generation/embedding_handler.py +++ b/QEfficient/generation/embedding_handler.py @@ -12,13 +12,14 @@ operations, separating them from the main text generation logic. """ -from typing import Any, Dict, Optional, Tuple +from io import BytesIO +from typing import Any, Dict, List, Optional, Tuple import numpy as np import requests import torch from PIL import Image -from transformers import AutoImageProcessor +from transformers import AutoImageProcessor, AutoTokenizer from QEfficient.generation.cloud_infer import QAICInferenceSession from QEfficient.utils.logging_utils import logger @@ -37,6 +38,9 @@ def __init__( qeff_model: Optional[QAICInferenceSession], vision_session: Optional[QAICInferenceSession], processor: Optional[AutoImageProcessor], + tokenizer: Optional[AutoTokenizer], + image_height: Optional[int] = None, + image_width: Optional[int] = None, config: Optional[Dict[str, Any]] = None, lang_session: Optional[QAICInferenceSession] = None, ): @@ -46,12 +50,16 @@ def __init__( Args: vision_session: QAICInferenceSession for vision model processor: AutoImageProcessor for image preprocessing + tokenizer: AutoTokenizer for text tokenization config: Configuration dictionary with vision model parameters lang_session: Optional language session for coordination (to avoid resource conflicts) """ self._qeff_model = qeff_model self._vision_session = vision_session self._processor = processor + self._tokenizer = tokenizer + self._image_height = image_height + self._image_width = image_width self._config = config or {} self._lang_session = lang_session # Store language session for coordination @@ -70,6 +78,71 @@ def is_available(self) -> bool: """ return self._vision_session is not None and self._processor is not None + def prepare_internVL_inputs(self, img_url: str, query: str) -> Dict[str, np.ndarray]: + """ + Prepare inputs for InternVL model + + Args: + image_url: URL or path to image + query: Text query to process with image + prompt = [query] + """ + if not self._tokenizer: + raise ValueError("Tokenizer is required for InternVL input preparation") + prompt = query + pixel_values = [] + num_patches_list = [] + questions = [] + img = requests.get(img_url, stream=True) + image = Image.open(BytesIO(img.content)).convert("RGB") + + if self._image_height and self._image_width: + image = image.resize((self._image_height, self._image_width)) + else: + logger.warning("Height and Width not specified. 
Using default image size for num_patches = 13.") + image = image.resize((1000, 747)) + + # preprocess the resized image + pixel_value = self._processor.load_image(image, max_num=12) + num_patches_list.append(pixel_value.shape[0]) + pixel_values.append(pixel_value) + + question = "\n" + prompt + questions.append(question) + + pixel_values = torch.cat(pixel_values, dim=0) + + # Chat Template information for prompt preprocessing + messages: List[List[str]] = [] + roles = ("<|im_start|>user\n", "<|im_start|>assistant\n") + prompt = self._processor(pixel_values, questions, messages, roles, num_patches_list=num_patches_list) + + inputs = self._tokenizer(prompt, return_tensors="pt") + inputs["pixel_values"] = pixel_values.clone() + + # Convert to numpy arrays + vision_inputs = {} + for k, v in inputs.items(): + if k in { + "pixel_values", + "image_masks", + "image_input_idx", + "valid_idx", + "aspect_ratio_ids", + "aspect_ratio_mask", + }: + vision_inputs[k] = np.array(v) + + # Convert specific inputs to float16 + vision_inputs_fp16 = {"pixel_values", "image_masks"} + for k in vision_inputs_fp16: + if k in vision_inputs: + vision_inputs[k] = vision_inputs[k].astype("float16") + + lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs} + + return vision_inputs, lang_inputs + def prepare_vlm_inputs(self, image_url: str, query: str, prefill_seq_len: int) -> Dict[str, np.ndarray]: """ Download and preprocess image into model inputs @@ -323,7 +396,13 @@ def get_processed_inputs( try: ## Get vlm inputs ## - vision_inputs, lang_inputs = self.prepare_vlm_inputs(image_url, query, prefill_seq_len) + if ( + hasattr(self._qeff_model.model.config, "model_type") + and self._qeff_model.model.config.model_type == "internvl_chat" + ): + vision_inputs, lang_inputs = self.prepare_internVL_inputs(image_url, query) + else: + vision_inputs, lang_inputs = self.prepare_vlm_inputs(image_url, query, prefill_seq_len) # Handle padding for language model pad_token_id = 1 diff --git a/QEfficient/generation/vlm_generation.py b/QEfficient/generation/vlm_generation.py index 5eb91d142..dd5f579a8 100644 --- a/QEfficient/generation/vlm_generation.py +++ b/QEfficient/generation/vlm_generation.py @@ -88,6 +88,8 @@ def __init__( enable_debug_logs: bool = False, write_io_dir: Optional[str] = None, full_batch_size: Optional[int] = None, + image_height: Optional[int] = None, + image_width: Optional[int] = None, is_tlm: bool = False, include_sampler: bool = False, return_pdfs: bool = False, @@ -143,6 +145,9 @@ def __init__( ) self.qeff_model = qeff_model self.processor = processor + self.tokenizer = tokenizer + self.image_height = image_height + self.image_width = image_width self._vision_qpc_path = vision_qpc_path self.device_id = device_id # Store device_id for vision components self.enable_debug_logs = enable_debug_logs # Store for vision components @@ -173,6 +178,9 @@ def _init_vision_components(self): qeff_model=self.qeff_model, vision_session=self._vision_session, processor=self.processor, + tokenizer=self.tokenizer, + image_height=self.image_height, + image_width=self.image_width, config=vision_config, lang_session=self._session, # Pass language session for coordination ) diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index 96c59325f..fb0275acc 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -44,6 +44,7 @@ def forward( image_idx, 
past_key_values, comp_ctx_lengths: Optional[List[int]] = None, + batch_index: Optional[torch.LongTensor] = None, ): input_embeds = self.model.language_model.get_input_embeddings()(input_ids) B, N, C = input_embeds.shape @@ -69,6 +70,7 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, comp_ctx_lengths=comp_ctx_lengths, + batch_index=batch_index, use_cache=True, ) image_idx = (indices1.max() + 1).unsqueeze(0).unsqueeze(0) @@ -91,6 +93,9 @@ def get_specializations( comp_ctx_lengths_prefill: Optional[List[int]] = None, comp_ctx_lengths_decode: Optional[List[int]] = None, kv_offload: bool = False, + continuous_batching: bool = False, + kv_cache_batch_size: Optional[int] = None, + full_batch_size: Optional[int] = None, **compiler_options, ): num_patches = compiler_options.pop("num_patches", None) @@ -124,50 +129,73 @@ def get_specializations( lang = [] for i in range(0, len(comp_ctx_lengths_prefill)): - lang.append( - { - "batch_size": batch_size, - "seq_len": prefill_seq_len, - "ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_prefill[i], - "num_patches": num_patches, - "img_size": img_size, - "vision_size": vision_size, - } - ) - - for i in range(0, len(comp_ctx_lengths_decode)): - lang.append( - { - "batch_size": batch_size, - "seq_len": "1", - "ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_decode[i], - "num_patches": num_patches, - "img_size": img_size, - "vision_size": vision_size, - } - ) - - else: - lang = [ - { - "batch_size": batch_size, + lang_prefill = { + "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, + "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "num_patches": num_patches, "img_size": img_size, "vision_size": vision_size, - }, - { - "batch_size": batch_size, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + lang.append(lang_prefill) + + for i in range(0, len(comp_ctx_lengths_decode)): + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, + "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "num_patches": num_patches, "img_size": img_size, "vision_size": vision_size, - }, - ] + } + + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size + lang.append(lang_decode) + + else: + lang_prefill = { + "batch_size": 1 if continuous_batching else batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "num_patches": num_patches, + "img_size": img_size, + "vision_size": vision_size, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, + "seq_len": "1", + "ctx_len": ctx_len, + "num_patches": num_patches, + "img_size": img_size, + "vision_size": vision_size, + } + + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size + + lang = [] + lang.append(lang_prefill) + lang.append(lang_decode) specializations = {} @@ -176,18 +204,22 @@ def get_specializations( specializations["lang"] = lang return specializations, 
compiler_options else: + lang[0].pop("vision_size") + lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False): + def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): # Define dynamic axes vision_dynamic_axes = {} lang_dynamic_axes = {} lang_dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"} lang_dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"} lang_dynamic_axes["vision_embeds"] = {1: "vision_size"} + if continuous_batching: + lang_dynamic_axes["batch_index"] = {0: "batch_size"} vision_dynamic_axes["pixel_values"] = {0: "batched_num_patches", 2: "img_size", 3: "img_size"} - pkv_dynamic_axes = {0: "batch_size", 2: "ctx_len"} + pkv_dynamic_axes = {0: "full_batch_size" if continuous_batching else "batch_size", 2: "ctx_len"} for i in range(self.language_model.config.num_hidden_layers): for kv in ["key", "value"]: lang_dynamic_axes[f"past_{kv}.{i}"] = pkv_dynamic_axes @@ -222,7 +254,7 @@ def get_output_names(self, kv_offload: bool = False): return lang_output_names return output_names - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False): + def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", constants.INTERN_IMG_SIZE) else: @@ -271,10 +303,13 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl ) lang_inputs["image_idx"] = torch.zeros((1, 1), dtype=torch.int64) + bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS + # Add data for KV kv_cache_shape = get_padding_shape_from_config( config=self.language_model.config, - batch_size=constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, + batch_size=fbs if continuous_batching else bs, seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) @@ -285,6 +320,8 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl if comp_ctx_lengths is not None: lang_inputs["comp_ctx_lengths"] = torch.randint(0, 100, (40,), dtype=torch.long) + if continuous_batching: + lang_inputs["batch_index"] = torch.arange(bs).view(bs, 1) inputs = {} if kv_offload: diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 5f1ec51e6..f3c5c6a7c 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1215,6 +1215,8 @@ def generate( device_ids: List[int] = None, runtime_ai100: bool = True, generation_len: Optional[int] = None, + image_height: Optional[int] = None, + image_width: Optional[int] = None, ) -> Union[torch.Tensor, np.ndarray]: """ Generates output by executing the compiled QPC(s) on Cloud AI 100 Hardware cards. 
@@ -1273,6 +1275,8 @@ def generate( full_batch_size=fbs, comp_ctx_lengths_prefill=self.comp_ctx_lengths_prefill, comp_ctx_lengths_decode=self.comp_ctx_lengths_decode, + image_height=image_height, + image_width=image_width, ) # Call generate method @@ -2401,6 +2405,7 @@ def from_pretrained( kv_offload=kv_offload, pretrained_model_name_or_path=pretrained_model_name_or_path, qaic_config=qaic_config, + continuous_batching=continuous_batching, **kwargs, ) return cls( diff --git a/examples/internvl_CB_example.py b/examples/internvl_CB_example.py new file mode 100644 index 000000000..486f9db6c --- /dev/null +++ b/examples/internvl_CB_example.py @@ -0,0 +1,98 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + +from QEfficient import QEFFAutoModelForCausalLM +from QEfficient.utils.test_utils import InternProcessor + +model_id = "OpenGVLab/InternVL2_5-1B" +config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) +# For Testing Purpose Only +config.llm_config.num_hidden_layers = 2 +config.vision_config.num_hidden_layers = 2 + +model_hf = AutoModelForCausalLM.from_pretrained( + model_id, + low_cpu_mem_usage=False, + trust_remote_code=True, + config=config, +) + +tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=False) +processor = InternProcessor(model_hf, tokenizer) + + +continuous_batching = True +if continuous_batching: + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_id, + attn_implementation="eager", + kv_offload=True, + config=config, + continuous_batching=True, + trust_remote_code=True, + ) + + qeff_model.compile( + num_patches=13, # Set num_patches according to image_height and image_width, default is 13 (747 x 1000) + prefill_seq_len=128, + ctx_len=4096, + num_cores=16, + num_devices=4, + batch_size=1, + full_batch_size=1, + mxfp6_matmul=True, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + mos=1, + ) +else: + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_id, attn_implementation="eager", kv_offload=True, config=config, trust_remote_code=True + ) + + qeff_model.compile( + num_patches=13, + prefill_seq_len=128, + ctx_len=4096, + num_cores=16, + num_devices=4, + batch_size=1, + mxfp6_matmul=True, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + ) + +image_urls = [ + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", +] + +prompts = [ + "Can you describe the image in detail?", + "What are the objects in the image?", + "What is the main subject of the image?", + "What colors are predominant in the image?", +] + +exec_info = qeff_model.generate( + tokenizer=tokenizer, + prompts=prompts, + processor=processor, + images=image_urls, + device_ids=[0, 1, 2, 3], + generation_len=10, + image_height=747, + image_width=1000, +) + +print("Generated texts:", 
exec_info.generated_texts) +print("Generated IDs:", exec_info.generated_ids) +print(exec_info) From c39ae015cf5fe88842ef9d0e82fe5d9d73f6e718 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Mon, 10 Nov 2025 12:42:23 +0000 Subject: [PATCH 03/11] Added CB support for Mistral3 Signed-off-by: Asmita Goswami --- QEfficient/generation/embedding_handler.py | 3 + .../models/internvl/modeling_internvl.py | 2 +- .../models/mistral3/modeling_mistral3.py | 115 ++++++++++++------ examples/internvl_CB_example.py | 2 +- 4 files changed, 82 insertions(+), 40 deletions(-) diff --git a/QEfficient/generation/embedding_handler.py b/QEfficient/generation/embedding_handler.py index f18e84179..d196a23a2 100644 --- a/QEfficient/generation/embedding_handler.py +++ b/QEfficient/generation/embedding_handler.py @@ -168,6 +168,9 @@ def prepare_vlm_inputs(self, image_url: str, query: str, prefill_seq_len: int) - else: image = Image.open(image_url) + if "mistral3" in self._qeff_model.model.config.model_type: + image = image.resize((1540, 1540)) + # Prepare conversation format conversation = [ { diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index fb0275acc..eb5a4b475 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -151,7 +151,7 @@ def get_specializations( "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_prefill[i], + "comp_ctx_lengths": comp_ctx_lengths_decode[i], "num_patches": num_patches, "img_size": img_size, "vision_size": vision_size, diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py b/QEfficient/transformers/models/mistral3/modeling_mistral3.py index 694ed4cde..afe838f74 100644 --- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py +++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py @@ -176,6 +176,7 @@ def forward( image_idx, past_key_values, comp_ctx_lengths: Optional[List[int]] = None, + batch_index: Optional[torch.LongTensor] = None, ): inputs_embeds = self.model.get_input_embeddings()(input_ids) vision_embeds = vision_embeds.to(inputs_embeds.device, inputs_embeds.dtype) @@ -190,6 +191,7 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, comp_ctx_lengths=comp_ctx_lengths, + batch_index=batch_index, ) # Cast to int32 to avoid ONNXRT issue @@ -250,7 +252,7 @@ def forward( return logits, pixel_values, image_idx, outputs.past_key_values - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, **kwargs): + def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False, **kwargs): inputs_shapes = {} inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) height = self.config.vision_config.image_size @@ -290,10 +292,14 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) + + bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS + # Add data for KV kv_cache_shape = get_padding_shape_from_config( - config=self.language_model.config, - 
batch_size=constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, + config=self.model.config.text_config, + batch_size=fbs if continuous_batching else bs, seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) @@ -304,6 +310,8 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl if comp_ctx_lengths is not None: lang_inputs["comp_ctx_lengths"] = torch.randint(0, 100, (40,), dtype=torch.long) + if continuous_batching: + lang_inputs["batch_index"] = torch.arange(bs).view(bs, 1) inputs = {} if kv_offload: @@ -324,6 +332,9 @@ def get_specializations( comp_ctx_lengths_prefill: Optional[List[int]] = None, comp_ctx_lengths_decode: Optional[List[int]] = None, kv_offload: bool = False, + continuous_batching: bool = False, + kv_cache_batch_size: Optional[int] = None, + full_batch_size: Optional[int] = None, **compiler_options, ): if img_size is None and hasattr(self.config.vision_config, "image_size"): @@ -352,46 +363,65 @@ def get_specializations( lang = [] for i in range(0, len(comp_ctx_lengths_prefill)): - lang.append( - { - "batch_size": batch_size, - "seq_len": prefill_seq_len, - "ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_prefill[i], - "image_size": img_size, - "vision_size": vision_size, - } - ) - - # Remaining elements use comp_ctx_lengths[1:] in a loop - for i in range(0, len(comp_ctx_lengths_decode)): - lang.append( - { - "batch_size": batch_size, - "seq_len": "1", - "ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_decode[i], - "image_size": img_size, - "vision_size": vision_size, - } - ) - else: - lang = [ - { - "batch_size": batch_size, + lang_prefill = { + "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, + "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "image_size": img_size, "vision_size": vision_size, - }, - { - "batch_size": batch_size, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + lang.append(lang_prefill) + + # Remaining elements use comp_ctx_lengths[1:] in a loop + for i in range(0, len(comp_ctx_lengths_decode)): + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, + "comp_ctx_lengths": comp_ctx_lengths_decode[i], "image_size": img_size, "vision_size": vision_size, - }, - ] + } + + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size + lang.append(lang_decode) + else: + lang_prefill = { + "batch_size": 1 if continuous_batching else batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "image_size": img_size, + "vision_size": vision_size, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, + "seq_len": "1", + "ctx_len": ctx_len, + "image_size": img_size, + "vision_size": vision_size, + } + + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size specializations = {} @@ -404,7 +434,7 @@ def get_specializations( lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, 
comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False): + def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): # Define dynamic axes num_layers = self.config.text_config.num_hidden_layers @@ -417,9 +447,18 @@ def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv "vision_embeds": {0: "vision_size"}, } + if continuous_batching: + lang_dynamic_axes["batch_index"] = {0: "batch_size"} + for i in range(num_layers): - lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"} - lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"} + lang_dynamic_axes[f"past_key.{i}"] = { + 0: "full_batch_size" if continuous_batching else "batch_size", + 2: "ctx_len", + } + lang_dynamic_axes[f"past_value.{i}"] = { + 0: "full_batch_size" if continuous_batching else "batch_size", + 2: "ctx_len", + } if comp_ctx_lengths is not None: lang_dynamic_axes["comp_ctx_lengths"] = {0: "comp_ctx_lengths"} diff --git a/examples/internvl_CB_example.py b/examples/internvl_CB_example.py index 486f9db6c..29cb9a5c4 100644 --- a/examples/internvl_CB_example.py +++ b/examples/internvl_CB_example.py @@ -45,7 +45,7 @@ num_cores=16, num_devices=4, batch_size=1, - full_batch_size=1, + full_batch_size=4, mxfp6_matmul=True, mxint8_kv_cache=True, aic_enable_depth_first=True, From 39f5c1649025809595bf5aa7b6fb3cadf580e5aa Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 11 Nov 2025 08:15:11 +0000 Subject: [PATCH 04/11] Updated test_image_text_to_text for CB tests Signed-off-by: Asmita Goswami --- QEfficient/utils/run_utils.py | 48 +++++++++++++ .../models/test_image_text_to_text_models.py | 67 ++++++++++++++++++- 2 files changed, 112 insertions(+), 3 deletions(-) diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py index c54dadeac..0f82fb027 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -276,6 +276,54 @@ def __init__( self.config = config self.gen_len = max_gen_len + @torch.no_grad() + def run_vlm_hf_model_on_pytorch_CB(self, model, images, queries): + """ + Function responsible for running HuggingFace ``PyTorch`` model for continuous batching + and return the output tokens for each prompt/image pair. 
+ + ``Mandatory`` Args: + :model (torch.nn.module): Original ``PyTorch`` model + :images (List[PIL.Image]): List of input images + :queries (List[str]): List of input queries + + Return: + :List[numpy.ndarray]: List of generated output tokens for each prompt + """ + generated_ids = [] + + for idx, (image, query) in enumerate(zip(images, queries)): + # Prepare conversation format for each image-query pair + conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": query}, + {"type": "image"}, + ], + }, + ] + prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True) + + # Process inputs + inputs = self.processor(images=image, text=prompt, return_tensors="pt") + if "pixel_values" in inputs: + inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + + # Generate tokens + output = model.generate(**inputs, max_new_tokens=self.gen_len, do_sample=False) + offset_output = output[0, inputs["input_ids"].shape[1]:] + + # Decode and print output + py_output = self.processor.tokenizer.decode(offset_output).strip() + print(f"Original HF Model Outputs (Torch CPU) for prompt {idx}:") + print("Query:", repr(query)) + print("Completion:", repr(py_output)) + + generated_ids.append(offset_output.numpy()) + + return generated_ids + @torch.no_grad() def run_vlm_hf_model_on_pytorch(self, model, inputs): output = model.generate(**inputs, max_new_tokens=self.gen_len, do_sample=False) diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index e6a145195..5d095fe87 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -38,6 +38,7 @@ # model_name, # kv_offload, # batch_size, + # full_batch_size, # prompt_len, # ctx_len, # img_size, @@ -49,6 +50,7 @@ "llava-hf/llava-1.5-7b-hf", True, 1, + 4, 784, 1024, 336, @@ -60,6 +62,7 @@ "llava-hf/llava-1.5-7b-hf", False, 1, + 4, 784, 1024, 336, @@ -72,6 +75,7 @@ # "meta-llama/Llama-4-Scout-17B-16E-Instruct", # True, # 1, + # 4, # 128, # 3072, # 336, @@ -83,6 +87,7 @@ # "meta-llama/Llama-4-Scout-17B-16E-Instruct", # False, # 1, + # 4, # 128, # 3072, # 336, @@ -94,6 +99,7 @@ "google/gemma-3-4b-it", True, 1, + 4, 128, 3072, 896, @@ -105,6 +111,7 @@ "google/gemma-3-4b-it", False, 1, + 4, 128, 3072, 896, @@ -116,6 +123,7 @@ "mistralai/Mistral-Small-3.1-24B-Instruct-2503", True, 1, + 4, 128, 4096, 1540, @@ -127,6 +135,7 @@ "mistralai/Mistral-Small-3.1-24B-Instruct-2503", False, 1, + 4, 128, 4096, 1540, @@ -138,6 +147,7 @@ "Qwen/Qwen2.5-VL-3B-Instruct", True, 1, + 4, 128, 4096, 1540, @@ -149,6 +159,7 @@ # "meta-llama/Llama-3.2-11B-Vision-Instruct", # True, # 1, + # 4, # 32, # 512, # 560, @@ -256,6 +267,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( query: str, prompt_len: int, ctx_len: int, + full_batch_size: int, max_gen_len: int = 20, batch_size: int = 1, n_layer: int = 1, @@ -341,8 +353,56 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) qpc_tokens = output.generated_ids[:, :-1] assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output" - return + # testing for CB models + if not kv_offload: # CB not yet enabled for Single QPC + return + images = [image] * full_batch_size + queries = [query] * full_batch_size + + streamer = TextStreamer(processor.tokenizer) + pytorch_hf_tokens = 
api_runner.run_vlm_hf_model_on_pytorch_CB(model_hf, images, queries) + + qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( + model_config["model_name"], + kv_offload=kv_offload, + config=config, + continuous_batching=True, + ) + + qeff_model.export() + + if not get_available_device_id(): + pytest.skip("No available devices to run model on Cloud AI 100") + + qeff_model.compile( + img_size=model_config["img_size"], + num_cores=16, + num_devices=num_devices, + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + batch_size=batch_size, + full_batch_size=full_batch_size, + mxfp6_matmul=True, + enable_qnn=enable_qnn, + qnn_config=qnn_config, + ) + + print("QPC Outputs (QAIC):") + exec_info = qeff_model.generate( + tokenizer=processor.tokenizer, + processor=processor, + images=[img_url] * full_batch_size, + prompts=queries, + generation_len=max_gen_len, + ) + + qpc_tokens = exec_info.generated_ids[:, :max_gen_len] + + for i in range(full_batch_size): + assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), f"Tokens don't match for prompt {i} between HF and QPC output" + + return def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name: str, @@ -527,10 +587,10 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.multimodal @pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config + "model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config ) def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer + model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer ): """ Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, without continuous batching. 
@@ -547,6 +607,7 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( query=query, n_layer=n_layer, batch_size=batch_size, + full_batch_size=full_batch_size, kv_offload=kv_offload, ) From 9a42a081da61bf10b8a087f6087eaa7fe4172d12 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 11 Nov 2025 08:18:34 +0000 Subject: [PATCH 05/11] Ruff format Signed-off-by: Asmita Goswami --- QEfficient/utils/run_utils.py | 2 +- .../models/test_image_text_to_text_models.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py index 0f82fb027..59e3f9bf4 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -312,7 +312,7 @@ def run_vlm_hf_model_on_pytorch_CB(self, model, images, queries): # Generate tokens output = model.generate(**inputs, max_new_tokens=self.gen_len, do_sample=False) - offset_output = output[0, inputs["input_ids"].shape[1]:] + offset_output = output[0, inputs["input_ids"].shape[1] :] # Decode and print output py_output = self.processor.tokenizer.decode(offset_output).strip() diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index 5d095fe87..11fcf6857 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -355,7 +355,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output" # testing for CB models - if not kv_offload: # CB not yet enabled for Single QPC + if not kv_offload: # CB not yet enabled for Single QPC return images = [image] * full_batch_size queries = [query] * full_batch_size @@ -400,10 +400,13 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( qpc_tokens = exec_info.generated_ids[:, :max_gen_len] for i in range(full_batch_size): - assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), f"Tokens don't match for prompt {i} between HF and QPC output" + assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), ( + f"Tokens don't match for prompt {i} between HF and QPC output" + ) return + def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name: str, img_url: str, @@ -587,7 +590,8 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.multimodal @pytest.mark.parametrize( - "model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config + "model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", + test_models_config, ) def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer From c1465c8ede242bbac1590d99d06329f0c5008d5a Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Sun, 16 Nov 2025 16:54:54 +0000 Subject: [PATCH 06/11] Added CB update for Molmo Signed-off-by: Asmita Goswami --- QEfficient/generation/embedding_handler.py | 60 +++++++++++++ .../models/gemma3/modeling_gemma3.py | 8 +- .../models/internvl/modeling_internvl.py | 8 +- .../models/llava/modeling_llava.py | 14 ++- .../models/mistral3/modeling_mistral3.py | 12 ++- .../models/molmo/modeling_molmo.py | 87 ++++++++++++++++--- .../models/test_image_text_to_text_models.py | 57 +++++++++++- 7 files changed, 222 insertions(+), 
24 deletions(-) diff --git a/QEfficient/generation/embedding_handler.py b/QEfficient/generation/embedding_handler.py index d196a23a2..b3ba55098 100644 --- a/QEfficient/generation/embedding_handler.py +++ b/QEfficient/generation/embedding_handler.py @@ -143,6 +143,61 @@ def prepare_internVL_inputs(self, img_url: str, query: str) -> Dict[str, np.ndar return vision_inputs, lang_inputs + def prepare_molmo_inputs(self, image_url: str, query: str) -> Dict[str, np.ndarray]: + """ + Download and preprocess image into model inputs + Args: + image_url: URL or path to image + query: Text query to process with image + Returns: + Dictionary of vision model inputs + Raises: + ValueError: If vision handler is not properly initialized + RuntimeError: If image processing fails + """ + if not self.is_available(): + raise ValueError("Vision handler not properly initialized. Need both vision_session and processor.") + + try: + # Download image + if image_url.startswith(("http://", "https://")): + image = Image.open(requests.get(image_url, stream=True).raw) + else: + image = Image.open(image_url) + image = image.resize((536, 354)) + inputs = self._processor.process(images=[image], text=query) + inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} + inputs["attention_mask"] = torch.ones((inputs["input_ids"].shape), dtype=torch.int64) + valid = inputs["image_input_idx"] > 0 + valid = valid.reshape(1, -1) + inputs["valid_idx"] = torch.nonzero(valid)[:, 1].unsqueeze(0) + inputs["pixel_values"] = inputs.pop("images") + + # Convert to numpy arrays + vision_inputs = {} + for k, v in inputs.items(): + if k in { + "pixel_values", + "image_masks", + "image_input_idx", + "valid_idx", + "aspect_ratio_ids", + "aspect_ratio_mask", + }: + vision_inputs[k] = np.array(v) + + # Convert specific inputs to float16 + vision_inputs_fp16 = {"pixel_values", "image_masks"} + for k in vision_inputs_fp16: + if k in vision_inputs: + vision_inputs[k] = vision_inputs[k].astype("float16") + + lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs} + + return vision_inputs, lang_inputs + except Exception as e: + raise RuntimeError(f"Failed to process image {image_url}: {str(e)}") + def prepare_vlm_inputs(self, image_url: str, query: str, prefill_seq_len: int) -> Dict[str, np.ndarray]: """ Download and preprocess image into model inputs @@ -404,6 +459,11 @@ def get_processed_inputs( and self._qeff_model.model.config.model_type == "internvl_chat" ): vision_inputs, lang_inputs = self.prepare_internVL_inputs(image_url, query) + elif ( + hasattr(self._qeff_model.model.config, "model_type") + and self._qeff_model.model.config.model_type == "molmo" + ): + vision_inputs, lang_inputs = self.prepare_molmo_inputs(image_url, query) else: vision_inputs, lang_inputs = self.prepare_vlm_inputs(image_url, query, prefill_seq_len) diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py index 234dff860..15e30acc4 100644 --- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py +++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py @@ -792,7 +792,9 @@ def get_specializations( lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): + def get_onnx_dynamic_axes( + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False + ): # Define dynamic axes vision_dynamic_axes 
= {} lang_dynamic_axes = {} @@ -870,7 +872,9 @@ def get_dummy_pkv_cache(self, config, batch_size, seq_len): past_key_values.append(pkv) return past_key_values - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): + def get_dummy_inputs( + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False + ): if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", 896) else: diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index eb5a4b475..402f0450b 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -208,7 +208,9 @@ def get_specializations( lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): + def get_onnx_dynamic_axes( + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False + ): # Define dynamic axes vision_dynamic_axes = {} lang_dynamic_axes = {} @@ -254,7 +256,9 @@ def get_output_names(self, kv_offload: bool = False): return lang_output_names return output_names - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): + def get_dummy_inputs( + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False + ): if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", constants.INTERN_IMG_SIZE) else: diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index 775d47768..b8f493b93 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -143,7 +143,13 @@ def forward( image_idx = torch.where(image_idx < next_image_idx, next_image_idx, image_idx) return logits, pixel_values, image_idx, outputs.past_key_values - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False, **kwargs): + def get_dummy_inputs( + self, + comp_ctx_lengths: Optional[List[int]] = None, + kv_offload: bool = False, + continuous_batching: bool = False, + **kwargs, + ): num_layers = self.config.text_config.num_hidden_layers num_key_value_heads = self.config.text_config.num_key_value_heads head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads @@ -246,7 +252,7 @@ def get_specializations( for i in range(0, len(comp_ctx_lengths_decode)): lang_decode = { - "batch_size": full_batch_size if continuous_batching else batch_size,, + "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, "comp_ctx_lengths": comp_ctx_lengths_decode[i], @@ -306,7 +312,9 @@ def get_specializations( lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): + def get_onnx_dynamic_axes( + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = 
False + ): # Define dynamic axes num_layers = self.config.text_config.num_hidden_layers diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py b/QEfficient/transformers/models/mistral3/modeling_mistral3.py index afe838f74..ab5c12bcc 100644 --- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py +++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py @@ -252,7 +252,13 @@ def forward( return logits, pixel_values, image_idx, outputs.past_key_values - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False, **kwargs): + def get_dummy_inputs( + self, + comp_ctx_lengths: Optional[List[int]] = None, + kv_offload: bool = False, + continuous_batching: bool = False, + **kwargs, + ): inputs_shapes = {} inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) height = self.config.vision_config.image_size @@ -434,7 +440,9 @@ def get_specializations( lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): + def get_onnx_dynamic_axes( + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False + ): # Define dynamic axes num_layers = self.config.text_config.num_hidden_layers diff --git a/QEfficient/transformers/models/molmo/modeling_molmo.py b/QEfficient/transformers/models/molmo/modeling_molmo.py index c088158c4..db4755843 100644 --- a/QEfficient/transformers/models/molmo/modeling_molmo.py +++ b/QEfficient/transformers/models/molmo/modeling_molmo.py @@ -43,14 +43,14 @@ def eager_attention_forward( if num_q_heads != num_kv_heads: assert num_q_heads % num_kv_heads == 0 repeat_factor = num_q_heads // num_kv_heads - _, _, S, D = k.shape + B, _, S, D = k.shape k = k.unsqueeze(2) k = k.expand(-1, -1, repeat_factor, -1, -1) - k = k.reshape(1, num_q_heads, S, D) + k = k.reshape(B, num_q_heads, S, D) v = v.unsqueeze(2) v = v.expand(-1, -1, repeat_factor, -1, -1) - v = v.reshape(1, num_q_heads, S, D) + v = v.reshape(B, num_q_heads, S, D) attn_weights = torch.matmul(q, k.transpose(2, 3)) * scale_factor @@ -596,6 +596,7 @@ def forward( image_idx, past_key_values, comp_ctx_lengths: Optional[List[int]] = None, + batch_index: Optional[torch.LongTensor] = None, ): if input_ids is not None: input_ids = input_ids * (input_ids != -1).to(input_ids.dtype) @@ -613,6 +614,7 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, comp_ctx_lengths=comp_ctx_lengths, + batch_index=batch_index, use_cache=True, ) next_idx = (indices1.max() + 1).unsqueeze(0).unsqueeze(0) @@ -694,6 +696,9 @@ def get_specializations( comp_ctx_lengths_decode: Optional[List[int]] = None, valid_size: int = None, kv_offload: bool = False, + continuous_batching: bool = False, + kv_cache_batch_size: Optional[int] = None, + full_batch_size: Optional[int] = None, **compiler_options, ): prefill_seq_len = prefill_seq_len if prefill_seq_len else 1024 @@ -725,12 +730,20 @@ def get_specializations( for i in range(0, len(comp_ctx_lengths_prefill)): lang_prefill = { - "batch_size": batch_size, + "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "valid_size": valid_size, + "vision_batch_size": batch_size, } + if continuous_batching: + lang_prefill["full_batch_size"] = 
kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size if kv_offload: values = { "img_size": img_size, @@ -746,12 +759,17 @@ def get_specializations( for i in range(0, len(comp_ctx_lengths_decode)): lang_decode = { - "batch_size": batch_size, + "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, "comp_ctx_lengths": comp_ctx_lengths_decode[i], "valid_size": valid_size, + "vision_batch_size": batch_size, } + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size if kv_offload: values = { "img_size": img_size, @@ -767,13 +785,33 @@ def get_specializations( else: lang_prefill = { - "batch_size": batch_size, + "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, "valid_size": valid_size, + "vision_batch_size": batch_size, } - lang_decode = {"batch_size": batch_size, "seq_len": "1", "ctx_len": ctx_len, "valid_size": valid_size} + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, + "seq_len": "1", + "ctx_len": ctx_len, + "valid_size": valid_size, + "vision_batch_size": batch_size, + } + + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size if kv_offload: values = { @@ -800,13 +838,15 @@ def get_specializations( else: return lang, compiler_options - def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False): + def get_onnx_dynamic_axes( + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False + ): # Define dynamic axes vision_dynamic_axes = {} lang_dynamic_axes = {} lang_dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"} lang_dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"} - lang_dynamic_axes["vision_embeds"] = {0: "batch_size", 1: "valid_size"} + lang_dynamic_axes["vision_embeds"] = {0: "vision_batch_size", 1: "valid_size"} vision_dynamic_axes["pixel_values"] = {0: "batch_size", 1: "num_images", 2: "img_tile", 3: "img_size"} vision_dynamic_axes["image_input_idx"] = {0: "batch_size", 1: "num_images", 2: "num_patch"} @@ -816,8 +856,17 @@ def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv num_layers = self.model.config.n_layers for i in range(num_layers): - lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"} - lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"} + lang_dynamic_axes[f"past_key.{i}"] = { + 0: "full_batch_size" if continuous_batching else "batch_size", + 2: "ctx_len", + } + lang_dynamic_axes[f"past_value.{i}"] = { + 0: "full_batch_size" if continuous_batching else "batch_size", + 2: "ctx_len", + } + + if continuous_batching: + lang_dynamic_axes["batch_index"] = {0: "batch_size"} if comp_ctx_lengths is not None: lang_dynamic_axes["comp_ctx_lengths"] = {0: "comp_ctx_lengths"} @@ -851,7 +900,13 @@ def get_output_names(self, kv_offload: bool = False): return lang_output_names return output_names - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, 
kv_offload: bool = False, **kwargs): + def get_dummy_inputs( + self, + comp_ctx_lengths: Optional[List[int]] = None, + kv_offload: bool = False, + continuous_batching: bool = False, + **kwargs, + ): inputs_shapes = {} inputs_shapes_lang = {} inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) @@ -902,10 +957,14 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) + + bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS + # Add data for KV kv_cache_shape = get_padding_shape_from_config( config=self.config, - batch_size=constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, + batch_size=fbs if continuous_batching else bs, seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) @@ -916,6 +975,8 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl if comp_ctx_lengths is not None: lang_inputs["comp_ctx_lengths"] = torch.randint(0, 100, (40,), dtype=torch.long) + if continuous_batching: + lang_inputs["batch_index"] = torch.arange(bs).view(bs, 1) inputs = {} if kv_offload: diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index 11fcf6857..3ca4b65b0 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -208,6 +208,7 @@ # "allenai/Molmo-7B-D-0924", # True, # 1, + # 4, # 128, # 4096, # "https://picsum.photos/id/237/536/354", @@ -413,6 +414,7 @@ def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( query: str, prompt_len: int, ctx_len: int, + full_batch_size: int, max_gen_len: int = 20, batch_size: int = 1, n_layer: int = 1, @@ -430,6 +432,7 @@ def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( n_layer = (n_layer, n_layer) processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) img = requests.get(img_url, stream=True) image = Image.open(BytesIO(img.content)).convert("RGB") image = image.resize((536, 354)) @@ -475,6 +478,54 @@ def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) qpc_tokens = output.generated_ids[:, :-1] assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output" + + if not kv_offload: # CB not yet enabled for Single QPC + return + images = [image] * full_batch_size + queries = [query] * full_batch_size + + pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch_CB(model_hf, images, queries) + + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_name, + trust_remote_code=True, + attn_implementation="eager", + kv_offload=True, + config=config, + continuous_batching=True, + ) + + qeff_model.export() + + qeff_model.compile( + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + num_devices=4, + batch_size=1, + full_batch_size=4, + mxfp6_matmul=False, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + mos=1, + ) + + exec_info = qeff_model.generate( + tokenizer=tokenizer, + processor=processor, + images=[img_url] * full_batch_size, + prompts=queries, + generation_len=max_gen_len, + ) + + qpc_tokens = 
exec_info.generated_ids[:, :max_gen_len] + print("QPC Outputs (QAIC) for Continuous Batching:") + print(exec_info.generated_texts) + + for i in range(full_batch_size): + assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), ( + f"Tokens don't match for prompt {i} between HF and QPC output" + ) + return @@ -655,15 +706,17 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn( @pytest.mark.on_qaic @pytest.mark.multimodal @pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", molmo_model_config + "model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_url, query, n_layer", + molmo_model_config, ) def test_image_text_to_text_molmo_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer + model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_url, query, n_layer ): check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, prompt_len=prompt_len, ctx_len=ctx_len, + full_batch_size=full_batch_size, max_gen_len=NEW_GENERATION_TOKENS, img_url=img_url, query=query, From a6f11823e6282481370c6db1056b839e75a1bef1 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Mon, 17 Nov 2025 06:56:48 +0000 Subject: [PATCH 07/11] Added mistral CB support Signed-off-by: Asmita Goswami --- .../models/mistral3/modeling_mistral3.py | 13 ++++++++----- .../models/test_image_text_to_text_models.py | 5 +++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py b/QEfficient/transformers/models/mistral3/modeling_mistral3.py index ab5c12bcc..60d33f388 100644 --- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py +++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py @@ -178,16 +178,16 @@ def forward( comp_ctx_lengths: Optional[List[int]] = None, batch_index: Optional[torch.LongTensor] = None, ): - inputs_embeds = self.model.get_input_embeddings()(input_ids) - vision_embeds = vision_embeds.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = self.model.language_model.get_input_embeddings()(input_ids) mask = input_ids == self.model.config.image_token_index indices1 = mask.to(torch.int64).cumsum(1) - 1 indices1 = torch.where(indices1 != -1, indices1 + image_idx, indices1) indices0 = torch.arange(mask.shape[0]).view(-1, 1) image_features_expanded = vision_embeds.unsqueeze(0)[indices0, indices1] - inputs_embeds_1 = torch.where(mask.unsqueeze(-1), image_features_expanded, inputs_embeds) - outputs = self.model.model( - inputs_embeds=inputs_embeds_1, + image_embeds = torch.where(mask.unsqueeze(-1), image_features_expanded, inputs_embeds) + inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_embeds) + outputs = self.language_model( + inputs_embeds=inputs_embeds, position_ids=position_ids, past_key_values=past_key_values, comp_ctx_lengths=comp_ctx_lengths, @@ -428,6 +428,9 @@ def get_specializations( lang_decode["full_batch_size"] = kv_cache_batch_size else: lang_decode["batch_size"] = kv_cache_batch_size + lang = [] + lang.append(lang_prefill) + lang.append(lang_decode) specializations = {} diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index 3ca4b65b0..89a915189 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -384,7 
+384,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( ctx_len=ctx_len, batch_size=batch_size, full_batch_size=full_batch_size, - mxfp6_matmul=True, + mxfp6_matmul=False, enable_qnn=enable_qnn, qnn_config=qnn_config, ) @@ -399,12 +399,13 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( ) qpc_tokens = exec_info.generated_ids[:, :max_gen_len] + print("QPC Outputs (QAIC) for Continuous Batching:") + print(exec_info.generated_texts) for i in range(full_batch_size): assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), ( f"Tokens don't match for prompt {i} between HF and QPC output" ) - return From 94552e05d9770782d11a0cda730d43c70a8ead85 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 20 Nov 2025 10:14:40 +0000 Subject: [PATCH 08/11] Added CB Test for InternVL Signed-off-by: Asmita Goswami --- QEfficient/utils/run_utils.py | 46 +++++++++++ .../models/test_image_text_to_text_models.py | 80 ++++++++++++++----- 2 files changed, 106 insertions(+), 20 deletions(-) diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py index 59e3f9bf4..f43654c0c 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -496,6 +496,52 @@ def __init__(self, batch_size, processor, config, image, prompt, prompt_len, ctx self.config = config self.gen_len = max_gen_len + @torch.no_grad() + def run_vlm_hf_model_on_pytorch_CB(self, model, images, queries): + """ + Function responsible for running HuggingFace ``PyTorch`` model for continuous batching + and return the output tokens for each prompt/image pair. + + ``Mandatory`` Args: + :model (torch.nn.module): Original ``PyTorch`` model + :images (List[PIL.Image]): List of input images + :queries (List[str]): List of input queries + + Return: + :List[numpy.ndarray]: List of generated output tokens for each prompt + """ + generated_ids = [] + + for idx, (image, query) in enumerate(zip(images, queries)): + num_patches_list = [] + + pixel_value = self.processor.load_image(image, max_num=12) + num_patches_list.append(pixel_value.shape[0]) + question = "\n" + query + + # Chat Template information for prompt preprocessing + messages: List[List[str]] = [] + roles = ("<|im_start|>user\n", "<|im_start|>assistant\n") + prompt = self.processor(pixel_value, question, messages, roles, num_patches_list=num_patches_list) + + inputs = self.processor.tokenizer(prompt, return_tensors="pt") + batch_size, prompt_len = inputs["input_ids"].shape + inputs["pixel_values"] = pixel_value.clone() + + generation_config = dict(max_new_tokens=self.gen_len, do_sample=False) + generation_config["eos_token_id"] = self.processor.tokenizer.convert_tokens_to_ids("<|im_end|>\n".strip()) + + # Decode and print output + outputs = model.generate(**inputs, **generation_config) + offset_output = outputs[0].detach().numpy() + + py_output = self.processor.tokenizer.decode(offset_output, skip_special_tokens=True).strip() + print("Original HF Model Outputs (Torch CPU):") + print("Completion:", repr(py_output)) + generated_ids.append(offset_output) + + return generated_ids + @torch.no_grad() def run_vlm_hf_model_on_pytorch(self, model, inputs, generation_config): outputs = model.generate(**inputs, **generation_config) diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index 89a915189..52068b99d 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -38,7 +38,6 @@ # 
model_name, # kv_offload, # batch_size, - # full_batch_size, # prompt_len, # ctx_len, # img_size, @@ -50,7 +49,6 @@ "llava-hf/llava-1.5-7b-hf", True, 1, - 4, 784, 1024, 336, @@ -62,7 +60,6 @@ "llava-hf/llava-1.5-7b-hf", False, 1, - 4, 784, 1024, 336, @@ -75,7 +72,6 @@ # "meta-llama/Llama-4-Scout-17B-16E-Instruct", # True, # 1, - # 4, # 128, # 3072, # 336, @@ -87,7 +83,6 @@ # "meta-llama/Llama-4-Scout-17B-16E-Instruct", # False, # 1, - # 4, # 128, # 3072, # 336, @@ -99,7 +94,6 @@ "google/gemma-3-4b-it", True, 1, - 4, 128, 3072, 896, @@ -111,7 +105,6 @@ "google/gemma-3-4b-it", False, 1, - 4, 128, 3072, 896, @@ -123,7 +116,6 @@ "mistralai/Mistral-Small-3.1-24B-Instruct-2503", True, 1, - 4, 128, 4096, 1540, @@ -135,7 +127,6 @@ "mistralai/Mistral-Small-3.1-24B-Instruct-2503", False, 1, - 4, 128, 4096, 1540, @@ -147,7 +138,6 @@ "Qwen/Qwen2.5-VL-3B-Instruct", True, 1, - 4, 128, 4096, 1540, @@ -159,7 +149,6 @@ # "meta-llama/Llama-3.2-11B-Vision-Instruct", # True, # 1, - # 4, # 32, # 512, # 560, @@ -208,7 +197,6 @@ # "allenai/Molmo-7B-D-0924", # True, # 1, - # 4, # 128, # 4096, # "https://picsum.photos/id/237/536/354", @@ -268,7 +256,6 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( query: str, prompt_len: int, ctx_len: int, - full_batch_size: int, max_gen_len: int = 20, batch_size: int = 1, n_layer: int = 1, @@ -358,6 +345,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( # testing for CB models if not kv_offload: # CB not yet enabled for Single QPC return + full_batch_size = 4 images = [image] * full_batch_size queries = [query] * full_batch_size @@ -415,7 +403,6 @@ def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( query: str, prompt_len: int, ctx_len: int, - full_batch_size: int, max_gen_len: int = 20, batch_size: int = 1, n_layer: int = 1, @@ -480,8 +467,10 @@ def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( qpc_tokens = output.generated_ids[:, :-1] assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output" + # testing for CB models if not kv_offload: # CB not yet enabled for Single QPC return + full_batch_size = 4 images = [image] * full_batch_size queries = [query] * full_batch_size @@ -636,17 +625,68 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) qpc_tokens = output.generated_ids[:, :-1] assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output" + + # testing for CB models + if not kv_offload: # CB not yet enabled for Single QPC + return + + full_batch_size = 4 + image = [image] * full_batch_size + queries = [query] * full_batch_size + + pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch_CB(model_hf, image, queries) + + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_name, + trust_remote_code=True, + attn_implementation="eager", + kv_offload=True, + config=config, + continuous_batching=True, + ) + + qeff_model.export() + + qeff_model.compile( + num_patches=1, + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + num_devices=4, + batch_size=1, + full_batch_size=full_batch_size, + mxfp6_matmul=False, + ) + + exec_info = qeff_model.generate( + tokenizer=tokenizer, + processor=processor, + images=img_url * full_batch_size, + prompts=queries, + generation_len=max_gen_len, + image_height=448, + image_width=448, + ) + + qpc_tokens = exec_info.generated_ids[:, :max_gen_len] + print("QPC Outputs (QAIC) 
for Continuous Batching:") + print(exec_info.generated_texts) + + for i in range(full_batch_size): + assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), ( + f"Tokens don't match for prompt {i} between HF and QPC output" + ) + return @pytest.mark.on_qaic @pytest.mark.multimodal @pytest.mark.parametrize( - "model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", + "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config, ) def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer + model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer ): """ Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, without continuous batching. @@ -663,7 +703,6 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( query=query, n_layer=n_layer, batch_size=batch_size, - full_batch_size=full_batch_size, kv_offload=kv_offload, ) @@ -671,6 +710,7 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.qnn @pytest.mark.multimodal +@pytest.mark.skip(reason="Issues with QNN") @pytest.mark.parametrize( "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config ) @@ -707,17 +747,16 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn( @pytest.mark.on_qaic @pytest.mark.multimodal @pytest.mark.parametrize( - "model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_url, query, n_layer", + "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", molmo_model_config, ) def test_image_text_to_text_molmo_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_url, query, n_layer + model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer ): check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, prompt_len=prompt_len, ctx_len=ctx_len, - full_batch_size=full_batch_size, max_gen_len=NEW_GENERATION_TOKENS, img_url=img_url, query=query, @@ -751,6 +790,7 @@ def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.qnn @pytest.mark.multimodal +@pytest.mark.skip(reason="Issues with QNN") @pytest.mark.parametrize( "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", intern_model_config ) From e8af9174d1744c6eea9985f73899fe46ae0ad72f Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 20 Nov 2025 10:19:16 +0000 Subject: [PATCH 09/11] Ruff format Signed-off-by: Asmita Goswami --- QEfficient/utils/run_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py index f43654c0c..76234f76e 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -6,6 +6,7 @@ # ----------------------------------------------------------------------------- import os +from typing import List import numpy as np import onnx From eea2ffa526c30fb688181d74e669c1035017caef Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 25 Nov 2025 18:03:45 +0000 Subject: [PATCH 10/11] Resolving CI issues Signed-off-by: Asmita Goswami --- .../models/gemma3/modeling_gemma3.py | 2 - examples/internvl_CB_example.py | 98 
------------------- .../models/test_image_text_to_text_models.py | 3 +- 3 files changed, 1 insertion(+), 102 deletions(-) delete mode 100644 examples/internvl_CB_example.py diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py index 15e30acc4..c80efde55 100644 --- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py +++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py @@ -788,8 +788,6 @@ def get_specializations( specializations["lang"] = lang return specializations, compiler_options else: - lang[0].pop("vision_size") - lang[1].pop("vision_size") return lang, compiler_options def get_onnx_dynamic_axes( diff --git a/examples/internvl_CB_example.py b/examples/internvl_CB_example.py deleted file mode 100644 index 29cb9a5c4..000000000 --- a/examples/internvl_CB_example.py +++ /dev/null @@ -1,98 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause -# -# ---------------------------------------------------------------------------- - -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer - -from QEfficient import QEFFAutoModelForCausalLM -from QEfficient.utils.test_utils import InternProcessor - -model_id = "OpenGVLab/InternVL2_5-1B" -config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) -# For Testing Purpose Only -config.llm_config.num_hidden_layers = 2 -config.vision_config.num_hidden_layers = 2 - -model_hf = AutoModelForCausalLM.from_pretrained( - model_id, - low_cpu_mem_usage=False, - trust_remote_code=True, - config=config, -) - -tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=False) -processor = InternProcessor(model_hf, tokenizer) - - -continuous_batching = True -if continuous_batching: - qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_id, - attn_implementation="eager", - kv_offload=True, - config=config, - continuous_batching=True, - trust_remote_code=True, - ) - - qeff_model.compile( - num_patches=13, # Set num_patches according to image_height and image_width, default is 13 (747 x 1000) - prefill_seq_len=128, - ctx_len=4096, - num_cores=16, - num_devices=4, - batch_size=1, - full_batch_size=4, - mxfp6_matmul=True, - mxint8_kv_cache=True, - aic_enable_depth_first=True, - mos=1, - ) -else: - qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_id, attn_implementation="eager", kv_offload=True, config=config, trust_remote_code=True - ) - - qeff_model.compile( - num_patches=13, - prefill_seq_len=128, - ctx_len=4096, - num_cores=16, - num_devices=4, - batch_size=1, - mxfp6_matmul=True, - mxint8_kv_cache=True, - aic_enable_depth_first=True, - ) - -image_urls = [ - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", -] - -prompts = [ - "Can you describe the image in detail?", - "What are the objects in the image?", - "What is the main subject of the image?", - "What colors are predominant in the image?", -] - -exec_info 
= qeff_model.generate( - tokenizer=tokenizer, - prompts=prompts, - processor=processor, - images=image_urls, - device_ids=[0, 1, 2, 3], - generation_len=10, - image_height=747, - image_width=1000, -) - -print("Generated texts:", exec_info.generated_texts) -print("Generated IDs:", exec_info.generated_ids) -print(exec_info) diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index 52068b99d..cc66ddb9c 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -631,10 +631,9 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( return full_batch_size = 4 - image = [image] * full_batch_size queries = [query] * full_batch_size - pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch_CB(model_hf, image, queries) + pytorch_hf_tokens = [pytorch_hf_tokens] * 4 qeff_model = QEFFAutoModelForCausalLM.from_pretrained( model_name, From ee5421526c9739872200b77f8859719d12c5fdf4 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 25 Nov 2025 18:05:08 +0000 Subject: [PATCH 11/11] Added InetrnVL example file for CB Signed-off-by: Asmita Goswami --- .../models/internvl/continuous_batching.py | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 examples/image_text_to_text/models/internvl/continuous_batching.py diff --git a/examples/image_text_to_text/models/internvl/continuous_batching.py b/examples/image_text_to_text/models/internvl/continuous_batching.py new file mode 100644 index 000000000..29cb9a5c4 --- /dev/null +++ b/examples/image_text_to_text/models/internvl/continuous_batching.py @@ -0,0 +1,98 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + +from QEfficient import QEFFAutoModelForCausalLM +from QEfficient.utils.test_utils import InternProcessor + +model_id = "OpenGVLab/InternVL2_5-1B" +config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) +# For Testing Purpose Only +config.llm_config.num_hidden_layers = 2 +config.vision_config.num_hidden_layers = 2 + +model_hf = AutoModelForCausalLM.from_pretrained( + model_id, + low_cpu_mem_usage=False, + trust_remote_code=True, + config=config, +) + +tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=False) +processor = InternProcessor(model_hf, tokenizer) + + +continuous_batching = True +if continuous_batching: + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_id, + attn_implementation="eager", + kv_offload=True, + config=config, + continuous_batching=True, + trust_remote_code=True, + ) + + qeff_model.compile( + num_patches=13, # Set num_patches according to image_height and image_width, default is 13 (747 x 1000) + prefill_seq_len=128, + ctx_len=4096, + num_cores=16, + num_devices=4, + batch_size=1, + full_batch_size=4, + mxfp6_matmul=True, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + mos=1, + ) +else: + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_id, attn_implementation="eager", kv_offload=True, config=config, trust_remote_code=True + ) + + qeff_model.compile( + num_patches=13, + prefill_seq_len=128, + ctx_len=4096, + num_cores=16, + num_devices=4, + batch_size=1, + mxfp6_matmul=True, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + ) + +image_urls = [ + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", +] + +prompts = [ + "Can you describe the image in detail?", + "What are the objects in the image?", + "What is the main subject of the image?", + "What colors are predominant in the image?", +] + +exec_info = qeff_model.generate( + tokenizer=tokenizer, + prompts=prompts, + processor=processor, + images=image_urls, + device_ids=[0, 1, 2, 3], + generation_len=10, + image_height=747, + image_width=1000, +) + +print("Generated texts:", exec_info.generated_texts) +print("Generated IDs:", exec_info.generated_ids) +print(exec_info)
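The specialization changes in this series follow one pattern across Gemma3, Llava, Mistral3, InternVL and Molmo: with continuous_batching=True, prefill is specialized for a single sequence, decode is specialized for all active slots, and the KV cache is allocated for kv_cache_batch_size sequences. The sketch below summarizes the language-model specializations this produces; the concrete values (prefill_seq_len=128, ctx_len=4096, full_batch_size=4, kv_cache_batch_size=4) are illustrative assumptions, and model-specific keys (sliding_window, mm_tokens_per_image, valid_size, img_size) are omitted.

# Illustrative sketch (assumed values) of the lang specializations emitted when continuous_batching=True.
kv_cache_batch_size = 4  # KV-cache slots compiled into the QPC (past_key/past_value dim 0)
full_batch_size = 4      # sequences decoded together per step

lang_prefill = {
    "batch_size": 1,                          # prefill handles one sequence per invocation
    "seq_len": 128,
    "ctx_len": 4096,
    "full_batch_size": kv_cache_batch_size,   # sizes the KV cache along its batch axis
    "full_batch_exec_size": full_batch_size,
    "vision_batch_size": 1,                   # vision_embeds keep the image batch axis
}

lang_decode = {
    "batch_size": full_batch_size,            # decode steps over all active slots
    "seq_len": "1",
    "ctx_len": 4096,
    "full_batch_size": kv_cache_batch_size,
    "vision_batch_size": 1,
}

# At decode time, batch_index (exported with dynamic axis {0: "batch_size"}) maps each row
# of input_ids to its KV-cache slot, so requests can join and leave the batch independently.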