From b399dd3b16f190c0354f1a1b12aaaf33893cd69c Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Wed, 21 May 2025 08:03:52 +0000
Subject: [PATCH 1/5] Added Prompt len check for VLMs

Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/infer.py  | 92 ++++++++++++++++++++++++++------------
 docs/source/quick_start.md | 17 +++++++
 2 files changed, 80 insertions(+), 29 deletions(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 43bbda0ba..6cc840d98 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -22,15 +22,10 @@
 # TODO: Remove after adding support for VLM's compile and execute
 def execute_vlm_model(
+    processor: PreTrainedModel,
     qeff_model: PreTrainedModel,
-    model_name: str,
-    image_url: str,
-    image_path: str,
-    prompt: Optional[str] = None, # type: ignore
+    inputs: Optional[dict] = None,
     device_group: Optional[List[int]] = None,
-    local_model_dir: Optional[str] = None,
-    cache_dir: Optional[str] = None,
-    hf_token: Optional[str] = None,
     generation_len: Optional[int] = None,
 ):
     """
@@ -50,16 +45,43 @@
     Returns:
         :dict: Output from the ``AI_100`` runtime.
     """
+    streamer = TextStreamer(processor.tokenizer)
+    output = qeff_model.generate(
+        inputs=inputs,
+        streamer=streamer,
+        device_ids=device_group,
+        generation_len=generation_len,
+    )
+    return output
+
+
+def count_vlm_tokens(
+    processor: PreTrainedModel,
+    prompt_len: int = 32,
+    ctx_len: int = 128,
+    image_url: Optional[str] = None,
+    image_path: Optional[str] = None,
+    prompt: Optional[str] = None, # type: ignore
+):
+    """
+    This method counts the number of tokens in the image and updates the prompt length and context length accordingly.
+    ``Mandatory`` Args:
+        :processor (PreTrainedModel): Hugging Face Processor object.
+        :image_url (str): Image URL to be used for inference. ``Defaults to None.``
+        :image_path (str): Image path to be used for inference. ``Defaults to None.``
+    ``Optional`` Args:
+        :prompt_len (int): Prompt length for the model to compile. ``Defaults to 32.``
+        :ctx_len (int): Maximum context length to compile the model. ``Defaults to 128.``
+        :prompt (str): Sample prompt for the model text generation. ``Defaults to None.``
+    Returns:
+        :prompt_len: Updated prompt length for the VLM model to compile.
+        :ctx_len: Updated context length for the VLM model to compile.
+        :split_inputs: Tokenized inputs for the VLM model.
+    """
     if not (image_url or image_path):
         raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"')
     raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path)
-    processor = load_hf_processor(
-        pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
-        cache_dir=cache_dir,
-        hf_token=hf_token,
-    )
-
     # Added for QEff version 1.20 supported VLM models (mllama and llava)
     conversation = [
         {
@@ -73,21 +95,24 @@
     # Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token ids.
     input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
-
     split_inputs = processor(
         text=input_text,
         images=raw_image,
         return_tensors="pt",
         add_special_tokens=False,
     )
-    streamer = TextStreamer(processor.tokenizer)
-    output = qeff_model.generate(
-        inputs=split_inputs,
-        streamer=streamer,
-        device_ids=device_group,
-        generation_len=generation_len,
-    )
-    return output
+    decoded_tokens = processor.tokenizer.decode(split_inputs["input_ids"][0])
+
+    total_tokens = decoded_tokens.count("<image>") + decoded_tokens.count("<|image|>")
+    if total_tokens > prompt_len:
+        logger.warning(
+            f"Prompt length {prompt_len} is less than the number of tokens in the image. "
+            f"Increasing increase the prompt length to at least {total_tokens + prompt_len}."
+        )
+        prompt_len = total_tokens + prompt_len
+    ctx_len = prompt_len + 50
+
+    return prompt_len, ctx_len, split_inputs
 
 
 def main(
@@ -176,6 +201,20 @@ def main(
         kwargs.pop("img_size", None) or image_path or image_url
     ):
         logger.warning(f"Skipping image arguments as they are not valid for {architecture}")
+    else:
+        processor = load_hf_processor(
+            pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
+            cache_dir=cache_dir,
+            hf_token=hf_token,
+        )
+        prompt_len, ctx_len, inputs = count_vlm_tokens(
+            processor=processor,
+            prompt_len=prompt_len,
+            ctx_len=ctx_len,
+            image_url=image_url,
+            image_path=image_path,
+            prompt=prompt,
+        )
 
     #########
     # Compile
@@ -206,15 +245,10 @@ def main(
     #########
     if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
         exec_info = execute_vlm_model(
+            processor=processor,
             qeff_model=qeff_model,
-            model_name=model_name,
-            prompt=prompt,
-            image_url=image_url,
-            image_path=image_path,
+            inputs=inputs,
             device_group=device_group,
-            local_model_dir=local_model_dir,
-            cache_dir=cache_dir,
-            hf_token=hf_token,
             generation_len=generation_len,
         )
         print(exec_info)
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
index abab4cfc3..10ac85c12 100644
--- a/docs/source/quick_start.md
+++ b/docs/source/quick_start.md
@@ -194,6 +194,23 @@ qeff_model.generate(prompts=["My name is"])
 
 **Users can also take advantage of features like multi-Qranium inference and continuous batching with QNN SDK Compilation.**
 
+
+### VLM Inference
+
+Users can compile and run a VLM model using the commands below.
+
+**CLI Inference Command**
+
+For Llava
+```bash
+python -m QEfficient.cloud.infer --model_name llava-hf/llava-1.5-7b-hf --batch_size 1 --prompt_len 784 --ctx_len 1024 --mxfp6 --num_cores 16 --device_group [0] --prompt "Describe the image" --mos 1 --aic_enable_depth_first --image_url https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg --generation_len 128
+```
+
+For Mllama
+```bash
+python -m QEfficient.cloud.infer --model_name meta-llama/Llama-3.2-11B-Vision-Instruct --batch_size 1 --prompt_len 32 --ctx_len 512 --num_cores 16 --device_group [0] --prompt "Describe the image?" --mos 1 --allocator_dealloc_delay 1 --image_url https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg
+```
+
 ## Python API
 
 ### 1. Model download and Optimize for Cloud AI 100

From 5c1ea8f9f38b4716a1da835b2b77adfa685206f3 Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Wed, 21 May 2025 08:29:05 +0000
Subject: [PATCH 2/5] Added load streamer in utils

Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/infer.py    | 17 +++++++++++++----
 QEfficient/utils/__init__.py |  1 +
 QEfficient/utils/_utils.py   | 17 +++++++++++++++++
 3 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 6cc840d98..306b0536a 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -16,7 +16,7 @@
 from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
 
 from QEfficient.base.common import QEFFCommonLoader
-from QEfficient.utils import check_and_assign_cache_dir, load_hf_processor, load_hf_tokenizer
+from QEfficient.utils import check_and_assign_cache_dir, load_hf_processor, load_hf_tokenizer, load_streamer
 from QEfficient.utils.logging_utils import logger
 
 
@@ -45,7 +45,7 @@ def execute_vlm_model(
     Returns:
         :dict: Output from the ``AI_100`` runtime.
     """
-    streamer = TextStreamer(processor.tokenizer)
+    streamer = load_streamer(processor.tokenizer)
     output = qeff_model.generate(
         inputs=inputs,
         streamer=streamer,
@@ -101,16 +101,23 @@ def count_vlm_tokens(
         return_tensors="pt",
         add_special_tokens=False,
     )
+
+    # Get the total number of decoded tokens in the input
     decoded_tokens = processor.tokenizer.decode(split_inputs["input_ids"][0])
 
     total_tokens = decoded_tokens.count("<image>") + decoded_tokens.count("<|image|>")
+
+    # Check if the number of tokens in the image is greater than the prompt length
     if total_tokens > prompt_len:
         logger.warning(
             f"Prompt length {prompt_len} is less than the number of tokens in the image. "
-            f"Increasing increase the prompt length to at least {total_tokens + prompt_len}."
+            f"Increasing the prompt length to at least {total_tokens + prompt_len}."
         )
         prompt_len = total_tokens + prompt_len
-    ctx_len = prompt_len + 50
+
+    # Update the context length only if it is less than the prompt length
+    if ctx_len < prompt_len:
+        ctx_len = prompt_len + ctx_len
 
     return prompt_len, ctx_len, split_inputs
 
@@ -207,6 +214,8 @@ def main(
             cache_dir=cache_dir,
             hf_token=hf_token,
         )
+
+        # Count the number of tokens required in the input and update the prompt length and context length accordingly
         prompt_len, ctx_len, inputs = count_vlm_tokens(
             processor=processor,
             prompt_len=prompt_len,
diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py
index f73998302..4825ddbec 100755
--- a/QEfficient/utils/__init__.py
+++ b/QEfficient/utils/__init__.py
@@ -21,6 +21,7 @@
     hf_download,
     load_hf_processor,
     load_hf_tokenizer,
+    load_streamer,
     login_and_download_hf_lm,
     onnx_exists,
     padding_check_and_fix,
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index f8bc5753c..5bfb01433 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -20,6 +20,7 @@
 from requests.exceptions import HTTPError
 from transformers import (
     AutoProcessor,
+    TextStreamer,
     AutoTokenizer,
     PreTrainedTokenizer,
     PreTrainedTokenizerFast,
@@ -220,6 +221,22 @@
     return processor
 
 
+def load_streamer(
+    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+):
+    """
+    Loads the streamer for the given tokenizer.
+    --------
+
+    tokenizer: `Union[PreTrainedTokenizer, PreTrainedTokenizerFast]` - Pass model tokenizer to load streamer.
+
+    Return:
+        TextStreamer object for the given tokenizer.
+    """
+    logger.info("Loading Streamer")
+    return TextStreamer(tokenizer)
+
+
 def get_qpc_dir_path(
     model_card_name,
     num_cores,

From 43bcdbd5ef50f40ac4f5f8eb9fbe7e71f70c5d1f Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Wed, 21 May 2025 08:30:10 +0000
Subject: [PATCH 3/5] Added load streamer in utils

Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/infer.py  | 2 +-
 QEfficient/utils/_utils.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 306b0536a..27cd51a4d 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -12,7 +12,7 @@
 
 import requests
 from PIL import Image
-from transformers import PreTrainedModel, TextStreamer
+from transformers import PreTrainedModel
 from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
 
 from QEfficient.base.common import QEFFCommonLoader
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index 5bfb01433..ab93fc488 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -20,10 +20,10 @@
 from requests.exceptions import HTTPError
 from transformers import (
     AutoProcessor,
-    TextStreamer,
     AutoTokenizer,
     PreTrainedTokenizer,
     PreTrainedTokenizerFast,
+    TextStreamer,
 )
 
 from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants
@@ -222,7 +222,7 @@ def load_streamer(
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
 ):
     """
     Loads the streamer for the given tokenizer.
@@ -234,7 +234,7 @@ def load_streamer(
         TextStreamer object for the given tokenizer.
     """
     logger.info("Loading Streamer")
-    return TextStreamer(tokenizer)
+    return TextStreamer(tokenizer)
 
 
 def get_qpc_dir_path(

From 557968612f16b0f96cdd5a6e49c4cb86f0cca197 Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Wed, 21 May 2025 08:34:14 +0000
Subject: [PATCH 4/5] Ruff format

Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/infer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 27cd51a4d..3d3af4858 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -105,15 +105,15 @@ def count_vlm_tokens(
     # Get the total number of decoded tokens in the input
     decoded_tokens = processor.tokenizer.decode(split_inputs["input_ids"][0])
 
-    total_tokens = decoded_tokens.count("<image>") + decoded_tokens.count("<|image|>")
+    total_image_tokens = decoded_tokens.count("<image>") + decoded_tokens.count("<|image|>")
 
     # Check if the number of tokens in the image is greater than the prompt length
-    if total_tokens > prompt_len:
+    if total_image_tokens > prompt_len:
         logger.warning(
             f"Prompt length {prompt_len} is less than the number of tokens in the image. "
-            f"Increasing the prompt length to at least {total_tokens + prompt_len}."
+            f"Increasing the prompt length to at least {total_image_tokens + prompt_len}."
         )
-        prompt_len = total_tokens + prompt_len
+        prompt_len = total_image_tokens + prompt_len
 
     # Update the context length only if it is less than the prompt length
     if ctx_len < prompt_len:

From f34d554b1937a11eac186c194a34f7012430e7b2 Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Thu, 3 Jul 2025 06:07:58 +0000
Subject: [PATCH 5/5] Handled Causal LM model's case when image_path or image_url is passed

Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/infer.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 368f6103f..a8563accc 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -207,10 +207,9 @@ def main(
     config = qeff_model.model.config
     architecture = config.architectures[0] if config.architectures else None
 
-    if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values() and (
-        kwargs.pop("img_size", None) or image_path or image_url
-    ):
-        logger.warning(f"Skipping image arguments as they are not valid for {architecture}")
+    if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
+        if kwargs.pop("img_size", None) or image_path or image_url:
+            logger.warning(f"Skipping image arguments as they are not valid for {architecture}")
     else:
         processor = load_hf_processor(
             pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
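
As a quick illustration of what the new check does, the sketch below mirrors the prompt-length adjustment performed by count_vlm_tokens(), but as a standalone helper built only on a Hugging Face processor. It is a minimal sketch, not the QEfficient implementation: the adjust_lengths name, the conversation layout, and the model name and image URL in the commented usage are assumptions for demonstration only.

```python
def adjust_lengths(processor, image, prompt, prompt_len=32, ctx_len=128):
    """Count the image placeholder tokens the processor inserts and grow
    prompt_len / ctx_len the same way the patched count_vlm_tokens() does."""
    conversation = [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]},
    ]
    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    inputs = processor(text=text, images=image, return_tensors="pt", add_special_tokens=False)
    decoded = processor.tokenizer.decode(inputs["input_ids"][0])
    # "<image>" (Llava) and "<|image|>" (Mllama) are the placeholder strings assumed here.
    image_tokens = decoded.count("<image>") + decoded.count("<|image|>")
    if image_tokens > prompt_len:
        prompt_len = image_tokens + prompt_len  # make room for the image tokens
    if ctx_len < prompt_len:
        ctx_len = prompt_len + ctx_len  # keep headroom for the text prompt and generation
    return prompt_len, ctx_len, inputs


# Example usage (model name and image URL are placeholders):
# from transformers import AutoProcessor
# from PIL import Image
# import requests
#
# processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
# image = Image.open(requests.get("https://example.com/sample.jpg", stream=True).raw)
# prompt_len, ctx_len, inputs = adjust_lengths(processor, image, "Describe the image")
```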