From b399dd3b16f190c0354f1a1b12aaaf33893cd69c Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Wed, 21 May 2025 08:03:52 +0000
Subject: [PATCH 1/5] Added Prompt len check for VLMs

Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/infer.py  | 92 ++++++++++++++++++++++++++------------
 docs/source/quick_start.md | 17 +++++++
 2 files changed, 80 insertions(+), 29 deletions(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 43bbda0ba..6cc840d98 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -22,15 +22,10 @@
 # TODO: Remove after adding support for VLM's compile and execute
 def execute_vlm_model(
+    processor: PreTrainedModel,
     qeff_model: PreTrainedModel,
-    model_name: str,
-    image_url: str,
-    image_path: str,
-    prompt: Optional[str] = None, # type: ignore
+    inputs: Optional[dict] = None,
     device_group: Optional[List[int]] = None,
-    local_model_dir: Optional[str] = None,
-    cache_dir: Optional[str] = None,
-    hf_token: Optional[str] = None,
     generation_len: Optional[int] = None,
 ):
     """
@@ -50,16 +45,43 @@
     Returns:
         :dict: Output from the ``AI_100`` runtime.
     """
+    streamer = TextStreamer(processor.tokenizer)
+    output = qeff_model.generate(
+        inputs=inputs,
+        streamer=streamer,
+        device_ids=device_group,
+        generation_len=generation_len,
+    )
+    return output
+
+
+def count_vlm_tokens(
+    processor: PreTrainedModel,
+    prompt_len: int = 32,
+    ctx_len: int = 128,
+    image_url: Optional[str] = None,
+    image_path: Optional[str] = None,
+    prompt: Optional[str] = None, # type: ignore
+):
+    """
+    This method counts the number of tokens in the image and updates the prompt length and context length accordingly.
+    ``Mandatory`` Args:
+        :processor (PreTrainedModel): Hugging Face Processor object.
+        :image_url (str): Image URL to be used for inference. ``Defaults to None.``
+        :image_path (str): Image path to be used for inference. ``Defaults to None.``
+    ``Optional`` Args:
+        :prompt_len (int): Prompt length for the model to compile. ``Defaults to 32.``
+        :ctx_len (int): Maximum context length to compile the model. ``Defaults to 128.``
+        :prompt (str): Sample prompt for the model text generation. ``Defaults to None.``
+    Returns:
+        :prompt_len: Updated prompt length for the VLM model to compile.
+        :ctx_len: Updated context length for the VLM model to compile.
+        :split_inputs: Tokenized inputs for the VLM model.
+    """
     if not (image_url or image_path):
         raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"')
     raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path)
-    processor = load_hf_processor(
-        pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
-        cache_dir=cache_dir,
-        hf_token=hf_token,
-    )
-
     # Added for QEff version 1.20 supported VLM models (mllama and llava)
     conversation = [
         {
@@ -73,21 +95,24 @@
     # Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token ids.
     input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
-
     split_inputs = processor(
         text=input_text,
         images=raw_image,
         return_tensors="pt",
         add_special_tokens=False,
     )
-    streamer = TextStreamer(processor.tokenizer)
-    output = qeff_model.generate(
-        inputs=split_inputs,
-        streamer=streamer,
-        device_ids=device_group,
-        generation_len=generation_len,
-    )
-    return output
+    decoded_tokens = processor.tokenizer.decode(split_inputs["input_ids"][0])
+
+    total_tokens = decoded_tokens.count("<image>") + decoded_tokens.count("<|image|>")
+    if total_tokens > prompt_len:
+        logger.warning(
+            f"Prompt length {prompt_len} is less than the number of tokens in the image. "
+            f"Increasing increase the prompt length to at least {total_tokens + prompt_len}."
+        )
+        prompt_len = total_tokens + prompt_len
+    ctx_len = prompt_len + 50
+
+    return prompt_len, ctx_len, split_inputs
 
 
 def main(
@@ -176,6 +201,20 @@ def main(
         kwargs.pop("img_size", None) or image_path or image_url
     ):
         logger.warning(f"Skipping image arguments as they are not valid for {architecture}")
+    else:
+        processor = load_hf_processor(
+            pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
+            cache_dir=cache_dir,
+            hf_token=hf_token,
+        )
+        prompt_len, ctx_len, inputs = count_vlm_tokens(
+            processor=processor,
+            prompt_len=prompt_len,
+            ctx_len=ctx_len,
+            image_url=image_url,
+            image_path=image_path,
+            prompt=prompt,
+        )
 
     #########
     # Compile
@@ -206,15 +245,10 @@ def main(
     #########
     if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
         exec_info = execute_vlm_model(
+            processor=processor,
             qeff_model=qeff_model,
-            model_name=model_name,
-            prompt=prompt,
-            image_url=image_url,
-            image_path=image_path,
+            inputs=inputs,
             device_group=device_group,
-            local_model_dir=local_model_dir,
-            cache_dir=cache_dir,
-            hf_token=hf_token,
             generation_len=generation_len,
         )
         print(exec_info)
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
index abab4cfc3..10ac85c12 100644
--- a/docs/source/quick_start.md
+++ b/docs/source/quick_start.md
@@ -194,6 +194,23 @@ qeff_model.generate(prompts=["My name is"])
 
 **Users can also take advantage of features like multi-Qranium inference and continuous batching with QNN SDK Compilation.**
 
+
+### VLM Inference
+
+Users can compile and run a VLM model using the commands below.
+
+**CLI Inference Command**
+
+For Llava
+```bash
+python -m QEfficient.cloud.infer --model_name llava-hf/llava-1.5-7b-hf --batch_size 1 --prompt_len 784 --ctx_len 1024 --mxfp6 --num_cores 16 --device_group [0] --prompt "Describe the image" --mos 1 --aic_enable_depth_first --image_url https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg --generation_len 128
+```
+
+For Mllama
+```bash
+python -m QEfficient.cloud.infer --model_name meta-llama/Llama-3.2-11B-Vision-Instruct --batch_size 1 --prompt_len 32 --ctx_len 512 --num_cores 16 --device_group [0] --prompt "Describe the image?" --mos 1 --allocator_dealloc_delay 1 --image_url https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg
+```
+
 ## Python API
 
 ### 1. Model download and Optimize for Cloud AI 100

From 5c1ea8f9f38b4716a1da835b2b77adfa685206f3 Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Wed, 21 May 2025 08:29:05 +0000
Subject: [PATCH 2/5] Added load streamer in utils

Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/infer.py    | 17 +++++++++++++----
 QEfficient/utils/__init__.py |  1 +
 QEfficient/utils/_utils.py   | 17 +++++++++++++++++
 3 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 6cc840d98..306b0536a 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -16,7 +16,7 @@
 from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
 
 from QEfficient.base.common import QEFFCommonLoader
-from QEfficient.utils import check_and_assign_cache_dir, load_hf_processor, load_hf_tokenizer
+from QEfficient.utils import check_and_assign_cache_dir, load_hf_processor, load_hf_tokenizer, load_streamer
 from QEfficient.utils.logging_utils import logger
 
 
@@ -45,7 +45,7 @@ def execute_vlm_model(
     Returns:
         :dict: Output from the ``AI_100`` runtime.
     """
-    streamer = TextStreamer(processor.tokenizer)
+    streamer = load_streamer(processor.tokenizer)
     output = qeff_model.generate(
         inputs=inputs,
         streamer=streamer,
@@ -101,16 +101,23 @@ def count_vlm_tokens(
         return_tensors="pt",
         add_special_tokens=False,
     )
+
+    # Get the total number of decoded tokens in the input
     decoded_tokens = processor.tokenizer.decode(split_inputs["input_ids"][0])
 
     total_tokens = decoded_tokens.count("<image>") + decoded_tokens.count("<|image|>")
+
+    # Check if the number of tokens in the image is greater than the prompt length
     if total_tokens > prompt_len:
         logger.warning(
             f"Prompt length {prompt_len} is less than the number of tokens in the image. "
-            f"Increasing increase the prompt length to at least {total_tokens + prompt_len}."
+            f"Increasing the prompt length to at least {total_tokens + prompt_len}."
         )
         prompt_len = total_tokens + prompt_len
-    ctx_len = prompt_len + 50
+
+    # Update the context length only if it is less than the prompt length
+    if ctx_len < prompt_len:
+        ctx_len = prompt_len + ctx_len
 
     return prompt_len, ctx_len, split_inputs
 
@@ -207,6 +214,8 @@ def main(
             cache_dir=cache_dir,
             hf_token=hf_token,
         )
+
+        # Count the number of tokens required in the input and update the prompt length and context length accordingly
         prompt_len, ctx_len, inputs = count_vlm_tokens(
             processor=processor,
             prompt_len=prompt_len,
diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py
index f73998302..4825ddbec 100755
--- a/QEfficient/utils/__init__.py
+++ b/QEfficient/utils/__init__.py
@@ -21,6 +21,7 @@
     hf_download,
     load_hf_processor,
     load_hf_tokenizer,
+    load_streamer,
     login_and_download_hf_lm,
     onnx_exists,
     padding_check_and_fix,
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index f8bc5753c..5bfb01433 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -20,6 +20,7 @@
 from requests.exceptions import HTTPError
 from transformers import (
     AutoProcessor,
+    TextStreamer,
     AutoTokenizer,
     PreTrainedTokenizer,
     PreTrainedTokenizerFast,
@@ -220,6 +221,22 @@
     return processor
 
 
+def load_streamer(
+    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+):
+    """
+    Loads the streamer for the given tokenizer.
+    --------
+
+    tokenizer: `Union[PreTrainedTokenizer, PreTrainedTokenizerFast]` - Pass model tokenizer to load streamer.
+
+    Return:
+        TextStreamer object for the given tokenizer.
+    """
+    logger.info("Loading Streamer")
+    return TextStreamer(tokenizer)
+
+
 def get_qpc_dir_path(
     model_card_name,
     num_cores,

From 43bcdbd5ef50f40ac4f5f8eb9fbe7e71f70c5d1f Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Wed, 21 May 2025 08:30:10 +0000
Subject: [PATCH 3/5] Added load streamer in utils

Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/infer.py  | 2 +-
 QEfficient/utils/_utils.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 306b0536a..27cd51a4d 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -12,7 +12,7 @@
 
 import requests
 from PIL import Image
-from transformers import PreTrainedModel, TextStreamer
+from transformers import PreTrainedModel
 from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
 
 from QEfficient.base.common import QEFFCommonLoader
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index 5bfb01433..ab93fc488 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -20,10 +20,10 @@
 from requests.exceptions import HTTPError
 from transformers import (
     AutoProcessor,
-    TextStreamer,
     AutoTokenizer,
     PreTrainedTokenizer,
     PreTrainedTokenizerFast,
+    TextStreamer,
 )
 
 from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants
@@ -222,7 +222,7 @@ def load_streamer(
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
 ):
     """
     Loads the streamer for the given tokenizer.
@@ -234,7 +234,7 @@ def load_streamer(
         TextStreamer object for the given tokenizer.
     """
     logger.info("Loading Streamer")
-    return TextStreamer(tokenizer)
+    return TextStreamer(tokenizer)
 
 
 def get_qpc_dir_path(

From 557968612f16b0f96cdd5a6e49c4cb86f0cca197 Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Wed, 21 May 2025 08:34:14 +0000
Subject: [PATCH 4/5] Ruff format

Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/infer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 27cd51a4d..3d3af4858 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -105,15 +105,15 @@ def count_vlm_tokens(
     # Get the total number of decoded tokens in the input
     decoded_tokens = processor.tokenizer.decode(split_inputs["input_ids"][0])
 
-    total_tokens = decoded_tokens.count("<image>") + decoded_tokens.count("<|image|>")
+    total_image_tokens = decoded_tokens.count("<image>") + decoded_tokens.count("<|image|>")
 
     # Check if the number of tokens in the image is greater than the prompt length
-    if total_tokens > prompt_len:
+    if total_image_tokens > prompt_len:
         logger.warning(
             f"Prompt length {prompt_len} is less than the number of tokens in the image. "
-            f"Increasing the prompt length to at least {total_tokens + prompt_len}."
+            f"Increasing the prompt length to at least {total_image_tokens + prompt_len}."
         )
-        prompt_len = total_tokens + prompt_len
+        prompt_len = total_image_tokens + prompt_len
 
     # Update the context length only if it is less than the prompt length
     if ctx_len < prompt_len:

From f34d554b1937a11eac186c194a34f7012430e7b2 Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Thu, 3 Jul 2025 06:07:58 +0000
Subject: [PATCH 5/5] Handled Causal LM model's case when image_path or image_url is passed

Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/infer.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 368f6103f..a8563accc 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -207,10 +207,9 @@ def main(
     config = qeff_model.model.config
     architecture = config.architectures[0] if config.architectures else None
 
-    if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values() and (
-        kwargs.pop("img_size", None) or image_path or image_url
-    ):
-        logger.warning(f"Skipping image arguments as they are not valid for {architecture}")
+    if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
+        if kwargs.pop("img_size", None) or image_path or image_url:
+            logger.warning(f"Skipping image arguments as they are not valid for {architecture}")
     else:
         processor = load_hf_processor(
             pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
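
As a quick illustration of what the new check does, the sketch below mirrors the prompt-length adjustment performed by count_vlm_tokens(), but as a standalone helper built only on a Hugging Face processor. It is a minimal sketch, not the QEfficient implementation: the adjust_lengths name, the conversation layout, and the model name and image URL in the commented usage are assumptions for demonstration only.

```python
def adjust_lengths(processor, image, prompt, prompt_len=32, ctx_len=128):
    """Count the image placeholder tokens the processor inserts and grow
    prompt_len / ctx_len the same way the patched count_vlm_tokens() does."""
    conversation = [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]},
    ]
    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    inputs = processor(text=text, images=image, return_tensors="pt", add_special_tokens=False)
    decoded = processor.tokenizer.decode(inputs["input_ids"][0])
    # "<image>" (Llava) and "<|image|>" (Mllama) are the placeholder strings assumed here.
    image_tokens = decoded.count("<image>") + decoded.count("<|image|>")
    if image_tokens > prompt_len:
        prompt_len = image_tokens + prompt_len  # make room for the image tokens
    if ctx_len < prompt_len:
        ctx_len = prompt_len + ctx_len  # keep headroom for the text prompt and generation
    return prompt_len, ctx_len, inputs


# Example usage (model name and image URL are placeholders):
# from transformers import AutoProcessor
# from PIL import Image
# import requests
#
# processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
# image = Image.open(requests.get("https://example.com/sample.jpg", stream=True).raw)
# prompt_len, ctx_len, inputs = adjust_lengths(processor, image, "Describe the image")
```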