Adding ccl_enabled flag during model loading and passing CCL lists during compilation process

vjanfaza · vjanfaza · commit 6dc9d417da55 · 2025-12-02T14:46:12.000-08:00
Signed-off-by: Vahid Janfaza <vjanfaza@qti.qualcomm.com>
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
@@ -909,7 +909,7 @@ def __init__(
         self,
         model: nn.Module,
         continuous_batching: bool = False,
-        ccl_enabled: bool = False,
+        qaic_config: Optional[dict] = None,
         **kwargs,
     ):
         """
@@ -935,7 +935,7 @@ def __init__(
         self.vision_model = QEffVisionEncoderForTextImageToTextModel(model, **kwargs)
         self.lang_model = QEffCausalLMForTextImageToTextModel(model, **kwargs)
         self.continuous_batching = continuous_batching
-        self.ccl_enabled = ccl_enabled
+        self.ccl_enabled = qaic_config.get("ccl_enabled", False)
         self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None
         self.input_shapes, self.output_names = None, None
 
@@ -955,7 +955,7 @@ def model_name(self) -> str:
         return mname
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path: str, qaic_config: Optional[dict] = None, **kwargs):
         """
         Load a QEfficient multimodal model for dual QPC from a pretrained HuggingFace model or local path.
 
@@ -980,13 +980,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
-        ccl_enabled = kwargs.pop("ccl_enabled", None)
 
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
         return cls(
             model,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            ccl_enabled=ccl_enabled,
+            qaic_config=qaic_config,
             **kwargs,
         )
 
@@ -1190,8 +1189,9 @@ def compile(
 
         # For supporting VLLM and Disaggregated with CCL
         if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
-            self.comp_ctx_lengths_prefill = comp_ctx_lengths_prefill
-            self.comp_ctx_lengths_decode = comp_ctx_lengths_decode
+            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+                comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
+            )
 
         specializations, compiler_options = self.model.get_specializations(
             batch_size=batch_size,
@@ -1614,7 +1614,7 @@ class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, Multimodal
     def __init__(
         self,
         model: nn.Module,
-        ccl_enabled: bool = False,
+        qaic_config: Optional[dict] = None,
         **kwargs,
     ):
         """
@@ -1647,13 +1647,14 @@ def __init__(
             else:
                 self.model.config.use_cache = True
         self.hash_params["qeff_auto_class"] = self.__class__.__name__
-        self.ccl_enabled = ccl_enabled
+        self.ccl_enabled = qaic_config.get("ccl_enabled", False)
         self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None
 
     @classmethod
     def from_pretrained(
         cls,
         pretrained_model_name_or_path,
+        qaic_config: Optional[dict] = None,
         *args,
         **kwargs,
     ):
@@ -1684,7 +1685,6 @@ def from_pretrained(
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
-        ccl_enabled = kwargs.pop("ccl_enabled", None)
 
         from transformers import AutoConfig
 
@@ -1696,7 +1696,7 @@ def from_pretrained(
         return cls(
             model,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            ccl_enabled=ccl_enabled,
+            qaic_config=qaic_config,
             **kwargs,
         )
 
@@ -1823,8 +1823,9 @@ def compile(
 
         # For supporting VLLM and Disaggregated with CCL
         if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
-            self.comp_ctx_lengths_prefill = comp_ctx_lengths_prefill
-            self.comp_ctx_lengths_decode = comp_ctx_lengths_decode
+            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+                comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
+            )
 
         # Get specializations from modelling file
         # TODO: expose this via the auto class as well
@@ -2207,7 +2208,7 @@ def __new__(
         model: nn.Module,
         kv_offload: Optional[bool] = True,
         continuous_batching: bool = False,
-        ccl_enabled: bool = False,
+        qaic_config: Optional[dict] = None,
         **kwargs,
     ):
         """
@@ -2231,10 +2232,10 @@ def __new__(
         """
         if kv_offload:
             return _QEffAutoModelForImageTextToTextDualQPC(
-                model, continuous_batching, ccl_enabled=ccl_enabled, **kwargs
+                model, continuous_batching, qaic_config=qaic_config, **kwargs
             )
         else:
-            return _QEFFAutoModelForImageTextToTextSingleQPC(model, ccl_enabled=ccl_enabled, **kwargs)
+            return _QEFFAutoModelForImageTextToTextSingleQPC(model, qaic_config=qaic_config, **kwargs)
 
     @classmethod
     @with_replaced_quantizers
@@ -2284,15 +2285,14 @@ def from_pretrained(
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
-        ccl_enabled = kwargs.pop("ccl_enabled", None)
 
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
         return cls(
             model,
             kv_offload=kv_offload,
             continuous_batching=continuous_batching,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            ccl_enabled=ccl_enabled,
+            qaic_config=qaic_config,
             **kwargs,
         )
 
@@ -2345,7 +2345,6 @@ def __init__(
         model: nn.Module,
         continuous_batching: bool = False,
         qaic_config: Optional[dict] = None,
-        ccl_enabled: bool = False,
         **kwargs,
     ):
         """
@@ -2400,7 +2399,7 @@ def __init__(
         self.is_tlm = transformed
 
         self.hash_params["qeff_auto_class"] = self.__class__.__name__
-        self.ccl_enabled = ccl_enabled
+        self.ccl_enabled = qaic_config.get("ccl_enabled", False)
         self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None
 
         # ---Sampling---
@@ -2494,7 +2493,6 @@ def from_pretrained(
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kv_offload = kwargs.pop("kv_offload", None)
-        ccl_enabled = kwargs.pop("ccl_enabled", None)
 
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
@@ -2508,15 +2506,14 @@ def from_pretrained(
                 model,
                 kv_offload=kv_offload,
                 pretrained_model_name_or_path=pretrained_model_name_or_path,
-                ccl_enabled=ccl_enabled,
+                qaic_config=qaic_config,
                 **kwargs,
             )
         return cls(
             model,
             continuous_batching=continuous_batching,
             qaic_config=qaic_config,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            ccl_enabled=ccl_enabled,
             **kwargs,
         )
 
@@ -2964,6 +2961,9 @@ def compile(
                 self.comp_ctx_lengths_prefill = comp_ctx_lengths_prefill
                 self.comp_ctx_lengths_decode = comp_ctx_lengths_decode
 
+            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len, prefill_seq_len
+            )
         # --- Validation ---
         if prefill_only is not None and not isinstance(prefill_only, bool):
             raise TypeError("`prefill_only` must be a boolean.")
diff --git a/examples/performance/compute_context_length/basic_inference.py b/examples/performance/compute_context_length/basic_inference.py
@@ -117,7 +117,9 @@ def main():
     model = QEFFAutoModelForCausalLM.from_pretrained(
         args.model_name,
         continuous_batching=args.continuous_batching,
-        ccl_enabled=args.ccl_enabled,
+        qaic_config={
+            "ccl_enabled":args.ccl_enabled,
+        },
     )
 
     # Compile the model
diff --git a/examples/performance/compute_context_length/gemma3.py b/examples/performance/compute_context_length/gemma3.py
@@ -38,8 +38,10 @@
     model_id,
     config=config,
     attn_implementation="eager",
-    kv_offload=True,
-    ccl_enabled=True,
+    kv_offload=False,
+    qaic_config={
+        "ccl_enabled":True,
+    },
 )
 
 ### use skip_vision=True, if want to run only text, or false ###
@@ -58,7 +60,7 @@
         aic_enable_depth_first=True,
         skip_vision=True,
         mos=1,
-        node_precision_info="examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_4b.yaml",
+        node_precision_info="examples/performance/compute_context_length/fp32_nodes_gemma3_4b.yaml",
         comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
         comp_ctx_lengths_decode=comp_ctx_lengths_decode,
     )
@@ -96,7 +98,7 @@
         mxint8_kv_cache=False,
         aic_enable_depth_first=True,
         mos=1,
-        node_precision_info="examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_4b.yaml",
+        node_precision_info="examples/performance/compute_context_length/fp32_nodes_gemma3_4b.yaml",
         comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
         comp_ctx_lengths_decode=comp_ctx_lengths_decode,
     )
diff --git a/examples/performance/compute_context_length/gpt_oss.py b/examples/performance/compute_context_length/gpt_oss.py
@@ -21,15 +21,14 @@
 
 ctx_len = 4096
 # In moe models like gpt-oss, since prefill_seq_len=1 both comp_ctx_lengths_prefill and comp_ctx_lengths_decode can share similar lists.
-# Set the list of ccl during prefilling process
-comp_ctx_lengths_prefill = [512, ctx_len]
-# Set the list of ccl during decoding process
-comp_ctx_lengths_decode = [512, ctx_len]
-
+# Set the single CCL list shared by the prefill and decode phases
+comp_ctx_lengths_prefill = comp_ctx_lengths_decode = [1024, ctx_len]
 
 qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
     model_id,
-    ccl_enabled=True,
+    qaic_config={
+        "ccl_enabled":True,
+    },
 )
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
diff --git a/examples/performance/compute_context_length/granite_vision.py b/examples/performance/compute_context_length/granite_vision.py
@@ -41,7 +41,9 @@ def run_model(
         model_name,
         token=token,
         kv_offload=kv_offload,
-        ccl_enabled=ccl_enabled,
+        qaic_config={
+            "ccl_enabled":ccl_enabled,
+        },
     )
 
     ## STEP - 2 Export & Compile the Model
diff --git a/examples/performance/compute_context_length/internvl.py b/examples/performance/compute_context_length/internvl.py
@@ -188,7 +188,9 @@ def run_intern_on_aic(
         model_name,
         kv_offload=kv_offload,
         trust_remote_code=True,
-        ccl_enabled=ccl_enabled,
+        qaic_config={
+            "ccl_enabled":ccl_enabled,
+        },
     )
 
     ## STEP 2 -- EXPORT & COMPILE THE MODEL
diff --git a/examples/performance/compute_context_length/llama4.py b/examples/performance/compute_context_length/llama4.py
@@ -36,7 +36,9 @@
     attn_implementation="eager",
     kv_offload=True,
     config=config,
-    ccl_enabled=True,
+    qaic_config={
+        "ccl_enabled":True,
+    },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
 processor = AutoProcessor.from_pretrained(model_id)
diff --git a/examples/performance/compute_context_length/llama4_cb.py b/examples/performance/compute_context_length/llama4_cb.py
@@ -41,7 +41,9 @@
         kv_offload=True,
         config=config,
         continuous_batching=True,
-        ccl_enabled=True,
+        qaic_config={
+            "ccl_enabled":True,
+        },
     )
 
     qeff_model.compile(
@@ -66,7 +68,9 @@
         attn_implementation="eager",
         kv_offload=True,
         config=config,
-        ccl_enabled=True,
+        qaic_config={
+            "ccl_enabled":True,
+        },
     )
 
     qeff_model.compile(
diff --git a/examples/performance/compute_context_length/llama4_multi_image.py b/examples/performance/compute_context_length/llama4_multi_image.py
@@ -36,7 +36,9 @@
     attn_implementation="eager",
     kv_offload=True,
     config=config,
-    ccl_enabled=True,
+    qaic_config={
+        "ccl_enabled":True,
+    },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
 processor = AutoProcessor.from_pretrained(model_id)
diff --git a/examples/performance/compute_context_length/mistral3.py b/examples/performance/compute_context_length/mistral3.py
@@ -46,7 +46,9 @@ def run_model(
         model_name,
         kv_offload=kv_offload,
         config=config,
-        ccl_enabled=ccl_enabled,
+        qaic_config={
+            "ccl_enabled":ccl_enabled,
+        },
     )
 
     ## STEP - 2 Export & Compile the Model
diff --git a/examples/performance/compute_context_length/molmo.py b/examples/performance/compute_context_length/molmo.py
@@ -15,7 +15,7 @@
 
 model_id = "allenai/Molmo-7B-D-0924"
 config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
-
+# For testing purposes only
 # config.num_hidden_layers = 2
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
@@ -33,10 +33,12 @@
 
 qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
     model_id,
-    kv_offload=True,
+    kv_offload=False,
     trust_remote_code=True,
     config=config,
-    ccl_enabled=True,
+    qaic_config={
+        "ccl_enabled":True,
+    },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
diff --git a/examples/performance/compute_context_length/qwen2_5_vl.py b/examples/performance/compute_context_length/qwen2_5_vl.py
@@ -19,7 +19,8 @@
 ## For AWQ model update pytorch version to 2.8.*
 model_id = "Qwen/Qwen2.5-VL-32B-Instruct"
 config = AutoConfig.from_pretrained(model_id)
-# config.text_config.num_hidden_layers = 2
+# For testing purposes only: truncates the text model to 2 hidden layers; comment out for full-model runs
+config.text_config.num_hidden_layers = 2
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
 ## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
@@ -38,7 +39,9 @@
     attn_implementation="eager",
     kv_offload=True,
     config=config,
-    ccl_enabled=True,
+    qaic_config={
+        "ccl_enabled":True,
+    },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
 processor = AutoProcessor.from_pretrained(model_id)
diff --git a/examples/performance/compute_context_length/qwen2_5_vl_cb.py b/examples/performance/compute_context_length/qwen2_5_vl_cb.py
diff --git a/examples/performance/compute_context_length/qwen3moe.py b/examples/performance/compute_context_length/qwen3moe.py
diff --git a/examples/performance/compute_context_length/vlm_inference.py b/examples/performance/compute_context_length/vlm_inference.py