Commit f25cd52

Adding ccl_enabled flag during model loading and passing CCL lists during compilation process

Signed-off-by: Vahid Janfaza <[email protected]>

1 parent 0912c39 commit f25cd52

File tree

4 files changed: +7 -7 lines changed

examples/ccl_gpt_oss.py

Lines changed: 3 additions & 3 deletions

@@ -15,16 +15,16 @@
 ## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
-## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
+## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
 ## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.

 ctx_len = 4096
 # In moe models like gpt-oss, since prefill_seq_len=1 both comp_ctx_lengths_prefill and comp_ctx_lengths_decode can share similar lists.
 # Set the list of ccl during prefilling process
-comp_ctx_lengths_prefill = [512, ctx_len] #None #
+comp_ctx_lengths_prefill = [512, ctx_len] # None #
 # Set the list of ccl during decoding process
-comp_ctx_lengths_decode = [512, ctx_len] #None #
+comp_ctx_lengths_decode = [512, ctx_len] # None #


 qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
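The comment block in the diff above describes how, during decoding, a compute-context-length (CCL) is picked from comp_ctx_lengths_decode based on the cache index, stepping up once the index exceeds the current threshold. A minimal sketch of that selection rule, assuming this behavior; `select_ccl` is a hypothetical helper, not QEfficient's actual implementation:

```python
def select_ccl(comp_ctx_lengths, cache_index):
    """Return the smallest compute-context-length that still covers cache_index.

    Hypothetical illustration of the selection rule described in the example
    comments: start from the smallest adequate value and move to the next one
    only when the cache index exceeds the current threshold.
    """
    for ccl in comp_ctx_lengths:
        if cache_index < ccl:
            return ccl
    # Cache index exceeds every threshold: fall back to the full context length.
    return comp_ctx_lengths[-1]


ctx_len = 4096
comp_ctx_lengths_decode = [512, ctx_len]

print(select_ccl(comp_ctx_lengths_decode, 100))  # 512: still within the first threshold
print(select_ccl(comp_ctx_lengths_decode, 600))  # 4096: cache index passed 512
```

With comp_ctx_lengths=None the model would skip this selection entirely and run at its default context length, as the example comments note.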

examples/ccl_llama4_CB_example_vision_lang.py

Lines changed: 1 addition & 1 deletion

@@ -23,7 +23,7 @@
 ## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
-## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
+## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
 ## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
examples/ccl_llama4_multi_image_example.py

Lines changed: 1 addition & 1 deletion

@@ -21,7 +21,7 @@
 ## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
-## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
+## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
 ## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
examples/ccl_qwen2_5_vl_CB.py

Lines changed: 2 additions & 2 deletions

@@ -22,7 +22,7 @@
 ## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
-## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
+## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
 ## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.

@@ -81,7 +81,7 @@
     processor=processor,
     images=image_urls,
     generation_len=100,
-    device_ids=[28,29,30,31],
+    device_ids=[28, 29, 30, 31],
 )
 print(output.generated_ids)
 print(tokenizer.batch_decode(output.generated_ids))
