Commit d437aac

Adding ccl_enabled flag during model loading and passing CCL lists during the compilation process
Signed-off-by: Vahid Janfaza <[email protected]>
1 parent 1b9a004 commit d437aac

File tree: 5 files changed (+13, −13 lines)


examples/performance/compute_context_length/gemma3.py

Lines changed: 2 additions & 2 deletions
@@ -29,8 +29,8 @@
 ## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.

 ctx_len = 8192
-comp_ctx_lengths_prefill = [3072] # None #
-comp_ctx_lengths_decode = [4096, ctx_len] # None #
+comp_ctx_lengths_prefill = [3072]
+comp_ctx_lengths_decode = [4096, ctx_len]

 # pass HF_TOKEN if gated model
 # For running the model in single QPC approach use kv_offload=False. For Dual QPC approach use kv_offload=True ###
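The comment in this hunk describes the CCL policy: generation starts at the first value in the list and moves to a larger compiled context length once the KV-cache index crosses the current threshold. A toy sketch of that bucket selection, illustrative only and not QEfficient's actual internals:

```python
# Toy illustration of CCL bucket selection (not QEfficient internals):
# given a growing KV-cache index, pick the smallest compiled context
# length from the list that still fits it.
def pick_ccl(cache_index: int, ccl_list: list[int]) -> int:
    for ccl in sorted(ccl_list):
        if cache_index < ccl:
            return ccl
    return max(ccl_list)  # fall back to the largest compiled length

assert pick_ccl(1000, [4096, 8192]) == 4096  # still fits the 4096 bucket
assert pick_ccl(5000, [4096, 8192]) == 8192  # crossed the 4096 threshold
```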

examples/performance/compute_context_length/gpt_oss.py

Lines changed: 2 additions & 2 deletions
@@ -22,9 +22,9 @@
 ctx_len = 4096
 # In moe models like gpt-oss, since prefill_seq_len=1 both comp_ctx_lengths_prefill and comp_ctx_lengths_decode can share similar lists.
 # Set the list of ccl during prefilling process
-comp_ctx_lengths_prefill = [512, ctx_len] # None #
+comp_ctx_lengths_prefill = [512, ctx_len]
 # Set the list of ccl during decoding process
-comp_ctx_lengths_decode = [512, ctx_len] # None #
+comp_ctx_lengths_decode = [512, ctx_len]


 qeff_model = QEFFAutoModelForCausalLM.from_pretrained(

examples/performance/compute_context_length/llama4.py

Lines changed: 4 additions & 4 deletions
@@ -51,7 +51,7 @@
     ctx_len=ctx_len,
     img_size=336,
     num_cores=16,
-    num_devices=4,
+    num_devices=8,
     max_num_tiles=17,
     mxfp6_matmul=True,
     mxint8_kv_cache=True,
@@ -83,7 +83,7 @@
 )

 streamer = TextStreamer(tokenizer)
-output = qeff_model.generate(inputs=inputs, device_ids=[0, 1, 2, 3], generation_len=100)
+output = qeff_model.generate(inputs=inputs, device_ids=[0, 1, 2, 3, 4, 5, 6, 7], generation_len=100)
 print(output.generated_ids)
 print(tokenizer.batch_decode(output.generated_ids))
 print(output)
@@ -95,7 +95,7 @@
     ctx_len=ctx_len,
     img_size=336,
     num_cores=16,
-    num_devices=4,
+    num_devices=8,
     max_num_tiles=17,
     mxfp6_matmul=True,
     mxint8_kv_cache=True,
@@ -129,7 +129,7 @@
 )
 inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
 streamer = TextStreamer(tokenizer)
-output = qeff_model.generate(inputs=inputs, device_ids=[8, 9, 10, 11], generation_len=100)
+output = qeff_model.generate(inputs=inputs, device_ids=[0, 1, 2, 3, 4, 5, 6, 7], generation_len=100)
 print(output.generated_ids)
 print(tokenizer.batch_decode(output.generated_ids))
 print(output)

examples/performance/compute_context_length/qwen2_5_vl_cb.py

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@
     processor=processor,
     images=image_urls,
     generation_len=100,
-    device_ids=[28, 29, 30, 31],
+    device_ids=[0, 1, 2, 3],
 )
 print(output.generated_ids)
 print(tokenizer.batch_decode(output.generated_ids))

examples/performance/compute_context_length/qwen3moe.py

Lines changed: 4 additions & 4 deletions
@@ -12,7 +12,7 @@

 model_name = "Qwen/Qwen3-30B-A3B-Instruct-2507"
 """
-# For CB inference, set continuous_batching to True and add full_batch_size,mxfp6,mint8 argument in compile function
+# For CB inference, set continuous_batching to True and add full_batch_size,mxfp6,mxint8 argument in compile function
 # We will use prompt_len=1 for compilation for both cb and non-cb inference
 """

@@ -27,8 +27,8 @@
 ctx_len = 1024
 prefill_seq_len = 1
 # In moe models when compiling with prefill_seq_len=1 and non-continuous-batching mode, prefill and decode will share the same ccl specializations.
-comp_ctx_lengths_prefill = [256, 512, ctx_len] # None #
-comp_ctx_lengths_decode = [256, 512, ctx_len] # None #
+comp_ctx_lengths_prefill = [256, 512, ctx_len]
+comp_ctx_lengths_decode = [256, 512, ctx_len]

 model = QEFFAutoModelForCausalLM.from_pretrained(
     model_name,
@@ -49,6 +49,6 @@
     comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
     comp_ctx_lengths_decode=comp_ctx_lengths_decode,
 )
-# mos=1,
+
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer)
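Taken together, the five examples share one pattern: enable compute-context-length (CCL) support when loading the model, then pass the CCL lists when compiling. Below is a minimal sketch of that flow, assuming the ccl_enabled keyword named in the commit message and the compile/generate calls shown in qwen3moe.py above; exact signatures may differ across QEfficient versions:

```python
from transformers import AutoTokenizer

from QEfficient import QEFFAutoModelForCausalLM

model_name = "Qwen/Qwen3-30B-A3B-Instruct-2507"
ctx_len = 1024

# 1. Load with CCL support enabled (ccl_enabled is assumed from the commit message).
model = QEFFAutoModelForCausalLM.from_pretrained(model_name, ccl_enabled=True)

# 2. Compile with the CCL lists: the runtime starts at the first threshold and
#    switches to a larger compiled context length each time the KV-cache index
#    exceeds the current one.
model.compile(
    prefill_seq_len=1,
    ctx_len=ctx_len,
    num_cores=16,
    comp_ctx_lengths_prefill=[256, 512, ctx_len],
    comp_ctx_lengths_decode=[256, 512, ctx_len],
)

# 3. Generate as usual.
tokenizer = AutoTokenizer.from_pretrained(model_name)
exec_info = model.generate(prompts=["My name is"], tokenizer=tokenizer)
```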
