Commit d437aac

Adding ccl_enabled flag during model loading and passing CCL lists during the compilation process
Signed-off-by: Vahid Janfaza <[email protected]>
1 parent 1b9a004 commit d437aac

File tree: 5 files changed (+13, −13 lines)


examples/performance/compute_context_length/gemma3.py

Lines changed: 2 additions & 2 deletions
@@ -29,8 +29,8 @@
 ## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.

 ctx_len = 8192
-comp_ctx_lengths_prefill = [3072] # None #
-comp_ctx_lengths_decode = [4096, ctx_len] # None #
+comp_ctx_lengths_prefill = [3072]
+comp_ctx_lengths_decode = [4096, ctx_len]

 # pass HF_TOKEN if gated model
 # For running the model in single QPC approach use kv_offload=False. For Dual QPC approach use kv_offload=True ###
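The comment in this hunk describes the CCL policy: generation starts at the first value in the list and moves to a larger compiled context length once the KV-cache index crosses the current threshold. A toy sketch of that bucket selection, illustrative only and not QEfficient's actual internals:

```python
# Toy illustration of CCL bucket selection (not QEfficient internals):
# given a growing KV-cache index, pick the smallest compiled context
# length from the list that still fits it.
def pick_ccl(cache_index: int, ccl_list: list[int]) -> int:
    for ccl in sorted(ccl_list):
        if cache_index < ccl:
            return ccl
    return max(ccl_list)  # fall back to the largest compiled length

assert pick_ccl(1000, [4096, 8192]) == 4096  # still fits the 4096 bucket
assert pick_ccl(5000, [4096, 8192]) == 8192  # crossed the 4096 threshold
```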

examples/performance/compute_context_length/gpt_oss.py

Lines changed: 2 additions & 2 deletions
@@ -22,9 +22,9 @@
 ctx_len = 4096
 # In moe models like gpt-oss, since prefill_seq_len=1 both comp_ctx_lengths_prefill and comp_ctx_lengths_decode can share similar lists.
 # Set the list of ccl during prefilling process
-comp_ctx_lengths_prefill = [512, ctx_len] # None #
+comp_ctx_lengths_prefill = [512, ctx_len]
 # Set the list of ccl during decoding process
-comp_ctx_lengths_decode = [512, ctx_len] # None #
+comp_ctx_lengths_decode = [512, ctx_len]


 qeff_model = QEFFAutoModelForCausalLM.from_pretrained(

examples/performance/compute_context_length/llama4.py

Lines changed: 4 additions & 4 deletions
@@ -51,7 +51,7 @@
     ctx_len=ctx_len,
     img_size=336,
     num_cores=16,
-    num_devices=4,
+    num_devices=8,
     max_num_tiles=17,
     mxfp6_matmul=True,
     mxint8_kv_cache=True,
@@ -83,7 +83,7 @@
 )

 streamer = TextStreamer(tokenizer)
-output = qeff_model.generate(inputs=inputs, device_ids=[0, 1, 2, 3], generation_len=100)
+output = qeff_model.generate(inputs=inputs, device_ids=[0, 1, 2, 3, 4, 5, 6, 7], generation_len=100)
 print(output.generated_ids)
 print(tokenizer.batch_decode(output.generated_ids))
 print(output)
@@ -95,7 +95,7 @@
     ctx_len=ctx_len,
     img_size=336,
     num_cores=16,
-    num_devices=4,
+    num_devices=8,
     max_num_tiles=17,
     mxfp6_matmul=True,
     mxint8_kv_cache=True,
@@ -129,7 +129,7 @@
 )
 inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
 streamer = TextStreamer(tokenizer)
-output = qeff_model.generate(inputs=inputs, device_ids=[8, 9, 10, 11], generation_len=100)
+output = qeff_model.generate(inputs=inputs, device_ids=[0, 1, 2, 3, 4, 5, 6, 7], generation_len=100)
 print(output.generated_ids)
 print(tokenizer.batch_decode(output.generated_ids))
 print(output)

examples/performance/compute_context_length/qwen2_5_vl_cb.py

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@
     processor=processor,
     images=image_urls,
     generation_len=100,
-    device_ids=[28, 29, 30, 31],
+    device_ids=[0, 1, 2, 3],
 )
 print(output.generated_ids)
 print(tokenizer.batch_decode(output.generated_ids))

examples/performance/compute_context_length/qwen3moe.py

Lines changed: 4 additions & 4 deletions
@@ -12,7 +12,7 @@

 model_name = "Qwen/Qwen3-30B-A3B-Instruct-2507"
 """
-# For CB inference, set continuous_batching to True and add full_batch_size,mxfp6,mint8 argument in compile function
+# For CB inference, set continuous_batching to True and add full_batch_size,mxfp6,mxint8 argument in compile function
 # We will use prompt_len=1 for compilation for both cb and non-cb inference
 """

@@ -27,8 +27,8 @@
 ctx_len = 1024
 prefill_seq_len = 1
 # In moe models when compiling with prefill_seq_len=1 and non-continuous-batching mode, prefill and decode will share the same ccl specializations.
-comp_ctx_lengths_prefill = [256, 512, ctx_len] # None #
-comp_ctx_lengths_decode = [256, 512, ctx_len] # None #
+comp_ctx_lengths_prefill = [256, 512, ctx_len]
+comp_ctx_lengths_decode = [256, 512, ctx_len]

 model = QEFFAutoModelForCausalLM.from_pretrained(
     model_name,
@@ -49,6 +49,6 @@
     comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
     comp_ctx_lengths_decode=comp_ctx_lengths_decode,
 )
-# mos=1,
+
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer)
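Taken together, the five examples share one pattern: enable compute-context-length (CCL) support when loading the model, then pass the CCL lists when compiling. Below is a minimal sketch of that flow, assuming the ccl_enabled keyword named in the commit message and the compile/generate calls shown in qwen3moe.py above; exact signatures may differ across QEfficient versions:

```python
from transformers import AutoTokenizer

from QEfficient import QEFFAutoModelForCausalLM

model_name = "Qwen/Qwen3-30B-A3B-Instruct-2507"
ctx_len = 1024

# 1. Load with CCL support enabled (ccl_enabled is assumed from the commit message).
model = QEFFAutoModelForCausalLM.from_pretrained(model_name, ccl_enabled=True)

# 2. Compile with the CCL lists: the runtime starts at the first threshold and
#    switches to a larger compiled context length each time the KV-cache index
#    exceeds the current one.
model.compile(
    prefill_seq_len=1,
    ctx_len=ctx_len,
    num_cores=16,
    comp_ctx_lengths_prefill=[256, 512, ctx_len],
    comp_ctx_lengths_decode=[256, 512, ctx_len],
)

# 3. Generate as usual.
tokenizer = AutoTokenizer.from_pretrained(model_name)
exec_info = model.generate(prompts=["My name is"], tokenizer=tokenizer)
```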
