diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index cbff5be91..b46aa7c43 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -909,7 +909,7 @@ def __init__( self, model: nn.Module, continuous_batching: bool = False, - qaic_config: Optional[dict] = None, + ccl_enabled: bool = False, **kwargs, ): """ @@ -932,11 +932,11 @@ def __init__( self.model = model self.config = model.config - self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(qaic_config) - self.vision_model = QEffVisionEncoderForTextImageToTextModel(model, **kwargs) self.lang_model = QEffCausalLMForTextImageToTextModel(model, **kwargs) self.continuous_batching = continuous_batching + self.ccl_enabled = ccl_enabled + self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None self.input_shapes, self.output_names = None, None @property @@ -955,7 +955,7 @@ def model_name(self) -> str: return mname @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, qaic_config: Optional[dict] = None, **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs): """ Load a QEfficient multimodal model for dual QPC from a pretrained HuggingFace model or local path. @@ -980,11 +980,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, qaic_config: Option logger.warning("Updating low_cpu_mem_usage=False") kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) + ccl_enabled = kwargs.pop("ccl_enabled", None) + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs) return cls( model, pretrained_model_name_or_path=pretrained_model_name_or_path, - qaic_config=qaic_config, + ccl_enabled=ccl_enabled, **kwargs, ) @@ -1090,6 +1092,8 @@ def compile( compile_dir: Optional[str] = None, *, prefill_seq_len: Optional[int] = None, + comp_ctx_lengths_prefill: Optional[List[int]] = None, + comp_ctx_lengths_decode: Optional[List[int]] = None, ctx_len: Optional[int] = None, batch_size: int = 1, full_batch_size: Optional[int] = None, @@ -1174,10 +1178,20 @@ def compile( output_names = self.model.get_output_names(kv_offload=True) + # if ccl_enabled is True read Compute-Context-Length lists + if self.ccl_enabled: + if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None: + logger.warning( + "Please set comp_ctx_lengths_prefill and comp_ctx_lengths_decode with a proper list of context lengths. Using non-CCL default model." + ) + self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations( + comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len + ) + # For supporting VLLM and Disaggregated with CCL - if "comp_ctx_lengths_prefill" in compiler_options: - self.comp_ctx_lengths_prefill = compiler_options.pop("comp_ctx_lengths_prefill") - self.comp_ctx_lengths_decode = compiler_options.pop("comp_ctx_lengths_decode") + if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None: + self.comp_ctx_lengths_prefill = comp_ctx_lengths_prefill + self.comp_ctx_lengths_decode = comp_ctx_lengths_decode specializations, compiler_options = self.model.get_specializations( batch_size=batch_size, @@ -1600,7 +1614,7 @@ class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, Multimodal def __init__( self, model: nn.Module, - qaic_config: Optional[dict] = None, + ccl_enabled: bool = False, **kwargs, ): """ @@ -1622,8 +1636,6 @@ def __init__( raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") super().__init__(model, **kwargs) - self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(qaic_config) - # to handle internvl models if hasattr(self.model.config, "llm_config") and hasattr(self.model.config, "vision_config"): self.model.config.llm_config.use_cache = True @@ -1635,12 +1647,13 @@ def __init__( else: self.model.config.use_cache = True self.hash_params["qeff_auto_class"] = self.__class__.__name__ + self.ccl_enabled = ccl_enabled + self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None @classmethod def from_pretrained( cls, pretrained_model_name_or_path, - qaic_config: Optional[dict] = None, *args, **kwargs, ): @@ -1671,6 +1684,8 @@ def from_pretrained( logger.warning("Updating low_cpu_mem_usage=False") kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) + ccl_enabled = kwargs.pop("ccl_enabled", None) + from transformers import AutoConfig config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True) @@ -1681,7 +1696,7 @@ def from_pretrained( return cls( model, pretrained_model_name_or_path=pretrained_model_name_or_path, - qaic_config=qaic_config, + ccl_enabled=ccl_enabled, **kwargs, ) @@ -1725,6 +1740,8 @@ def compile( *, prefill_seq_len: Optional[int] = None, ctx_len: Optional[int] = None, + comp_ctx_lengths_prefill: Optional[List[int]] = None, + comp_ctx_lengths_decode: Optional[List[int]] = None, batch_size: int = 1, full_batch_size: Optional[int] = None, kv_cache_batch_size: Optional[int] = None, @@ -1794,10 +1811,20 @@ def compile( kv_cache_batch_size = kv_cache_batch_size or full_batch_size or batch_size output_names = self.model.get_output_names() + # if ccl_enabled is True read Compute-Context-Length lists + if self.ccl_enabled: + if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None: + logger.warning( + "Please set comp_ctx_lengths_prefill and comp_ctx_lengths_decode with a proper list of context lengths. Using non-CCL default model." + ) + self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations( + comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len + ) + # For supporting VLLM and Disaggregated with CCL - if "comp_ctx_lengths_prefill" in compiler_options: - self.comp_ctx_lengths_prefill = compiler_options.pop("comp_ctx_lengths_prefill") - self.comp_ctx_lengths_decode = compiler_options.pop("comp_ctx_lengths_decode") + if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None: + self.comp_ctx_lengths_prefill = comp_ctx_lengths_prefill + self.comp_ctx_lengths_decode = comp_ctx_lengths_decode # Get specializations from modelling file # TODO: expose this via the auto class as well @@ -2180,7 +2207,7 @@ def __new__( model: nn.Module, kv_offload: Optional[bool] = True, continuous_batching: bool = False, - qaic_config: Optional[dict] = None, + ccl_enabled: bool = False, **kwargs, ): """ @@ -2204,10 +2231,10 @@ def __new__( """ if kv_offload: return _QEffAutoModelForImageTextToTextDualQPC( - model, continuous_batching, qaic_config=qaic_config, **kwargs + model, continuous_batching, ccl_enabled=ccl_enabled, **kwargs ) else: - return _QEFFAutoModelForImageTextToTextSingleQPC(model, qaic_config=qaic_config, **kwargs) + return _QEFFAutoModelForImageTextToTextSingleQPC(model, ccl_enabled=ccl_enabled, **kwargs) @classmethod @with_replaced_quantizers @@ -2257,6 +2284,7 @@ def from_pretrained( logger.warning("Updating low_cpu_mem_usage=False") kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) + ccl_enabled = kwargs.pop("ccl_enabled", None) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs) return cls( @@ -2264,7 +2292,7 @@ def from_pretrained( kv_offload=kv_offload, continuous_batching=continuous_batching, pretrained_model_name_or_path=pretrained_model_name_or_path, - qaic_config=qaic_config, + ccl_enabled=ccl_enabled, **kwargs, ) @@ -2317,6 +2345,7 @@ def __init__( model: nn.Module, continuous_batching: bool = False, qaic_config: Optional[dict] = None, + ccl_enabled: bool = False, **kwargs, ): """ @@ -2363,8 +2392,6 @@ def __init__( # Set use_cache=True to get KV values as output during ONNX export model.config.use_cache = True - self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(qaic_config) - super().__init__(model, qaic_config=qaic_config, **kwargs) self.num_layers = model.config.num_hidden_layers self.continuous_batching = continuous_batching @@ -2373,6 +2400,8 @@ def __init__( self.is_tlm = transformed self.hash_params["qeff_auto_class"] = self.__class__.__name__ + self.ccl_enabled = ccl_enabled + self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None # ---Sampling--- # Note: SamplerTransform should be applied after all other transforms @@ -2465,6 +2494,7 @@ def from_pretrained( logger.warning("Updating low_cpu_mem_usage=False") kv_offload = kwargs.pop("kv_offload", None) + ccl_enabled = kwargs.pop("ccl_enabled", None) kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) @@ -2478,7 +2508,7 @@ def from_pretrained( model, kv_offload=kv_offload, pretrained_model_name_or_path=pretrained_model_name_or_path, - qaic_config=qaic_config, + ccl_enabled=ccl_enabled, **kwargs, ) return cls( @@ -2486,6 +2516,7 @@ def from_pretrained( continuous_batching=continuous_batching, qaic_config=qaic_config, pretrained_model_name_or_path=pretrained_model_name_or_path, + ccl_enabled=ccl_enabled, **kwargs, ) @@ -2814,6 +2845,8 @@ def compile( *, prefill_seq_len: int = 32, ctx_len: int = 128, + comp_ctx_lengths_prefill: Optional[List[int]] = None, + comp_ctx_lengths_decode: Optional[List[int]] = None, batch_size: int = 1, full_batch_size: Optional[int] = None, kv_cache_batch_size: Optional[int] = None, @@ -2905,10 +2938,18 @@ def compile( """ + # if ccl_enabled is True read Compute-Context-Length lists + if self.ccl_enabled: + if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None: + logger.warning( + "Please set comp_ctx_lengths_prefill and comp_ctx_lengths_decode with a proper list of context lengths. Using non-CCL default model." + ) + self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations( + comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len + ) + # For supporting VLLM and Disaggregated with CCL - if "comp_ctx_lengths_prefill" in compiler_options and "comp_ctx_lengths_decode" in compiler_options: - comp_ctx_lengths_prefill = compiler_options.pop("comp_ctx_lengths_prefill") - comp_ctx_lengths_decode = compiler_options.pop("comp_ctx_lengths_decode") + if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None: if isinstance(comp_ctx_lengths_prefill, str): import ast diff --git a/QEfficient/utils/check_ccl_specializations.py b/QEfficient/utils/check_ccl_specializations.py index 308c69554..0d6a078f6 100644 --- a/QEfficient/utils/check_ccl_specializations.py +++ b/QEfficient/utils/check_ccl_specializations.py @@ -6,14 +6,7 @@ # ----------------------------------------------------------------------------- -def process_ccl_specializations(qaic_config): - if qaic_config is None: - return None, None - ccl_prefill = qaic_config.pop("comp_ctx_lengths_prefill", None) - ccl_decode = qaic_config.pop("comp_ctx_lengths_decode", None) - ctx_len = qaic_config.pop("ctx_len", None) - prefill_seq_len = qaic_config.pop("prefill_seq_len", 128) - +def process_ccl_specializations(ccl_prefill, ccl_decode, ctx_len, prefill_seq_len): if ccl_prefill is None or ccl_decode is None: return None, None diff --git a/examples/performance/README.md b/examples/performance/README.md index 48d34d972..9308ce6db 100644 --- a/examples/performance/README.md +++ b/examples/performance/README.md @@ -95,6 +95,56 @@ python on_device_sampling.py \ --top-p 0.89 ``` +### Compute-Context-Length + +Calculating Context-Length dynamically during inference for getting the best related performance within each window of context-length + +#### compute_context_length/basic_inference.py +Configure CCL parameters: 1) ccl-enabled: to activate CCL feature, 2) comp-ctx-lengths-prefill: list of context length to be used during prefilling, and 3) comp-ctx-lengths-decode: list of context lengths to be used during decoding. + +**Usage for Text-only models:** +```bash +python compute_context_length/basic_inference.py \ + --model-name meta-llama/Llama-3.1-8B \ + --num-cores 16 \ + --prefill-seq-len 32 \ + --ctx-len 1024 \ + --ccl-enabled \ + --comp-ctx-lengths-prefill 500,1000 \ + --comp-ctx-lengths-decode 512,1024 +``` + +**Usage for VLM models such as mllama and llava:** +```bash +python compute_context_length/vlm_inference.py \ + --model-name meta-llama/Llama-3.2-11B-Vision-Instruct \ + --hf-token "" \ + --num-cores 16 \ + --prefill-seq-len 32 \ + --ctx-len 8192 \ + --img-size 560 \ + --ccl-enabled \ + --comp-ctx-lengths-prefill 4096 \ + --comp-ctx-lengths-decode 6144,8192 +``` + +**Usage with other MoE and Multimodal models:** +For various models available in compute_context_length directory such as gemma3, gpt_oss, granite_vision, internvl, llama4_cb, llama4_multi_image, llama4, mistral3, molmo, qwen2_5_vl, qwen2_5_vl_cb, and qwen3moe, use the related inference script and only change the model-name and ccl configuration in the related script. The following is an example of each model: +```bash +python compute_context_length/gemma3.py +python compute_context_length/gpt_oss.py +python compute_context_length/granite_vision.py +python compute_context_length/internvl.py +python compute_context_length/llama4_cb.py +python compute_context_length/llama4_multi_image.py +python compute_context_length/llama4.py +python compute_context_length/mistral3.py +python compute_context_length/molmo.py +python compute_context_length/qwen2_5_vl.py +python compute_context_length/qwen2_5_vl_cb.py +python compute_context_length/qwen3moe.py +``` + ## Performance Tips 1. **Speculative Decoding**: Best for long-form generation where draft model is much faster than target diff --git a/examples/performance/compute_context_length/README.md b/examples/performance/compute_context_length/README.md index bbc240645..9f1d29b9a 100644 --- a/examples/performance/compute_context_length/README.md +++ b/examples/performance/compute_context_length/README.md @@ -68,7 +68,7 @@ python vlm_inference.py \ Basic CCL usage with text-only language models. **Supported Models:** -- Llama (3.2, 3.3) +- Llama (3.2, 3.3, swiftkv) - Gemma/Gemma-2 - Mistral - Phi/Phi-3 @@ -77,6 +77,9 @@ Basic CCL usage with text-only language models. - GPT-2, GPT-J - CodeGen - OLMo-2 +- Mistral/Mixtral +- Qwen2 +- Falcon **Command-Line Arguments:** - `--model-name`: HuggingFace model ID (default: meta-llama/Llama-3.2-1B) diff --git a/examples/performance/compute_context_length/basic_inference.py b/examples/performance/compute_context_length/basic_inference.py index a4407b05a..425d038d9 100644 --- a/examples/performance/compute_context_length/basic_inference.py +++ b/examples/performance/compute_context_length/basic_inference.py @@ -46,6 +46,11 @@ def main(): default=1024, help="Maximum context length", ) + parser.add_argument( + "--ccl-enabled", + action="store_true", + help="Enable compute-context-length (CCL) feature", + ) parser.add_argument( "--comp-ctx-lengths-prefill", type=lambda x: [int(i) for i in x.split(",")], @@ -112,11 +117,7 @@ def main(): model = QEFFAutoModelForCausalLM.from_pretrained( args.model_name, continuous_batching=args.continuous_batching, - qaic_config={ - "comp_ctx_lengths_prefill": args.comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": args.comp_ctx_lengths_decode, - "ctx_len": args.ctx_len, # Required for CCL validation - }, + ccl_enabled=args.ccl_enabled, ) # Compile the model @@ -132,6 +133,9 @@ def main(): if args.continuous_batching: compile_kwargs["full_batch_size"] = args.full_batch_size + if args.ccl_enabled: + compile_kwargs["comp_ctx_lengths_prefill"] = args.comp_ctx_lengths_prefill + compile_kwargs["comp_ctx_lengths_decode"] = args.comp_ctx_lengths_decode qpc_path = model.compile(**compile_kwargs) print(f"Model compiled successfully to: {qpc_path}") diff --git a/examples/performance/compute_context_length/gemma3.py b/examples/performance/compute_context_length/gemma3.py index c31b1748a..830e2ce43 100644 --- a/examples/performance/compute_context_length/gemma3.py +++ b/examples/performance/compute_context_length/gemma3.py @@ -20,22 +20,26 @@ tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) processor = AutoProcessor.from_pretrained(model_id) -# pass HF_TOKEN if gated model -# For running the model in single QPC approach use kv_offload=False. For Dual QPC approach use kv_offload=True ### +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. +## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. + ctx_len = 8192 -comp_ctx_lengths_prefill = [3072] -comp_ctx_lengths_decode = [4096, ctx_len] +comp_ctx_lengths_prefill = [3072] # None # +comp_ctx_lengths_decode = [4096, ctx_len] # None # +# pass HF_TOKEN if gated model +# For running the model in single QPC approach use kv_offload=False. For Dual QPC approach use kv_offload=True ### qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( model_id, config=config, attn_implementation="eager", kv_offload=True, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=True, ) ### use skip_vision=True, if want to run only text, or false ### @@ -54,7 +58,9 @@ aic_enable_depth_first=True, skip_vision=True, mos=1, - node_precision_info="examples/gemma3_example/fp32_nodes_gemma3_27b.yaml", + node_precision_info="examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_4b.yaml", + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) messages = [ @@ -90,7 +96,9 @@ mxint8_kv_cache=False, aic_enable_depth_first=True, mos=1, - node_precision_info="examples/gemma3_example/fp32_nodes_gemma3_27b.yaml", + node_precision_info="examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_4b.yaml", + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) ### IMAGE + TEXT ### diff --git a/examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_27b.yaml b/examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_27b.yaml new file mode 100755 index 000000000..d2a4bf164 --- /dev/null +++ b/examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_27b.yaml @@ -0,0 +1,685 @@ +FP32NodeInstanceNames: + + - /language_model/layers.0/Add_1_output_0 + - /language_model/layers.0/Add_2_output_0 + - /language_model/layers.0/Add_3_output_0 + - /language_model/layers.0/Add_output_0 + - /language_model/layers.1/Add_1_output_0 + - /language_model/layers.1/Add_2_output_0 + - /language_model/layers.1/Add_3_output_0 + - /language_model/layers.1/Add_output_0 + - /language_model/layers.2/Add_1_output_0 + - /language_model/layers.2/Add_2_output_0 + - /language_model/layers.2/Add_3_output_0 + - /language_model/layers.2/Add_output_0 + - /language_model/layers.3/Add_1_output_0 + - /language_model/layers.3/Add_2_output_0 + - /language_model/layers.3/Add_3_output_0 + - /language_model/layers.3/Add_output_0 + - /language_model/layers.4/Add_1_output_0 + - /language_model/layers.4/Add_2_output_0 + - /language_model/layers.4/Add_3_output_0 + - /language_model/layers.4/Add_output_0 + - /language_model/layers.5/Add_1_output_0 + - /language_model/layers.5/Add_2_output_0 + - /language_model/layers.5/Add_3_output_0 + - /language_model/layers.5/Add_output_0 + - /language_model/layers.6/Add_1_output_0 + - /language_model/layers.6/Add_2_output_0 + - /language_model/layers.6/Add_3_output_0 + - /language_model/layers.6/Add_output_0 + - /language_model/layers.7/Add_1_output_0 + - /language_model/layers.7/Add_2_output_0 + - /language_model/layers.7/Add_3_output_0 + - /language_model/layers.7/Add_output_0 + - /language_model/layers.8/Add_1_output_0 + - /language_model/layers.8/Add_2_output_0 + - /language_model/layers.8/Add_3_output_0 + - /language_model/layers.8/Add_output_0 + - /language_model/layers.9/Add_1_output_0 + - /language_model/layers.9/Add_2_output_0 + - /language_model/layers.9/Add_3_output_0 + - /language_model/layers.9/Add_output_0 + - /language_model/layers.10/Add_1_output_0 + - /language_model/layers.10/Add_2_output_0 + - /language_model/layers.10/Add_3_output_0 + - /language_model/layers.10/Add_output_0 + - /language_model/layers.11/Add_1_output_0 + - /language_model/layers.11/Add_2_output_0 + - /language_model/layers.11/Add_3_output_0 + - /language_model/layers.11/Add_output_0 + - /language_model/layers.12/Add_1_output_0 + - /language_model/layers.12/Add_2_output_0 + - /language_model/layers.12/Add_3_output_0 + - /language_model/layers.12/Add_output_0 + - /language_model/layers.13/Add_1_output_0 + - /language_model/layers.13/Add_2_output_0 + - /language_model/layers.13/Add_3_output_0 + - /language_model/layers.13/Add_output_0 + - /language_model/layers.14/Add_1_output_0 + - /language_model/layers.14/Add_2_output_0 + - /language_model/layers.14/Add_3_output_0 + - /language_model/layers.14/Add_output_0 + - /language_model/layers.15/Add_1_output_0 + - /language_model/layers.15/Add_2_output_0 + - /language_model/layers.15/Add_3_output_0 + - /language_model/layers.15/Add_output_0 + - /language_model/layers.16/Add_1_output_0 + - /language_model/layers.16/Add_2_output_0 + - /language_model/layers.16/Add_3_output_0 + - /language_model/layers.16/Add_output_0 + - /language_model/layers.17/Add_1_output_0 + - /language_model/layers.17/Add_2_output_0 + - /language_model/layers.17/Add_3_output_0 + - /language_model/layers.17/Add_output_0 + - /language_model/layers.18/Add_1_output_0 + - /language_model/layers.18/Add_2_output_0 + - /language_model/layers.18/Add_3_output_0 + - /language_model/layers.18/Add_output_0 + - /language_model/layers.19/Add_1_output_0 + - /language_model/layers.19/Add_2_output_0 + - /language_model/layers.19/Add_3_output_0 + - /language_model/layers.19/Add_output_0 + - /language_model/layers.20/Add_1_output_0 + - /language_model/layers.20/Add_2_output_0 + - /language_model/layers.20/Add_3_output_0 + - /language_model/layers.20/Add_output_0 + - /language_model/layers.21/Add_1_output_0 + - /language_model/layers.21/Add_2_output_0 + - /language_model/layers.21/Add_3_output_0 + - /language_model/layers.21/Add_output_0 + - /language_model/layers.22/Add_1_output_0 + - /language_model/layers.22/Add_2_output_0 + - /language_model/layers.22/Add_3_output_0 + - /language_model/layers.22/Add_output_0 + - /language_model/layers.23/Add_1_output_0 + - /language_model/layers.23/Add_2_output_0 + - /language_model/layers.23/Add_output_0 + - /language_model/layers.24/Add_1_output_0 + - /language_model/layers.24/Add_2_output_0 + - /language_model/layers.24/Add_3_output_0 + - /language_model/layers.24/Add_output_0 + - /language_model/layers.25/Add_1_output_0 + - /language_model/layers.25/Add_2_output_0 + - /language_model/layers.25/Add_3_output_0 + - /language_model/layers.25/Add_output_0 + - /language_model/layers.26/Add_1_output_0 + - /language_model/layers.26/Add_2_output_0 + - /language_model/layers.26/Add_3_output_0 + - /language_model/layers.26/Add_output_0 + - /language_model/layers.27/Add_1_output_0 + - /language_model/layers.27/Add_2_output_0 + - /language_model/layers.27/Add_3_output_0 + - /language_model/layers.27/Add_output_0 + - /language_model/layers.28/Add_1_output_0 + - /language_model/layers.28/Add_2_output_0 + - /language_model/layers.28/Add_3_output_0 + - /language_model/layers.28/Add_output_0 + - /language_model/layers.29/Add_1_output_0 + - /language_model/layers.29/Add_2_output_0 + - /language_model/layers.29/Add_3_output_0 + - /language_model/layers.29/Add_output_0 + - /language_model/layers.30/Add_1_output_0 + - /language_model/layers.30/Add_2_output_0 + - /language_model/layers.30/Add_3_output_0 + - /language_model/layers.30/Add_output_0 + - /language_model/layers.31/Add_1_output_0 + - /language_model/layers.31/Add_2_output_0 + - /language_model/layers.31/Add_3_output_0 + - /language_model/layers.31/Add_output_0 + - /language_model/layers.32/Add_1_output_0 + - /language_model/layers.32/Add_2_output_0 + - /language_model/layers.32/Add_3_output_0 + - /language_model/layers.32/Add_output_0 + - /language_model/layers.33/Add_1_output_0 + - /language_model/layers.33/Add_2_output_0 + - /language_model/layers.33/Add_3_output_0 + - /language_model/layers.33/Add_output_0 + - /language_model/layers.34/Add_1_output_0 + - /language_model/layers.34/Add_2_output_0 + - /language_model/layers.34/Add_3_output_0 + - /language_model/layers.34/Add_output_0 + - /language_model/layers.35/Add_1_output_0 + - /language_model/layers.35/Add_2_output_0 + - /language_model/layers.35/Add_3_output_0 + - /language_model/layers.35/Add_output_0 + - /language_model/layers.36/Add_1_output_0 + - /language_model/layers.36/Add_2_output_0 + - /language_model/layers.36/Add_3_output_0 + - /language_model/layers.36/Add_output_0 + - /language_model/layers.37/Add_1_output_0 + - /language_model/layers.37/Add_2_output_0 + - /language_model/layers.37/Add_3_output_0 + - /language_model/layers.37/Add_output_0 + - /language_model/layers.38/Add_1_output_0 + - /language_model/layers.38/Add_2_output_0 + - /language_model/layers.38/Add_3_output_0 + - /language_model/layers.38/Add_output_0 + - /language_model/layers.39/Add_1_output_0 + - /language_model/layers.39/Add_2_output_0 + - /language_model/layers.39/Add_3_output_0 + - /language_model/layers.39/Add_output_0 + - /language_model/layers.40/Add_1_output_0 + - /language_model/layers.40/Add_2_output_0 + - /language_model/layers.40/Add_3_output_0 + - /language_model/layers.40/Add_output_0 + - /language_model/layers.41/Add_1_output_0 + - /language_model/layers.41/Add_2_output_0 + - /language_model/layers.41/Add_3_output_0 + - /language_model/layers.41/Add_output_0 + - /language_model/layers.42/Add_1_output_0 + - /language_model/layers.42/Add_2_output_0 + - /language_model/layers.42/Add_3_output_0 + - /language_model/layers.42/Add_output_0 + - /language_model/layers.43/Add_1_output_0 + - /language_model/layers.43/Add_2_output_0 + - /language_model/layers.43/Add_3_output_0 + - /language_model/layers.43/Add_output_0 + - /language_model/layers.44/Add_1_output_0 + - /language_model/layers.44/Add_2_output_0 + - /language_model/layers.44/Add_3_output_0 + - /language_model/layers.44/Add_output_0 + - /language_model/layers.45/Add_1_output_0 + - /language_model/layers.45/Add_2_output_0 + - /language_model/layers.45/Add_3_output_0 + - /language_model/layers.45/Add_output_0 + - /language_model/layers.46/Add_1_output_0 + - /language_model/layers.46/Add_2_output_0 + - /language_model/layers.46/Add_3_output_0 + - /language_model/layers.46/Add_output_0 + - /language_model/layers.47/Add_1_output_0 + - /language_model/layers.47/Add_2_output_0 + - /language_model/layers.47/Add_3_output_0 + - /language_model/layers.47/Add_output_0 + - /language_model/layers.48/Add_1_output_0 + - /language_model/layers.48/Add_2_output_0 + - /language_model/layers.48/Add_3_output_0 + - /language_model/layers.48/Add_output_0 + - /language_model/layers.49/Add_1_output_0 + - /language_model/layers.49/Add_2_output_0 + - /language_model/layers.49/Add_3_output_0 + - /language_model/layers.49/Add_output_0 + - /language_model/layers.50/Add_1_output_0 + - /language_model/layers.50/Add_2_output_0 + - /language_model/layers.50/Add_3_output_0 + - /language_model/layers.50/Add_output_0 + - /language_model/layers.51/Add_1_output_0 + - /language_model/layers.51/Add_2_output_0 + - /language_model/layers.51/Add_3_output_0 + - /language_model/layers.51/Add_output_0 + - /language_model/layers.52/Add_1_output_0 + - /language_model/layers.52/Add_2_output_0 + - /language_model/layers.52/Add_3_output_0 + - /language_model/layers.52/Add_output_0 + - /language_model/layers.53/Add_1_output_0 + - /language_model/layers.53/Add_2_output_0 + - /language_model/layers.53/Add_3_output_0 + - /language_model/layers.53/Add_output_0 + - /language_model/layers.54/Add_1_output_0 + - /language_model/layers.54/Add_2_output_0 + - /language_model/layers.54/Add_3_output_0 + - /language_model/layers.54/Add_output_0 + - /language_model/layers.55/Add_1_output_0 + - /language_model/layers.55/Add_2_output_0 + - /language_model/layers.55/Add_3_output_0 + - /language_model/layers.55/Add_output_0 + - /language_model/layers.56/Add_1_output_0 + - /language_model/layers.56/Add_2_output_0 + - /language_model/layers.56/Add_3_output_0 + - /language_model/layers.56/Add_output_0 + - /language_model/layers.57/Add_1_output_0 + - /language_model/layers.57/Add_2_output_0 + - /language_model/layers.57/Add_3_output_0 + - /language_model/layers.57/Add_output_0 + - /language_model/layers.58/Add_1_output_0 + - /language_model/layers.58/Add_2_output_0 + - /language_model/layers.58/Add_3_output_0 + - /language_model/layers.58/Add_output_0 + - /language_model/layers.59/Add_1_output_0 + - /language_model/layers.59/Add_2_output_0 + - /language_model/layers.59/Add_3_output_0 + - /language_model/layers.59/Add_output_0 + - /language_model/layers.60/Add_1_output_0 + - /language_model/layers.60/Add_2_output_0 + - /language_model/layers.60/Add_3_output_0 + - /language_model/layers.60/Add_output_0 + - /language_model/layers.61/Add_1_output_0 + - /language_model/layers.61/Add_2_output_0 + - /language_model/layers.61/Add_3_output_0 + - /language_model/layers.61/Add_output_0 + - /language_model/norm/Add_output_0 + - /language_model/layers.0/self_attn/Mul_output_0 + - /language_model/layers.2/self_attn/Mul_output_0 + - /language_model/layers.3/self_attn/Mul_output_0 + - /language_model/layers.4/self_attn/Mul_output_0 + - /language_model/layers.5/self_attn/Mul_output_0 + - /language_model/layers.6/self_attn/Mul_output_0 + - /language_model/layers.7/self_attn/Mul_output_0 + - /language_model/layers.8/self_attn/Mul_output_0 + - /language_model/layers.9/self_attn/Mul_output_0 + - /language_model/layers.10/self_attn/Mul_output_0 + - /language_model/layers.11/self_attn/Mul_output_0 + - /language_model/layers.12/self_attn/Mul_output_0 + - /language_model/layers.13/self_attn/Mul_output_0 + - /language_model/layers.14/self_attn/Mul_output_0 + - /language_model/layers.15/self_attn/Mul_output_0 + - /language_model/layers.16/self_attn/Mul_output_0 + - /language_model/layers.17/self_attn/Mul_output_0 + - /language_model/layers.18/self_attn/Mul_output_0 + - /language_model/layers.19/self_attn/Mul_output_0 + - /language_model/layers.20/self_attn/Mul_output_0 + - /language_model/layers.21/self_attn/Mul_output_0 + - /language_model/layers.22/self_attn/Mul_output_0 + - /language_model/layers.23/self_attn/Mul_output_0 + - /language_model/layers.24/self_attn/Mul_output_0 + - /language_model/layers.25/self_attn/Mul_output_0 + - /language_model/layers.26/self_attn/Mul_output_0 + - /language_model/layers.27/self_attn/Mul_output_0 + - /language_model/layers.28/self_attn/Mul_output_0 + - /language_model/layers.29/self_attn/Mul_output_0 + - /language_model/layers.30/self_attn/Mul_output_0 + - /language_model/layers.31/self_attn/Mul_output_0 + - /language_model/layers.32/self_attn/Mul_output_0 + - /language_model/layers.33/self_attn/Mul_output_0 + - /language_model/layers.34/self_attn/Mul_output_0 + - /language_model/layers.35/self_attn/Mul_output_0 + - /language_model/layers.36/self_attn/Mul_output_0 + - /language_model/layers.37/self_attn/Mul_output_0 + - /language_model/layers.38/self_attn/Mul_output_0 + - /language_model/layers.39/self_attn/Mul_output_0 + - /language_model/layers.40/self_attn/Mul_output_0 + - /language_model/layers.41/self_attn/Mul_output_0 + - /language_model/layers.42/self_attn/Mul_output_0 + - /language_model/layers.43/self_attn/Mul_output_0 + - /language_model/layers.44/self_attn/Mul_output_0 + - /language_model/layers.45/self_attn/Mul_output_0 + - /language_model/layers.46/self_attn/Mul_output_0 + - /language_model/layers.47/self_attn/Mul_output_0 + - /language_model/layers.48/self_attn/Mul_output_0 + - /language_model/layers.49/self_attn/Mul_output_0 + - /language_model/layers.50/self_attn/Mul_output_0 + - /language_model/layers.51/self_attn/Mul_output_0 + - /language_model/layers.52/self_attn/Mul_output_0 + - /language_model/layers.53/self_attn/Mul_output_0 + - /language_model/layers.54/self_attn/Mul_output_0 + - /language_model/layers.55/self_attn/Mul_output_0 + - /language_model/layers.56/self_attn/Mul_output_0 + - /language_model/layers.57/self_attn/Mul_output_0 + - /language_model/layers.58/self_attn/Mul_output_0 + - /language_model/layers.59/self_attn/Mul_output_0 + - /language_model/layers.60/self_attn/Mul_output_0 + - /language_model/layers.61/self_attn/Mul_output_0 + - /language_model/layers.0/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.0/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.1/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.1/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.2/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.2/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.3/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.3/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.4/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.4/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.5/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.5/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.6/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.6/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.7/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.7/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.8/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.8/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.9/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.9/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.10/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.10/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.11/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.11/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.12/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.12/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.13/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.13/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.14/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.14/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.15/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.15/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.16/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.16/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.17/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.17/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.18/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.18/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.19/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.19/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.20/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.20/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.21/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.21/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.22/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.22/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.23/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.23/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.24/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.24/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.25/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.25/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.26/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.26/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.27/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.27/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.28/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.28/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.29/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.29/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.30/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.30/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.31/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.31/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.32/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.32/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.33/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.33/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.34/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.34/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.34/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.34/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.34/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.34/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.35/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.35/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.35/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.35/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.35/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.35/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.36/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.36/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.36/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.36/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.36/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.36/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.37/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.37/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.37/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.37/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.37/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.37/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.38/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.38/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.38/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.38/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.38/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.38/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.39/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.39/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.39/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.39/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.39/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.39/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.40/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.40/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.40/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.40/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.40/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.40/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.41/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.41/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.41/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.41/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.41/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.41/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.42/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.42/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.42/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.42/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.42/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.42/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.43/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.43/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.43/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.43/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.43/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.43/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.44/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.44/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.44/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.44/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.44/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.44/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.45/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.45/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.45/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.45/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.45/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.45/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.46/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.46/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.46/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.46/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.46/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.46/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.47/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.47/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.47/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.47/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.47/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.47/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.48/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.48/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.48/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.48/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.48/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.48/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.49/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.49/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.49/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.49/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.49/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.49/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.50/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.50/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.50/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.50/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.50/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.50/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.51/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.51/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.51/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.51/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.51/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.51/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.52/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.52/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.52/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.52/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.52/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.52/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.53/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.53/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.53/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.53/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.53/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.53/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.54/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.54/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.54/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.54/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.54/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.54/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.55/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.55/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.55/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.55/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.55/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.55/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.56/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.56/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.56/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.56/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.56/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.56/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.57/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.57/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.57/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.57/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.57/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.57/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.58/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.58/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.58/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.58/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.58/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.58/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.59/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.59/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.59/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.59/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.59/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.59/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.60/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.60/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.60/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.60/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.60/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.60/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.61/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.61/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.61/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.61/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.61/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.61/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/norm/CustomRMSNorm_output_0 + diff --git a/examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_4b.yaml b/examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_4b.yaml new file mode 100755 index 000000000..1c8aa1c41 --- /dev/null +++ b/examples/performance/compute_context_length/gemma3/fp32_nodes_gemma3_4b.yaml @@ -0,0 +1,698 @@ +FP32NodeInstanceNames: + + - /language_model/layers.0/Add_output_0 + - /language_model/layers.0/Add_1_output_0 + - /language_model/layers.0/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.0/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.0/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/Add_2_output_0 + - /language_model/layers.0/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.0/Add_3_output_0 + - /language_model/layers.1/Add_output_0 + - /language_model/layers.1/Add_1_output_0 + - /language_model/layers.1/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.1/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.1/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/Add_2_output_0 + - /language_model/layers.1/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.1/Add_3_output_0 + - /language_model/layers.2/Add_output_0 + - /language_model/layers.2/Add_1_output_0 + - /language_model/layers.2/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.2/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.2/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/Add_2_output_0 + - /language_model/layers.2/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.2/Add_3_output_0 + - /language_model/layers.3/Add_output_0 + - /language_model/layers.3/Add_1_output_0 + - /language_model/layers.3/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.3/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.3/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/Add_2_output_0 + - /language_model/layers.3/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.3/Add_3_output_0 + - /language_model/layers.4/Add_output_0 + - /language_model/layers.4/Add_1_output_0 + - /language_model/layers.4/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.4/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.4/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/Add_2_output_0 + - /language_model/layers.4/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.4/Add_3_output_0 + - /language_model/layers.5/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.5/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.5/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/Add_output_0 + - /language_model/layers.5/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.5/Add_1_output_0 + - /language_model/layers.6/Add_output_0 + - /language_model/layers.6/Add_1_output_0 + - /language_model/layers.6/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.6/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.6/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/Add_2_output_0 + - /language_model/layers.6/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.6/Add_3_output_0 + - /language_model/layers.7/Add_output_0 + - /language_model/layers.7/Add_1_output_0 + - /language_model/layers.7/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.7/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.7/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/Add_2_output_0 + - /language_model/layers.7/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.7/Add_3_output_0 + - /language_model/layers.8/Add_output_0 + - /language_model/layers.8/Add_1_output_0 + - /language_model/layers.8/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.8/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.8/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/Add_2_output_0 + - /language_model/layers.8/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.8/Add_3_output_0 + - /language_model/layers.9/Add_output_0 + - /language_model/layers.9/Add_1_output_0 + - /language_model/layers.9/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.9/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.9/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/Add_2_output_0 + - /language_model/layers.9/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.9/Add_3_output_0 + - /language_model/layers.10/Add_output_0 + - /language_model/layers.10/Add_1_output_0 + - /language_model/layers.10/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.10/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.10/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/Add_2_output_0 + - /language_model/layers.10/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.10/Add_3_output_0 + - /language_model/layers.11/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.11/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.11/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/Add_output_0 + - /language_model/layers.11/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.11/Add_1_output_0 + - /language_model/layers.12/Add_output_0 + - /language_model/layers.12/Add_1_output_0 + - /language_model/layers.12/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.12/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.12/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/Add_2_output_0 + - /language_model/layers.12/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.12/Add_3_output_0 + - /language_model/layers.13/Add_output_0 + - /language_model/layers.13/Add_1_output_0 + - /language_model/layers.13/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.13/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.13/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/Add_2_output_0 + - /language_model/layers.13/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.13/Add_3_output_0 + - /language_model/layers.14/Add_output_0 + - /language_model/layers.14/Add_1_output_0 + - /language_model/layers.14/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.14/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.14/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/Add_2_output_0 + - /language_model/layers.14/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.14/Add_3_output_0 + - /language_model/layers.15/Add_output_0 + - /language_model/layers.15/Add_1_output_0 + - /language_model/layers.15/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.15/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.15/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/Add_2_output_0 + - /language_model/layers.15/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.15/Add_3_output_0 + - /language_model/layers.16/Add_output_0 + - /language_model/layers.16/Add_1_output_0 + - /language_model/layers.16/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.16/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.16/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/Add_2_output_0 + - /language_model/layers.16/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.16/Add_3_output_0 + - /language_model/layers.17/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.17/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.17/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/Add_output_0 + - /language_model/layers.17/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.17/Add_1_output_0 + - /language_model/layers.18/Add_output_0 + - /language_model/layers.18/Add_1_output_0 + - /language_model/layers.18/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.18/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.18/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/Add_2_output_0 + - /language_model/layers.18/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.18/Add_3_output_0 + - /language_model/layers.19/Add_output_0 + - /language_model/layers.19/Add_1_output_0 + - /language_model/layers.19/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.19/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.19/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/Add_2_output_0 + - /language_model/layers.19/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.19/Add_3_output_0 + - /language_model/layers.20/Add_output_0 + - /language_model/layers.20/Add_1_output_0 + - /language_model/layers.20/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.20/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.20/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/Add_2_output_0 + - /language_model/layers.20/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.20/Add_3_output_0 + - /language_model/layers.21/Add_output_0 + - /language_model/layers.21/Add_1_output_0 + - /language_model/layers.21/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.21/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.21/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/Add_2_output_0 + - /language_model/layers.21/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.21/Add_3_output_0 + - /language_model/layers.22/Add_output_0 + - /language_model/layers.22/Add_1_output_0 + - /language_model/layers.22/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.22/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.22/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/Add_2_output_0 + - /language_model/layers.22/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.22/Add_3_output_0 + - /language_model/layers.23/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.23/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.23/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/Add_output_0 + - /language_model/layers.23/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.23/Add_1_output_0 + - /language_model/layers.24/Add_output_0 + - /language_model/layers.24/Add_1_output_0 + - /language_model/layers.24/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.24/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.24/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/Add_2_output_0 + - /language_model/layers.24/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.24/Add_3_output_0 + - /language_model/layers.25/Add_output_0 + - /language_model/layers.25/Add_1_output_0 + - /language_model/layers.25/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.25/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.25/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/Add_2_output_0 + - /language_model/layers.25/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.25/Add_3_output_0 + - /language_model/layers.26/Add_output_0 + - /language_model/layers.26/Add_1_output_0 + - /language_model/layers.26/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.26/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.26/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/Add_2_output_0 + - /language_model/layers.26/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.26/Add_3_output_0 + - /language_model/layers.27/Add_output_0 + - /language_model/layers.27/Add_1_output_0 + - /language_model/layers.27/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.27/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.27/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/Add_2_output_0 + - /language_model/layers.27/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.27/Add_3_output_0 + - /language_model/layers.28/Add_output_0 + - /language_model/layers.28/Add_1_output_0 + - /language_model/layers.28/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.28/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.28/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/Add_2_output_0 + - /language_model/layers.28/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.28/Add_3_output_0 + - /language_model/layers.29/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.29/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.29/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/Add_output_0 + - /language_model/layers.29/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.29/Add_1_output_0 + - /language_model/layers.30/Add_output_0 + - /language_model/layers.30/Add_1_output_0 + - /language_model/layers.30/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.30/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.30/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/Add_2_output_0 + - /language_model/layers.30/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.30/Add_3_output_0 + - /language_model/layers.31/Add_output_0 + - /language_model/layers.31/Add_1_output_0 + - /language_model/layers.31/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.31/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.31/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/Add_2_output_0 + - /language_model/layers.31/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.31/Add_3_output_0 + - /language_model/layers.32/Add_output_0 + - /language_model/layers.32/Add_1_output_0 + - /language_model/layers.32/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.32/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.32/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/Add_2_output_0 + - /language_model/layers.32/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.32/Add_3_output_0 + - /language_model/layers.33/Add_output_0 + - /language_model/layers.33/Add_1_output_0 + - /language_model/layers.33/input_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/self_attn/q_norm/CustomRMSNorm_output_0 + - /language_model/layers.33/self_attn/k_norm/CustomRMSNorm_output_0 + - /language_model/layers.33/post_attention_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/Add_2_output_0 + - /language_model/layers.33/pre_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/post_feedforward_layernorm/CustomRMSNorm_output_0 + - /language_model/layers.33/Add_3_output_0 + - /language_model/norm/CustomRMSNorm_output_0 + - /language_model/layers.0/self_attn/Mul_output_0 + - /language_model/layers.0/self_attn/Mul_1_output_0 + - /language_model/layers.0/self_attn/Mul_2_output_0 + - /language_model/layers.0/self_attn/Mul_3_output_0 + - /language_model/layers.0/self_attn/Mul_4_output_0 + - /language_model/layers.0/self_attn/Mul_5_output_0 + - /language_model/layers.0/self_attn/Mul_6_output_0 + - /language_model/layers.0/self_attn/Mul_7_output_0 + - /language_model/layers.0/self_attn/Mul_8_output_0 + - /language_model/layers.1/self_attn/Mul_9_output_0 + - /language_model/layers.2/self_attn/Mul_output_0 + - /language_model/layers.2/self_attn/Mul_1_output_0 + - /language_model/layers.2/self_attn/Mul_2_output_0 + - /language_model/layers.2/self_attn/Mul_3_output_0 + - /language_model/layers.2/self_attn/Mul_4_output_0 + - /language_model/layers.2/self_attn/Mul_5_output_0 + - /language_model/layers.2/self_attn/Mul_6_output_0 + - /language_model/layers.2/self_attn/Mul_7_output_0 + - /language_model/layers.2/self_attn/Mul_8_output_0 + - /language_model/layers.2/self_attn/Mul_9_output_0 + - /language_model/layers.3/self_attn/Mul_output_0 + - /language_model/layers.3/self_attn/Mul_1_output_0 + - /language_model/layers.3/self_attn/Mul_2_output_0 + - /language_model/layers.3/self_attn/Mul_3_output_0 + - /language_model/layers.3/self_attn/Mul_4_output_0 + - /language_model/layers.3/self_attn/Mul_5_output_0 + - /language_model/layers.3/self_attn/Mul_6_output_0 + - /language_model/layers.3/self_attn/Mul_7_output_0 + - /language_model/layers.3/self_attn/Mul_8_output_0 + - /language_model/layers.3/self_attn/Mul_9_output_0 + - /language_model/layers.4/self_attn/Mul_output_0 + - /language_model/layers.4/self_attn/Mul_1_output_0 + - /language_model/layers.4/self_attn/Mul_2_output_0 + - /language_model/layers.4/self_attn/Mul_3_output_0 + - /language_model/layers.4/self_attn/Mul_4_output_0 + - /language_model/layers.4/self_attn/Mul_5_output_0 + - /language_model/layers.4/self_attn/Mul_6_output_0 + - /language_model/layers.4/self_attn/Mul_7_output_0 + - /language_model/layers.4/self_attn/Mul_8_output_0 + - /language_model/layers.4/self_attn/Mul_9_output_0 + - /language_model/layers.5/self_attn/Mul_output_0 + - /language_model/layers.5/self_attn/Mul_1_output_0 + - /language_model/layers.5/self_attn/Mul_2_output_0 + - /language_model/layers.5/self_attn/Mul_3_output_0 + - /language_model/layers.5/self_attn/Mul_4_output_0 + - /language_model/layers.5/self_attn/Mul_5_output_0 + - /language_model/layers.5/self_attn/Mul_6_output_0 + - /language_model/layers.5/self_attn/Mul_7_output_0 + - /language_model/layers.5/self_attn/Mul_8_output_0 + - /language_model/layers.5/self_attn/Mul_9_output_0 + - /language_model/layers.6/self_attn/Mul_output_0 + - /language_model/layers.6/self_attn/Mul_1_output_0 + - /language_model/layers.6/self_attn/Mul_2_output_0 + - /language_model/layers.6/self_attn/Mul_3_output_0 + - /language_model/layers.6/self_attn/Mul_4_output_0 + - /language_model/layers.6/self_attn/Mul_5_output_0 + - /language_model/layers.6/self_attn/Mul_6_output_0 + - /language_model/layers.6/self_attn/Mul_7_output_0 + - /language_model/layers.6/self_attn/Mul_8_output_0 + - /language_model/layers.6/self_attn/Mul_9_output_0 + - /language_model/layers.7/self_attn/Mul_output_0 + - /language_model/layers.7/self_attn/Mul_1_output_0 + - /language_model/layers.7/self_attn/Mul_2_output_0 + - /language_model/layers.7/self_attn/Mul_3_output_0 + - /language_model/layers.7/self_attn/Mul_4_output_0 + - /language_model/layers.7/self_attn/Mul_5_output_0 + - /language_model/layers.7/self_attn/Mul_6_output_0 + - /language_model/layers.7/self_attn/Mul_7_output_0 + - /language_model/layers.7/self_attn/Mul_8_output_0 + - /language_model/layers.7/self_attn/Mul_9_output_0 + - /language_model/layers.8/self_attn/Mul_output_0 + - /language_model/layers.8/self_attn/Mul_1_output_0 + - /language_model/layers.8/self_attn/Mul_2_output_0 + - /language_model/layers.8/self_attn/Mul_3_output_0 + - /language_model/layers.8/self_attn/Mul_4_output_0 + - /language_model/layers.8/self_attn/Mul_5_output_0 + - /language_model/layers.8/self_attn/Mul_6_output_0 + - /language_model/layers.8/self_attn/Mul_7_output_0 + - /language_model/layers.8/self_attn/Mul_8_output_0 + - /language_model/layers.8/self_attn/Mul_9_output_0 + - /language_model/layers.9/self_attn/Mul_output_0 + - /language_model/layers.9/self_attn/Mul_1_output_0 + - /language_model/layers.9/self_attn/Mul_2_output_0 + - /language_model/layers.9/self_attn/Mul_3_output_0 + - /language_model/layers.9/self_attn/Mul_4_output_0 + - /language_model/layers.9/self_attn/Mul_5_output_0 + - /language_model/layers.9/self_attn/Mul_6_output_0 + - /language_model/layers.9/self_attn/Mul_7_output_0 + - /language_model/layers.9/self_attn/Mul_8_output_0 + - /language_model/layers.9/self_attn/Mul_9_output_0 + - /language_model/layers.10/self_attn/Mul_output_0 + - /language_model/layers.10/self_attn/Mul_1_output_0 + - /language_model/layers.10/self_attn/Mul_2_output_0 + - /language_model/layers.10/self_attn/Mul_3_output_0 + - /language_model/layers.10/self_attn/Mul_4_output_0 + - /language_model/layers.10/self_attn/Mul_5_output_0 + - /language_model/layers.10/self_attn/Mul_6_output_0 + - /language_model/layers.10/self_attn/Mul_7_output_0 + - /language_model/layers.10/self_attn/Mul_8_output_0 + - /language_model/layers.10/self_attn/Mul_9_output_0 + - /language_model/layers.11/self_attn/Mul_output_0 + - /language_model/layers.11/self_attn/Mul_1_output_0 + - /language_model/layers.11/self_attn/Mul_2_output_0 + - /language_model/layers.11/self_attn/Mul_3_output_0 + - /language_model/layers.11/self_attn/Mul_4_output_0 + - /language_model/layers.11/self_attn/Mul_5_output_0 + - /language_model/layers.11/self_attn/Mul_6_output_0 + - /language_model/layers.11/self_attn/Mul_7_output_0 + - /language_model/layers.11/self_attn/Mul_8_output_0 + - /language_model/layers.11/self_attn/Mul_9_output_0 + - /language_model/layers.12/self_attn/Mul_output_0 + - /language_model/layers.12/self_attn/Mul_1_output_0 + - /language_model/layers.12/self_attn/Mul_2_output_0 + - /language_model/layers.12/self_attn/Mul_3_output_0 + - /language_model/layers.12/self_attn/Mul_4_output_0 + - /language_model/layers.12/self_attn/Mul_5_output_0 + - /language_model/layers.12/self_attn/Mul_6_output_0 + - /language_model/layers.12/self_attn/Mul_7_output_0 + - /language_model/layers.12/self_attn/Mul_8_output_0 + - /language_model/layers.12/self_attn/Mul_9_output_0 + - /language_model/layers.13/self_attn/Mul_output_0 + - /language_model/layers.13/self_attn/Mul_1_output_0 + - /language_model/layers.13/self_attn/Mul_2_output_0 + - /language_model/layers.13/self_attn/Mul_3_output_0 + - /language_model/layers.13/self_attn/Mul_4_output_0 + - /language_model/layers.13/self_attn/Mul_5_output_0 + - /language_model/layers.13/self_attn/Mul_6_output_0 + - /language_model/layers.13/self_attn/Mul_7_output_0 + - /language_model/layers.13/self_attn/Mul_8_output_0 + - /language_model/layers.13/self_attn/Mul_9_output_0 + - /language_model/layers.14/self_attn/Mul_output_0 + - /language_model/layers.14/self_attn/Mul_1_output_0 + - /language_model/layers.14/self_attn/Mul_2_output_0 + - /language_model/layers.14/self_attn/Mul_3_output_0 + - /language_model/layers.14/self_attn/Mul_4_output_0 + - /language_model/layers.14/self_attn/Mul_5_output_0 + - /language_model/layers.14/self_attn/Mul_6_output_0 + - /language_model/layers.14/self_attn/Mul_7_output_0 + - /language_model/layers.14/self_attn/Mul_8_output_0 + - /language_model/layers.14/self_attn/Mul_9_output_0 + - /language_model/layers.15/self_attn/Mul_output_0 + - /language_model/layers.15/self_attn/Mul_1_output_0 + - /language_model/layers.15/self_attn/Mul_2_output_0 + - /language_model/layers.15/self_attn/Mul_3_output_0 + - /language_model/layers.15/self_attn/Mul_4_output_0 + - /language_model/layers.15/self_attn/Mul_5_output_0 + - /language_model/layers.15/self_attn/Mul_6_output_0 + - /language_model/layers.15/self_attn/Mul_7_output_0 + - /language_model/layers.15/self_attn/Mul_8_output_0 + - /language_model/layers.15/self_attn/Mul_9_output_0 + - /language_model/layers.16/self_attn/Mul_output_0 + - /language_model/layers.16/self_attn/Mul_1_output_0 + - /language_model/layers.16/self_attn/Mul_2_output_0 + - /language_model/layers.16/self_attn/Mul_3_output_0 + - /language_model/layers.16/self_attn/Mul_4_output_0 + - /language_model/layers.16/self_attn/Mul_5_output_0 + - /language_model/layers.16/self_attn/Mul_6_output_0 + - /language_model/layers.16/self_attn/Mul_7_output_0 + - /language_model/layers.16/self_attn/Mul_8_output_0 + - /language_model/layers.16/self_attn/Mul_9_output_0 + - /language_model/layers.17/self_attn/Mul_output_0 + - /language_model/layers.17/self_attn/Mul_1_output_0 + - /language_model/layers.17/self_attn/Mul_2_output_0 + - /language_model/layers.17/self_attn/Mul_3_output_0 + - /language_model/layers.17/self_attn/Mul_4_output_0 + - /language_model/layers.17/self_attn/Mul_5_output_0 + - /language_model/layers.17/self_attn/Mul_6_output_0 + - /language_model/layers.17/self_attn/Mul_7_output_0 + - /language_model/layers.17/self_attn/Mul_8_output_0 + - /language_model/layers.17/self_attn/Mul_9_output_0 + - /language_model/layers.18/self_attn/Mul_output_0 + - /language_model/layers.18/self_attn/Mul_1_output_0 + - /language_model/layers.18/self_attn/Mul_2_output_0 + - /language_model/layers.18/self_attn/Mul_3_output_0 + - /language_model/layers.18/self_attn/Mul_4_output_0 + - /language_model/layers.18/self_attn/Mul_5_output_0 + - /language_model/layers.18/self_attn/Mul_6_output_0 + - /language_model/layers.18/self_attn/Mul_7_output_0 + - /language_model/layers.18/self_attn/Mul_8_output_0 + - /language_model/layers.18/self_attn/Mul_9_output_0 + - /language_model/layers.19/self_attn/Mul_output_0 + - /language_model/layers.19/self_attn/Mul_1_output_0 + - /language_model/layers.19/self_attn/Mul_2_output_0 + - /language_model/layers.19/self_attn/Mul_3_output_0 + - /language_model/layers.19/self_attn/Mul_4_output_0 + - /language_model/layers.19/self_attn/Mul_5_output_0 + - /language_model/layers.19/self_attn/Mul_6_output_0 + - /language_model/layers.19/self_attn/Mul_7_output_0 + - /language_model/layers.19/self_attn/Mul_8_output_0 + - /language_model/layers.19/self_attn/Mul_9_output_0 + - /language_model/layers.20/self_attn/Mul_output_0 + - /language_model/layers.20/self_attn/Mul_1_output_0 + - /language_model/layers.20/self_attn/Mul_2_output_0 + - /language_model/layers.20/self_attn/Mul_3_output_0 + - /language_model/layers.20/self_attn/Mul_4_output_0 + - /language_model/layers.20/self_attn/Mul_5_output_0 + - /language_model/layers.20/self_attn/Mul_6_output_0 + - /language_model/layers.20/self_attn/Mul_7_output_0 + - /language_model/layers.20/self_attn/Mul_8_output_0 + - /language_model/layers.20/self_attn/Mul_9_output_0 + - /language_model/layers.21/self_attn/Mul_output_0 + - /language_model/layers.21/self_attn/Mul_1_output_0 + - /language_model/layers.21/self_attn/Mul_2_output_0 + - /language_model/layers.21/self_attn/Mul_3_output_0 + - /language_model/layers.21/self_attn/Mul_4_output_0 + - /language_model/layers.21/self_attn/Mul_5_output_0 + - /language_model/layers.21/self_attn/Mul_6_output_0 + - /language_model/layers.21/self_attn/Mul_7_output_0 + - /language_model/layers.21/self_attn/Mul_8_output_0 + - /language_model/layers.21/self_attn/Mul_9_output_0 + - /language_model/layers.22/self_attn/Mul_output_0 + - /language_model/layers.22/self_attn/Mul_1_output_0 + - /language_model/layers.22/self_attn/Mul_2_output_0 + - /language_model/layers.22/self_attn/Mul_3_output_0 + - /language_model/layers.22/self_attn/Mul_4_output_0 + - /language_model/layers.22/self_attn/Mul_5_output_0 + - /language_model/layers.22/self_attn/Mul_6_output_0 + - /language_model/layers.22/self_attn/Mul_7_output_0 + - /language_model/layers.22/self_attn/Mul_8_output_0 + - /language_model/layers.22/self_attn/Mul_9_output_0 + - /language_model/layers.23/self_attn/Mul_output_0 + - /language_model/layers.23/self_attn/Mul_1_output_0 + - /language_model/layers.23/self_attn/Mul_2_output_0 + - /language_model/layers.23/self_attn/Mul_3_output_0 + - /language_model/layers.23/self_attn/Mul_4_output_0 + - /language_model/layers.23/self_attn/Mul_5_output_0 + - /language_model/layers.23/self_attn/Mul_6_output_0 + - /language_model/layers.23/self_attn/Mul_7_output_0 + - /language_model/layers.23/self_attn/Mul_8_output_0 + - /language_model/layers.23/self_attn/Mul_9_output_0 + - /language_model/layers.24/self_attn/Mul_output_0 + - /language_model/layers.24/self_attn/Mul_1_output_0 + - /language_model/layers.24/self_attn/Mul_2_output_0 + - /language_model/layers.24/self_attn/Mul_3_output_0 + - /language_model/layers.24/self_attn/Mul_4_output_0 + - /language_model/layers.24/self_attn/Mul_5_output_0 + - /language_model/layers.24/self_attn/Mul_6_output_0 + - /language_model/layers.24/self_attn/Mul_7_output_0 + - /language_model/layers.24/self_attn/Mul_8_output_0 + - /language_model/layers.24/self_attn/Mul_9_output_0 + - /language_model/layers.25/self_attn/Mul_output_0 + - /language_model/layers.25/self_attn/Mul_1_output_0 + - /language_model/layers.25/self_attn/Mul_2_output_0 + - /language_model/layers.25/self_attn/Mul_3_output_0 + - /language_model/layers.25/self_attn/Mul_4_output_0 + - /language_model/layers.25/self_attn/Mul_5_output_0 + - /language_model/layers.25/self_attn/Mul_6_output_0 + - /language_model/layers.25/self_attn/Mul_7_output_0 + - /language_model/layers.25/self_attn/Mul_8_output_0 + - /language_model/layers.25/self_attn/Mul_9_output_0 + - /language_model/layers.26/self_attn/Mul_output_0 + - /language_model/layers.26/self_attn/Mul_1_output_0 + - /language_model/layers.26/self_attn/Mul_2_output_0 + - /language_model/layers.26/self_attn/Mul_3_output_0 + - /language_model/layers.26/self_attn/Mul_4_output_0 + - /language_model/layers.26/self_attn/Mul_5_output_0 + - /language_model/layers.26/self_attn/Mul_6_output_0 + - /language_model/layers.26/self_attn/Mul_7_output_0 + - /language_model/layers.26/self_attn/Mul_8_output_0 + - /language_model/layers.26/self_attn/Mul_9_output_0 + - /language_model/layers.27/self_attn/Mul_output_0 + - /language_model/layers.27/self_attn/Mul_1_output_0 + - /language_model/layers.27/self_attn/Mul_2_output_0 + - /language_model/layers.27/self_attn/Mul_3_output_0 + - /language_model/layers.27/self_attn/Mul_4_output_0 + - /language_model/layers.27/self_attn/Mul_5_output_0 + - /language_model/layers.27/self_attn/Mul_6_output_0 + - /language_model/layers.27/self_attn/Mul_7_output_0 + - /language_model/layers.27/self_attn/Mul_8_output_0 + - /language_model/layers.27/self_attn/Mul_9_output_0 + - /language_model/layers.28/self_attn/Mul_output_0 + - /language_model/layers.28/self_attn/Mul_1_output_0 + - /language_model/layers.28/self_attn/Mul_2_output_0 + - /language_model/layers.28/self_attn/Mul_3_output_0 + - /language_model/layers.28/self_attn/Mul_4_output_0 + - /language_model/layers.28/self_attn/Mul_5_output_0 + - /language_model/layers.28/self_attn/Mul_6_output_0 + - /language_model/layers.28/self_attn/Mul_7_output_0 + - /language_model/layers.28/self_attn/Mul_8_output_0 + - /language_model/layers.28/self_attn/Mul_9_output_0 + - /language_model/layers.29/self_attn/Mul_output_0 + - /language_model/layers.29/self_attn/Mul_1_output_0 + - /language_model/layers.29/self_attn/Mul_2_output_0 + - /language_model/layers.29/self_attn/Mul_3_output_0 + - /language_model/layers.29/self_attn/Mul_4_output_0 + - /language_model/layers.29/self_attn/Mul_5_output_0 + - /language_model/layers.29/self_attn/Mul_6_output_0 + - /language_model/layers.29/self_attn/Mul_7_output_0 + - /language_model/layers.29/self_attn/Mul_8_output_0 + - /language_model/layers.29/self_attn/Mul_9_output_0 + - /language_model/layers.30/self_attn/Mul_output_0 + - /language_model/layers.30/self_attn/Mul_1_output_0 + - /language_model/layers.30/self_attn/Mul_2_output_0 + - /language_model/layers.30/self_attn/Mul_3_output_0 + - /language_model/layers.30/self_attn/Mul_4_output_0 + - /language_model/layers.30/self_attn/Mul_5_output_0 + - /language_model/layers.30/self_attn/Mul_6_output_0 + - /language_model/layers.30/self_attn/Mul_7_output_0 + - /language_model/layers.30/self_attn/Mul_8_output_0 + - /language_model/layers.30/self_attn/Mul_9_output_0 + - /language_model/layers.31/self_attn/Mul_output_0 + - /language_model/layers.31/self_attn/Mul_1_output_0 + - /language_model/layers.31/self_attn/Mul_2_output_0 + - /language_model/layers.31/self_attn/Mul_3_output_0 + - /language_model/layers.31/self_attn/Mul_4_output_0 + - /language_model/layers.31/self_attn/Mul_5_output_0 + - /language_model/layers.31/self_attn/Mul_6_output_0 + - /language_model/layers.31/self_attn/Mul_7_output_0 + - /language_model/layers.31/self_attn/Mul_8_output_0 + - /language_model/layers.31/self_attn/Mul_9_output_0 + - /language_model/layers.32/self_attn/Mul_output_0 + - /language_model/layers.32/self_attn/Mul_1_output_0 + - /language_model/layers.32/self_attn/Mul_2_output_0 + - /language_model/layers.32/self_attn/Mul_3_output_0 + - /language_model/layers.32/self_attn/Mul_4_output_0 + - /language_model/layers.32/self_attn/Mul_5_output_0 + - /language_model/layers.32/self_attn/Mul_6_output_0 + - /language_model/layers.32/self_attn/Mul_7_output_0 + - /language_model/layers.32/self_attn/Mul_8_output_0 + - /language_model/layers.32/self_attn/Mul_9_output_0 + - /language_model/layers.33/self_attn/Mul_output_0 + - /language_model/layers.33/self_attn/Mul_1_output_0 + - /language_model/layers.33/self_attn/Mul_2_output_0 + - /language_model/layers.33/self_attn/Mul_3_output_0 + - /language_model/layers.33/self_attn/Mul_4_output_0 + - /language_model/layers.33/self_attn/Mul_5_output_0 + - /language_model/layers.33/self_attn/Mul_6_output_0 + - /language_model/layers.33/self_attn/Mul_7_output_0 + - /language_model/layers.33/self_attn/Mul_8_output_0 + - /language_model/layers.33/self_attn/Mul_9_output_0 + - /language_model/layers.0/self_attn/Softmax_output_0 + - /language_model/layers.1/self_attn/Softmax_output_0 + - /language_model/layers.2/self_attn/Softmax_output_0 + - /language_model/layers.3/self_attn/Softmax_output_0 + - /language_model/layers.4/self_attn/Softmax_output_0 + - /language_model/layers.5/self_attn/Softmax_output_0 + - /language_model/layers.6/self_attn/Softmax_output_0 + - /language_model/layers.7/self_attn/Softmax_output_0 + - /language_model/layers.8/self_attn/Softmax_output_0 + - /language_model/layers.9/self_attn/Softmax_output_0 + - /language_model/layers.10/self_attn/Softmax_output_0 + - /language_model/layers.11/self_attn/Softmax_output_0 + - /language_model/layers.12/self_attn/Softmax_output_0 + - /language_model/layers.13/self_attn/Softmax_output_0 + - /language_model/layers.14/self_attn/Softmax_output_0 + - /language_model/layers.15/self_attn/Softmax_output_0 + - /language_model/layers.16/self_attn/Softmax_output_0 + - /language_model/layers.17/self_attn/Softmax_output_0 + - /language_model/layers.18/self_attn/Softmax_output_0 + - /language_model/layers.19/self_attn/Softmax_output_0 + - /language_model/layers.20/self_attn/Softmax_output_0 + - /language_model/layers.21/self_attn/Softmax_output_0 + - /language_model/layers.22/self_attn/Softmax_output_0 + - /language_model/layers.23/self_attn/Softmax_output_0 + - /language_model/layers.24/self_attn/Softmax_output_0 + - /language_model/layers.25/self_attn/Softmax_output_0 + - /language_model/layers.26/self_attn/Softmax_output_0 + - /language_model/layers.27/self_attn/Softmax_output_0 + - /language_model/layers.28/self_attn/Softmax_output_0 + - /language_model/layers.29/self_attn/Softmax_output_0 + - /language_model/layers.30/self_attn/Softmax_output_0 + - /language_model/layers.31/self_attn/Softmax_output_0 + - /language_model/layers.32/self_attn/Softmax_output_0 + - /language_model/layers.33/self_attn/Softmax_output_0 + diff --git a/examples/performance/compute_context_length/gpt_oss.py b/examples/performance/compute_context_length/gpt_oss.py index b211ba914..ff5334d4e 100644 --- a/examples/performance/compute_context_length/gpt_oss.py +++ b/examples/performance/compute_context_length/gpt_oss.py @@ -11,26 +11,28 @@ model_id = "openai/gpt-oss-20b" # weights are not required to convert to fp32 +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. +## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. + ctx_len = 4096 # In moe models like gpt-oss, since prefill_seq_len=1 both comp_ctx_lengths_prefill and comp_ctx_lengths_decode can share similar lists. # Set the list of ccl during prefilling process -comp_ctx_lengths_prefill = [512, ctx_len] +comp_ctx_lengths_prefill = [512, ctx_len] # None # # Set the list of ccl during decoding process -comp_ctx_lengths_decode = [512, ctx_len] +comp_ctx_lengths_decode = [512, ctx_len] # None # qeff_model = QEFFAutoModelForCausalLM.from_pretrained( model_id, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - "prefill_seq_len": 1, # Passing prefill_seq_len is mandatory for CCL goal in moe models. Currently we can get best perf using PL=1. - }, + ccl_enabled=True, ) tokenizer = AutoTokenizer.from_pretrained(model_id) -onnx_model_path = qeff_model.export() qpc_path = qeff_model.compile( prefill_seq_len=1, # Currently we can get best perf using PL=1 i.e. decode-only model, prefill optimizations are being worked on. ctx_len=ctx_len, @@ -41,6 +43,8 @@ mos=1, aic_enable_depth_first=True, num_speculative_tokens=None, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) print(f"qpc path is {qpc_path}") streamer = TextStreamer(tokenizer) diff --git a/examples/performance/compute_context_length/granite_vision.py b/examples/performance/compute_context_length/granite_vision.py index 39b139bad..507ba11a4 100644 --- a/examples/performance/compute_context_length/granite_vision.py +++ b/examples/performance/compute_context_length/granite_vision.py @@ -20,6 +20,7 @@ def run_model( kv_offload=False, prefill_seq_len=5500, ctx_len=6000, + ccl_enabled=False, comp_ctx_lengths_prefill=None, comp_ctx_lengths_decode=None, generation_len=128, @@ -40,11 +41,7 @@ def run_model( model_name, token=token, kv_offload=kv_offload, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=ccl_enabled, ) ## STEP - 2 Export & Compile the Model @@ -56,6 +53,8 @@ def run_model( num_cores=num_cores, num_devices=num_devices, mxfp6_matmul=False, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) ## STEP - 3 Load and process the inputs for Inference @@ -96,6 +95,7 @@ def run_model( num_cores = 16 num_devices = 4 ctx_len = 8192 + ccl_enabled = True comp_ctx_lengths_prefill = [5500] comp_ctx_lengths_decode = [6144, ctx_len] @@ -106,6 +106,7 @@ def run_model( image_url=image_url, prefill_seq_len=prefill_seq_len, ctx_len=ctx_len, + ccl_enabled=ccl_enabled, comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, comp_ctx_lengths_decode=comp_ctx_lengths_decode, generation_len=generation_len, diff --git a/examples/performance/compute_context_length/internvl.py b/examples/performance/compute_context_length/internvl.py index 827d50c97..bea3b49d3 100644 --- a/examples/performance/compute_context_length/internvl.py +++ b/examples/performance/compute_context_length/internvl.py @@ -174,27 +174,21 @@ def run_intern_on_aic( prefill_seq_len=3840, num_devices=1, num_cores=16, + ctx_len=512, + ccl_enabled=False, + comp_ctx_lengths_prefill=None, + comp_ctx_lengths_decode=None, ): ## STEP 1 -- LOAD THE MODEL # The original Intern-VL model, despite being multimodal, is loaded using `AutoModelForCausalLM` in Huggingface. # To maintain compatibility, we load this model using `QEFFAutoModelForCausalLM`. - ctx_len = 8192 - comp_ctx_lengths_prefill = [4096] - comp_ctx_lengths_decode = [6144, ctx_len] - - # model = QEFFAutoModelForCausalLM.from_pretrained(model_name, kv_offload=kv_offload, trust_remote_code=True) - model = QEFFAutoModelForCausalLM.from_pretrained( model_name, kv_offload=kv_offload, trust_remote_code=True, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=ccl_enabled, ) ## STEP 2 -- EXPORT & COMPILE THE MODEL @@ -205,6 +199,8 @@ def run_intern_on_aic( ctx_len=ctx_len, prefill_seq_len=prefill_seq_len, mxfp6_matmul=False, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) ## STEP 3 -- SETUP THE PROCESSOR @@ -263,6 +259,11 @@ def run_intern_on_aic( num_devices = 4 num_cores = 16 + ctx_len = 8192 + ccl_enabled = True + comp_ctx_lengths_prefill = [4096] + comp_ctx_lengths_decode = [6144, ctx_len] + run_intern_on_aic( model_name=model_name, prompt=prompt, @@ -273,6 +274,10 @@ def run_intern_on_aic( prefill_seq_len=prefill_seq_len, num_devices=num_devices, num_cores=num_cores, + ctx_len=ctx_len, + ccl_enabled=ccl_enabled, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) diff --git a/examples/performance/compute_context_length/llama4.py b/examples/performance/compute_context_length/llama4.py index 534be8f96..cb0cb1939 100644 --- a/examples/performance/compute_context_length/llama4.py +++ b/examples/performance/compute_context_length/llama4.py @@ -17,6 +17,14 @@ config.text_config.num_hidden_layers = 4 config.vision_config.num_hidden_layers = 2 +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. +## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. + ctx_len = 8192 # Set the list of ccl during prefilling process comp_ctx_lengths_prefill = [3072] @@ -27,12 +35,8 @@ model_id, attn_implementation="eager", kv_offload=True, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, config=config, + ccl_enabled=True, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) @@ -54,6 +58,8 @@ aic_enable_depth_first=True, skip_vision=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) messages = [ @@ -95,6 +101,8 @@ mxint8_kv_cache=True, aic_enable_depth_first=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) ### IMAGE + TEXT ### @@ -121,7 +129,7 @@ ) inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) streamer = TextStreamer(tokenizer) - output = qeff_model.generate(inputs=inputs, device_ids=[0, 1, 2, 3], generation_len=100) + output = qeff_model.generate(inputs=inputs, device_ids=[8, 9, 10, 11], generation_len=100) print(output.generated_ids) print(tokenizer.batch_decode(output.generated_ids)) print(output) diff --git a/examples/performance/compute_context_length/llama4_cb.py b/examples/performance/compute_context_length/llama4_cb.py index ea7c09d69..98653080c 100644 --- a/examples/performance/compute_context_length/llama4_cb.py +++ b/examples/performance/compute_context_length/llama4_cb.py @@ -19,6 +19,14 @@ tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. +## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. + ctx_len = 4096 # Set the list of ccl during prefilling process comp_ctx_lengths_prefill = [3072] @@ -33,11 +41,7 @@ kv_offload=True, config=config, continuous_batching=True, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=True, ) qeff_model.compile( @@ -53,6 +57,8 @@ mxint8_kv_cache=True, aic_enable_depth_first=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) else: qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( @@ -60,11 +66,7 @@ attn_implementation="eager", kv_offload=True, config=config, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=True, ) qeff_model.compile( @@ -79,6 +81,8 @@ mxint8_kv_cache=True, aic_enable_depth_first=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) image_urls = [ diff --git a/examples/performance/compute_context_length/llama4_multi_image.py b/examples/performance/compute_context_length/llama4_multi_image.py index d7c403e5f..0fe8ffb78 100644 --- a/examples/performance/compute_context_length/llama4_multi_image.py +++ b/examples/performance/compute_context_length/llama4_multi_image.py @@ -17,6 +17,14 @@ config.text_config.num_hidden_layers = 4 config.vision_config.num_hidden_layers = 2 +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. +## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. + ctx_len = 8192 # Set the list of ccl during prefilling process comp_ctx_lengths_prefill = [5376] @@ -28,11 +36,7 @@ attn_implementation="eager", kv_offload=True, config=config, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=True, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) @@ -49,6 +53,8 @@ mxint8_kv_cache=True, aic_enable_depth_first=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) ### Multi_image Prompt ### diff --git a/examples/performance/compute_context_length/mistral3.py b/examples/performance/compute_context_length/mistral3.py index 96ed519f5..664e3c1c1 100644 --- a/examples/performance/compute_context_length/mistral3.py +++ b/examples/performance/compute_context_length/mistral3.py @@ -19,6 +19,7 @@ def run_model( kv_offload=False, prefill_seq_len=128, ctx_len=4096, + ccl_enabled=False, comp_ctx_lengths_prefill=None, comp_ctx_lengths_decode=None, generation_len=128, @@ -37,16 +38,15 @@ def run_model( config = AutoConfig.from_pretrained(model_name) config.vision_config._attn_implementation = "eager" + # For Testing Purpose Only + config.text_config.num_hidden_layers = 4 + config.vision_config.num_hidden_layers = 2 model = QEFFAutoModelForImageTextToText.from_pretrained( model_name, kv_offload=kv_offload, config=config, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=ccl_enabled, ) ## STEP - 2 Export & Compile the Model @@ -58,6 +58,8 @@ def run_model( num_cores=num_cores, num_devices=num_devices, mxfp6_matmul=False, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) ## STEP - 3 Load and process the inputs for Inference @@ -96,6 +98,7 @@ def run_model( generation_len = 128 num_cores = 16 num_devices = 4 + ccl_enabled = True comp_ctx_lengths_prefill = [4096] comp_ctx_lengths_decode = [6144, ctx_len] @@ -106,6 +109,7 @@ def run_model( image_url=image_url, prefill_seq_len=prefill_seq_len, ctx_len=ctx_len, + ccl_enabled=ccl_enabled, comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, comp_ctx_lengths_decode=comp_ctx_lengths_decode, generation_len=generation_len, diff --git a/examples/performance/compute_context_length/molmo.py b/examples/performance/compute_context_length/molmo.py index f68481631..7434c62b2 100644 --- a/examples/performance/compute_context_length/molmo.py +++ b/examples/performance/compute_context_length/molmo.py @@ -18,21 +18,25 @@ # config.num_hidden_layers = 2 +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. +## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. + # load the model ctx_len = 8192 -comp_ctx_lengths_prefill = [3072] -comp_ctx_lengths_decode = [4096, 8192] +comp_ctx_lengths_prefill = [3072] # None # +comp_ctx_lengths_decode = [4096, 8192] # None # qeff_model = QEFFAutoModelForCausalLM.from_pretrained( model_id, kv_offload=True, trust_remote_code=True, config=config, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=True, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) @@ -51,6 +55,8 @@ aic_enable_depth_first=True, skip_vision=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) inputs = processor.process(text="Tell me about yourself") @@ -74,6 +80,8 @@ mxint8_kv_cache=True, aic_enable_depth_first=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) ### IMAGE + TEXT ### diff --git a/examples/performance/compute_context_length/qwen2_5_vl.py b/examples/performance/compute_context_length/qwen2_5_vl.py index 00f43a73f..3266be634 100644 --- a/examples/performance/compute_context_length/qwen2_5_vl.py +++ b/examples/performance/compute_context_length/qwen2_5_vl.py @@ -21,20 +21,24 @@ config = AutoConfig.from_pretrained(model_id) # config.text_config.num_hidden_layers = 2 +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. +## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. + ctx_len = 8192 -comp_ctx_lengths_prefill = [4096] -comp_ctx_lengths_decode = [6144, ctx_len] +comp_ctx_lengths_prefill = [4096] # None # +comp_ctx_lengths_decode = [6144, ctx_len] # None # qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( model_id, attn_implementation="eager", kv_offload=True, config=config, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=True, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) @@ -59,6 +63,8 @@ aic_enable_depth_first=True, skip_vision=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) messages = [ @@ -103,6 +109,8 @@ mxint8_kv_cache=True, aic_enable_depth_first=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) ### IMAGE + TEXT ### diff --git a/examples/performance/compute_context_length/qwen2_5_vl_cb.py b/examples/performance/compute_context_length/qwen2_5_vl_cb.py index 6954d356f..75fece6db 100644 --- a/examples/performance/compute_context_length/qwen2_5_vl_cb.py +++ b/examples/performance/compute_context_length/qwen2_5_vl_cb.py @@ -16,7 +16,15 @@ ## For AWQ model update pytorch version to 2.8.* model_id = "Qwen/Qwen2.5-VL-32B-Instruct" config = AutoConfig.from_pretrained(model_id) -# config.text_config.num_hidden_layers = 2 +config.text_config.num_hidden_layers = 4 + +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. +## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. ctx_len = 8192 comp_ctx_lengths_prefill = [4096] @@ -28,11 +36,7 @@ kv_offload=True, config=config, continuous_batching=True, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=True, ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) @@ -52,6 +56,8 @@ mxint8_kv_cache=True, aic_enable_depth_first=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) image_urls = [ @@ -75,6 +81,7 @@ processor=processor, images=image_urls, generation_len=100, + device_ids=[28, 29, 30, 31], ) print(output.generated_ids) print(tokenizer.batch_decode(output.generated_ids)) diff --git a/examples/performance/compute_context_length/qwen3moe.py b/examples/performance/compute_context_length/qwen3moe.py new file mode 100644 index 000000000..9fb4c4d43 --- /dev/null +++ b/examples/performance/compute_context_length/qwen3moe.py @@ -0,0 +1,54 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +from transformers import AutoTokenizer + +from QEfficient import QEFFAutoModelForCausalLM +from QEfficient.utils.constants import Constants + +model_name = "Qwen/Qwen3-30B-A3B-Instruct-2507" +""" +# For CB inference, set continuous_batching to True and add full_batch_size,mxfp6,mint8 argument in compile function +# We will use prompt_len=1 for compilation for both cb and non-cb inference +""" + +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. +## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. + +ctx_len = 1024 +prefill_seq_len = 1 +# In moe models when compiling with prefill_seq_len=1 and non-continuous-batching mode, prefill and decode will share the same ccl specializations. +comp_ctx_lengths_prefill = [256, 512, ctx_len] # None # +comp_ctx_lengths_decode = [256, 512, ctx_len] # None # + +model = QEFFAutoModelForCausalLM.from_pretrained( + model_name, + continuous_batching=False, + ccl_enabled=True, + num_hidden_layers=4, +) + +model.compile( + prefill_seq_len=prefill_seq_len, + ctx_len=ctx_len, + batch_size=1, + num_cores=16, + num_devices=4, + mxfp6_matmul=True, + mxint8_kv_cache=True, + mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, +) +# mos=1, +tokenizer = AutoTokenizer.from_pretrained(model_name) +exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer) diff --git a/examples/performance/compute_context_length/qwen3moe_example/ccl_qwen3moe_inference.py b/examples/performance/compute_context_length/qwen3moe_example/ccl_qwen3moe_inference.py index d2fa208df..9fb4c4d43 100644 --- a/examples/performance/compute_context_length/qwen3moe_example/ccl_qwen3moe_inference.py +++ b/examples/performance/compute_context_length/qwen3moe_example/ccl_qwen3moe_inference.py @@ -16,21 +16,25 @@ # We will use prompt_len=1 for compilation for both cb and non-cb inference """ +## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained(). +## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length. +## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process. +## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk. +## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process. +## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index. +## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold. + ctx_len = 1024 prefill_seq_len = 1 -# In moe models when compiling with prefill_seq_len=1 and non-continuous-batching mode, prefill and decode will share the same specializations. -comp_ctx_lengths_prefill = [256, 512, ctx_len] -comp_ctx_lengths_decode = [256, 512, ctx_len] +# In moe models when compiling with prefill_seq_len=1 and non-continuous-batching mode, prefill and decode will share the same ccl specializations. +comp_ctx_lengths_prefill = [256, 512, ctx_len] # None # +comp_ctx_lengths_decode = [256, 512, ctx_len] # None # model = QEFFAutoModelForCausalLM.from_pretrained( model_name, continuous_batching=False, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - "prefill_seq_len": prefill_seq_len, - }, + ccl_enabled=True, + num_hidden_layers=4, ) model.compile( @@ -42,6 +46,8 @@ mxfp6_matmul=True, mxint8_kv_cache=True, mos=1, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) # mos=1, tokenizer = AutoTokenizer.from_pretrained(model_name) diff --git a/examples/performance/compute_context_length/vlm_inference.py b/examples/performance/compute_context_length/vlm_inference.py index 0920ddf30..f9577bd2c 100644 --- a/examples/performance/compute_context_length/vlm_inference.py +++ b/examples/performance/compute_context_length/vlm_inference.py @@ -30,6 +30,7 @@ def run_model( kv_offload=True, prefill_seq_len=32, ctx_len=8192, + ccl_enabled=False, comp_ctx_lengths_prefill=None, comp_ctx_lengths_decode=None, generation_len=128, @@ -76,11 +77,7 @@ def run_model( token=hf_token, attn_implementation="eager", kv_offload=kv_offload, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, + ccl_enabled=ccl_enabled, ) ## STEP 2: Export & Compile the Model @@ -93,6 +90,8 @@ def run_model( num_cores=num_cores, num_devices=num_devices, mxfp6_matmul=False, + comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=comp_ctx_lengths_decode, ) print(f"Model compiled successfully to: {qpc_path}") @@ -177,6 +176,11 @@ def main(): default=8192, help="Maximum context length", ) + parser.add_argument( + "--ccl-enabled", + action="store_true", + help="Enable compute-context-length (CCL) feature", + ) parser.add_argument( "--comp-ctx-lengths-prefill", type=lambda x: [int(i) for i in x.split(",")], @@ -223,6 +227,7 @@ def main(): kv_offload=args.kv_offload, prefill_seq_len=args.prefill_seq_len, ctx_len=args.ctx_len, + ccl_enabled=args.ccl_enabled, comp_ctx_lengths_prefill=args.comp_ctx_lengths_prefill, comp_ctx_lengths_decode=args.comp_ctx_lengths_decode, generation_len=args.generation_len,