diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 6c9f88d9f..d17e342f7 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -292,6 +292,7 @@ def _compile( custom_io: Optional[Dict[str, str]] = None, mdp_ts_num_devices: int = 1, num_speculative_tokens: Optional[int] = None, + mxfp6_matmul: bool = constants.DEFAULT_AIC_MXFP6_MATMUL, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, **compiler_options, @@ -307,6 +308,7 @@ def _compile( :custom_io (dict): Custom IO to specify the input and outputs in different formats than default :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing. :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model. + :mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False``. :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` :qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file. ``Defaults to None.`` :compiler_options: Pass any compiler option as input. @@ -337,7 +339,7 @@ def _compile( custom_io=custom_io, device_group=list(range(mdp_ts_num_devices)), num_cores=compiler_options.get("aic_num_cores", constants.DEFAULT_AIC_NUM_CORES), - mxfp6=compiler_options.get("mxfp6_matmul", constants.DEFAULT_AIC_MXPF6_MATMUL), + mxfp6=mxfp6_matmul, mxint8=mxint8_kv_cache, qnn_config=qnn_config, ) @@ -349,6 +351,9 @@ def _compile( if mdp_ts_json_path := compiler_options.pop("mdp_load_partition_config", None): command.append(f"-mdp-load-partition-config={mdp_ts_json_path}") + if mxfp6_matmul: + command.append("-mxfp6-matmul") + for key, value in compiler_options.items(): option = "-" + key.replace("_", "-") if isinstance(value, bool): diff --git a/QEfficient/compile/qnn_compiler.py b/QEfficient/compile/qnn_compiler.py index e2ec20364..f0073063b 100644 --- a/QEfficient/compile/qnn_compiler.py +++ b/QEfficient/compile/qnn_compiler.py @@ -106,8 +106,17 @@ def parse_qnn_config(self): for key, value in config_data.items(): if key == QnnConstants.CONVERTER_ARGS_EXTENSION_STR: self.check_extension_arg(key, value, QnnConstants.IMMUTABLE_CONVERTER_ARGS) - if key == QnnConstants.CONTEXT_BIN_ARGS_EXTENSION_STR: + elif key == QnnConstants.CONTEXT_BIN_ARGS_EXTENSION_STR: self.check_extension_arg(key, value, QnnConstants.IMMUTABLE_CONTEXT_BIN_GEN_ARGS) + elif key == QnnConstants.QNN_COMPILATION_BACKEND_STR: + immutable_param = [ + sub_key for sub_key in value.keys() if sub_key in QnnConstants.IMMUTABLE_COMPILATION_BACKEND_ARGS + ] + if immutable_param: + raise AttributeError( + f"Immutable Parameters {immutable_param} found in {QnnConstants.QNN_COMPILATION_BACKEND_STR}. Please remove them from QNN Configuration file." + ) + self.qnn_config[key] = value def create_qnn_tensor_slicing_json(self) -> str: diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index cc52658c6..5e2983a17 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -27,7 +27,7 @@ # Compiler defaults DEFAULT_AIC_NUM_CORES = 16 -DEFAULT_AIC_MXPF6_MATMUL = False +DEFAULT_AIC_MXFP6_MATMUL = False # Hashing defaults HASH_HEXDIGEST_STR_LEN = 16 KWARGS_INCLUSION_LIST = [ @@ -207,6 +207,10 @@ class QnnConstants: "--config_file ", ] + IMMUTABLE_COMPILATION_BACKEND_ARGS = [ + "compiler_mxfp6_matmul_weights", + ] + QNN_SAMPLE_CONFIG = { "converter_args_extension": "--onnx_defer_loading", "context_binary_generator_args_extension": "--log_level debug",