diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py
index d9d6823ae..956989adf 100644
--- a/QEfficient/base/modeling_qeff.py
+++ b/QEfficient/base/modeling_qeff.py
@@ -14,10 +14,8 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Dict, List, Optional
-
 import onnx
 import torch
-
 from QEfficient.base.onnx_transforms import OnnxTransform
 from QEfficient.base.pytorch_transforms import PytorchTransform
 from QEfficient.compile.qnn_compiler import compile as qnn_compile
@@ -45,9 +43,10 @@ class QEFFBaseModel(ABC):
     def _transform_names(cls) -> List[str]:
         return [x.__name__ for x in cls._pytorch_transforms + cls._onnx_transforms]

-    def __init__(self, model: torch.nn.Module) -> None:
+    def __init__(self, model: torch.nn.Module, onnx_slim_transform: bool = False) -> None:
         super().__init__()
         self.model = model
+        self.onnx_slim_transform = onnx_slim_transform
         self.onnx_path: Optional[str] = None
         self.qpc_path: Optional[str] = None
         self.qpc_session: Optional[QAICInferenceSession] = None
@@ -119,6 +118,7 @@ def _export(
         example_inputs: Dict[str, torch.Tensor],
         output_names: List[str],
         dynamic_axes: Dict[str, Dict[int, str]],
+        onnx_slim_transform: bool = False,
         export_kwargs: Optional[Dict[str, any]] = None,
         onnx_transform_kwargs: Optional[Dict[str, any]] = None,
         export_dir: Optional[str] = None,
@@ -146,7 +146,6 @@ def _export(
         tmp_onnx_dir.mkdir(parents=True, exist_ok=True)

         # Create input_names from example_inputs
-
         input_names = []
         for param in inspect.signature(self.model.forward).parameters:
             if param in example_inputs:
@@ -183,11 +182,14 @@ def _export(
                 **export_kwargs,
             )
             logger.info("Pytorch export successful")
-
             model = onnx.load(tmp_onnx_path, load_external_data=False)
             transform_kwargs = {
-                "onnx_base_dir": str(tmp_onnx_dir),
+                "temp_onnx_path": tmp_onnx_path,
                 "model_name": self.model_name,
+                "enable_onnx_slim_transform": onnx_slim_transform,
+                "onnx_base_dir": str(tmp_onnx_dir),
+
+
             }
             if onnx_transform_kwargs is not None:
                 transform_kwargs.update(onnx_transform_kwargs)
@@ -248,8 +250,7 @@ def _compile(
             For QNN Compilation path, when enable_qnn is set to True, any parameter passed in compiler_options will be ignored.
         """
         if onnx_path is None and self.onnx_path is None:
-            self.export()
-
+            self.export()
         onnx_path = Path(onnx_path or self.onnx_path)
         compile_dir = Path(compile_dir or onnx_path.parent)
         qpc_path = compile_dir / "qpc"
@@ -368,5 +369,4 @@ def _compile(
         )

         self.qpc_path = qpc_path
-
         return qpc_path
diff --git a/QEfficient/base/onnx_transforms.py b/QEfficient/base/onnx_transforms.py
index 61b5c00f6..08bdb1ea3 100644
--- a/QEfficient/base/onnx_transforms.py
+++ b/QEfficient/base/onnx_transforms.py
@@ -8,6 +8,8 @@
 from typing import Optional, Tuple

 import numpy as np
+import onnx
+import onnxslim
 from onnx import ModelProto, external_data_helper, numpy_helper


@@ -36,7 +38,6 @@ class FP16ClipTransform(OnnxTransform):
     """
     Clips the tensor values to be in FP16 range, but preserves -inf values.
     """
-
     @classmethod
     def apply(cls, model: ModelProto, *, onnx_base_dir: Optional[str] = None, **kwargs) -> Tuple[ModelProto, bool]:
         """
@@ -99,3 +100,34 @@ def apply(
                 current_file_size = tsize
             external_data_helper.set_external_data(tensor, f"{model_name}_{file_num}.onnx.data")
         return model, transformed
+
+
+class OnnxSlimTransform(OnnxTransform):
+    """
+    Applies onnx-slim transformations on the given ONNX graph.
+    """
+
+    @classmethod
+    def apply(
+        cls,
+        model: ModelProto,
+        *,
+        onnx_base_dir: Optional[str] = None,
+        **kwargs,
+    ) -> Tuple[ModelProto, bool]:
+        """
+        :param enable_onnx_slim_transform: If True, applies onnx-slim transformations.
+        :param temp_onnx_path: Path to save the slimmed ONNX model.
+        """
+        transformed = False
+        onnx_slim_transform = kwargs.get("enable_onnx_slim_transform", False)
+        temp_onnx_path = kwargs.get("temp_onnx_path", None)
+        if not temp_onnx_path:
+            err_str = "temp_onnx_path is required for onnx-slim transform."
+            raise RuntimeError(err_str)
+        if onnx_slim_transform:
+            transformed = True
+            slimmed_model = onnxslim.slim(model)
+            onnx.save(slimmed_model, temp_onnx_path)
+            return slimmed_model, transformed
+        return model, transformed
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index d0c732052..34367ca96 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -26,7 +26,7 @@

 import QEfficient
 from QEfficient.base.modeling_qeff import QEFFBaseModel
-from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform
+from QEfficient.base.onnx_transforms import FP16ClipTransform, OnnxSlimTransform, SplitTensorsTransform
 from QEfficient.base.pytorch_transforms import SplitGateUpWeightsTransform
 from QEfficient.generation.cloud_infer import QAICInferenceSession
 from QEfficient.generation.text_generation_inference import (
@@ -82,17 +82,23 @@ def __repr__(self) -> str:

     @classmethod
     @with_replaced_quantizers
-    def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path: str, onnx_slim_transform: bool = False, *args, **kwargs):
         if kwargs.get("attn_implementation", None) not in {None, "eager"}:
             logger.warning('Updating attn_implementation="eager"')

         if kwargs.get("low_cpu_mem_usage", None):
             logger.warning("Updating low_cpu_mem_usage=False")

-        kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
+        kwargs.update(
+            {
+                "attn_implementation": "eager",
+                "low_cpu_mem_usage": False,
+
+            }
+        )

         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
-        return cls(model, pretrained_model_name_or_path=pretrained_model_name_or_path)
+        return cls(model, pretrained_model_name_or_path=pretrained_model_name_or_path, onnx_slim_transform=onnx_slim_transform, **kwargs)

     @property
     def model_name(self) -> str:
@@ -161,9 +167,9 @@ class QEFFAutoModel(QEFFTransformersBase):

     _hf_auto_class = AutoModel
     _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform]
-    _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
+    _onnx_transforms = [OnnxSlimTransform, FP16ClipTransform, SplitTensorsTransform]

-    def __init__(self, model: nn.Module, pooling=None, **kwargs):
+    def __init__(self, model: nn.Module, pooling=None, onnx_slim_transform: bool = False, **kwargs):
         super().__init__(model)

         # Make Embedding specific transforms like appending pooling
@@ -171,12 +177,14 @@ def __init__(self, model: nn.Module, pooling=None, **kwargs):
             self.model, _ = PoolingTransform.apply(self.model, pooling)

         self.model.base_model.config.use_cache = True
-
+        self.onnx_slim_transform = onnx_slim_transform
         self.pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path", None)

     @classmethod
     @with_replaced_quantizers
-    def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **kwargs):
+    def from_pretrained(
+        cls, pretrained_model_name_or_path, pooling=None, onnx_slim_transform: bool = False, *args, **kwargs
+    ):
         """
         This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModel.
         Once the model is initialized, you can use other methods such as export, compile, and generate on the same object.
@@ -228,7 +236,13 @@ def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **k
                 model, kv_offload=kv_offload
             )

-        return cls(model, pretrained_model_name_or_path=pretrained_model_name_or_path, pooling=pooling, **kwargs)
+        return cls(
+            model,
+            pretrained_model_name_or_path=pretrained_model_name_or_path,
+            pooling=pooling,
+            onnx_slim_transform=onnx_slim_transform,
+            **kwargs,
+        )

     @property
     def model_hash(self) -> str:
@@ -252,7 +266,7 @@ def model_hash(self) -> str:
     def get_model_config(self) -> dict:
         return self.model.config.__dict__

-    def export(self, export_dir: Optional[str] = None) -> str:
+    def export(self, export_dir: Optional[str] = None, **kwargs) -> str:
         """
         Exports the model to ``ONNX`` format using ``torch.onnx.export``.

@@ -278,6 +292,7 @@ def export(self, export_dir: Optional[str] = None) -> str:
             example_inputs,
             output_names,
             dynamic_axes,
+            onnx_slim_transform=self.onnx_slim_transform,
             export_dir=export_dir,
         )

@@ -446,14 +461,21 @@ class QEffVisionEncoderForTextImageToTextModel(QEFFBaseModel):
         KVCacheTransform,
         KVCacheExternalModuleMapperTransform,
     ]
-    _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
+    _onnx_transforms = [OnnxSlimTransform, FP16ClipTransform, SplitTensorsTransform]

-    def __init__(self, model: nn.modules):
+    def __init__(self, model: nn.modules, onnx_slim_transform: bool = False):
         super().__init__(model)
         self.model = model.get_qeff_vision_encoder()
+        self.onnx_slim_transform = onnx_slim_transform

     def export(self, inputs, output_names, dynamic_axes, export_dir=None):
-        return self._export(inputs, output_names, dynamic_axes, export_dir)
+        return self._export(
+            inputs,
+            output_names,
+            dynamic_axes,
+            export_dir,
+            onnx_slim_transform=self.onnx_slim_transform,
+        )

     def compile(
         self,
@@ -514,14 +536,21 @@ class QEffCausalLMForTextImageToTextModel(QEFFBaseModel):
         VlmKVOffloadTransform,
         SplitGateUpWeightsTransform,
     ]
-    _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
+    _onnx_transforms = [OnnxSlimTransform, FP16ClipTransform, SplitTensorsTransform]

-    def __init__(self, model):
+    def __init__(self, model, onnx_slim_transform: bool = False):
         super().__init__(model)
         self.model = model.get_qeff_language_decoder()
+        self.onnx_slim_transform = onnx_slim_transform

     def export(self, inputs, output_names, dynamic_axes, export_dir=None):
-        return self._export(inputs, output_names, dynamic_axes, export_dir)
+        return self._export(
+            inputs,
+            output_names,
+            dynamic_axes,
+            export_dir,
+            onnx_slim_transform=self.onnx_slim_transform,
+        )

     def compile(
         self,
@@ -579,6 +608,7 @@ class _QEffAutoModelForImageTextToTextDualQPC:
     def __init__(
         self,
         model: nn.Module,
+        onnx_slim_transform: bool = False,
         **kwargs,
     ):
         if kwargs.pop("full_batch_size", None):
@@ -589,6 +619,7 @@ def __init__(
         self.vision_model = QEffVisionEncoderForTextImageToTextModel(model)
         self.lang_model = QEffCausalLMForTextImageToTextModel(model)
         self.input_shapes, self.output_names = None, None
+        self.onnx_slim_transform = onnx_slim_transform

     @property
     def model_name(self) -> str:
@@ -598,7 +629,7 @@ def model_name(self) -> str:
         return mname

     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path: str, onnx_slim_transform: bool = False, **kwargs):
         if kwargs.get("attn_implementation", None) not in {None, "eager"}:
             logger.warning('Updating attn_implementation="eager"')

@@ -606,8 +637,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
             logger.warning("Updating low_cpu_mem_usage=False")

         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        return cls(model, pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs)
+        return cls(
+            model,
+            pretrained_model_name_or_path=pretrained_model_name_or_path,
+            onnx_slim_transform=onnx_slim_transform,
+            **kwargs,
+        )

     @property
     def onnx_path(self):
@@ -937,11 +973,12 @@ class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, Multimodal
         VlmNoKVOffloadTransform,
         SplitGateUpWeightsTransform,
     ]
-    _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
+    _onnx_transforms = [OnnxSlimTransform, FP16ClipTransform, SplitTensorsTransform]

     def __init__(
         self,
         model: nn.Module,
+        onnx_slim_transform: bool = False,
         **kwargs,
     ):
         if kwargs.pop("full_batch_size", None):
@@ -956,11 +993,13 @@ def __init__(
         else:
             self.model.config.text_config.use_cache = True
         self.pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path", None)
+        self.onnx_slim_transform = onnx_slim_transform

     @classmethod
     def from_pretrained(
         cls,
         pretrained_model_name_or_path,
+        onnx_slim_transform: bool = False,
         *args,
         **kwargs,
     ):
@@ -973,10 +1012,14 @@ def from_pretrained(
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
         from transformers import AutoConfig

-        config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
+        config = AutoConfig.from_pretrained(
+            pretrained_model_name_or_path, trust_remote_code=True, **kwargs
+        )
         config._attn_implementation = "eager"
         config.vision_config.use_flash_attn = "false"
-        model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, config, *args, **kwargs)
+        model = cls._hf_auto_class.from_pretrained(
+            pretrained_model_name_or_path, config, *args, **kwargs
+        )

         return cls(model, pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs)

@@ -988,7 +1031,13 @@ def export(
         inputs = self.model.get_dummy_inputs()
         dynamic_axes = self.model.get_onnx_dynamic_axes()
         output_names = self.model.get_output_names()
-        return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir)
+        return self._export(
+            inputs,
+            output_names,
+            dynamic_axes,
+            export_dir=export_dir,
+            onnx_slim_transform=self.onnx_slim_transform,
+        )

     def compile(
         self,
@@ -1308,7 +1357,13 @@ def __new__(self, model: nn.Module, kv_offload: Optional[bool] = True, **kwargs)

     @classmethod
     @with_replaced_quantizers
-    def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: Optional[bool] = None, **kwargs):
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: str,
+        onnx_slim_transform: bool = False,
+        kv_offload: Optional[bool] = None,
+        **kwargs,
+    ):
         """Used to load models supported by transformers.AutoModelForImageTextToText for Cloud AI 100.

         Args:
@@ -1329,8 +1384,16 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: Optiona
             NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.")

         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
-        model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        return cls(model, kv_offload=kv_offload, pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs)
+        model = cls._hf_auto_class.from_pretrained(
+            pretrained_model_name_or_path, **kwargs
+        )
+        return cls(
+            model,
+            kv_offload=kv_offload,
+            pretrained_model_name_or_path=pretrained_model_name_or_path,
+            onnx_slim_transform=onnx_slim_transform,
+            **kwargs,
+        )


 MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP = {"InternVLChatModel": QEFFAutoModelForImageTextToText}
@@ -1379,13 +1442,14 @@ class QEFFAutoModelForCausalLM(QEFFBaseModel):
         SplitGateUpWeightsTransform,
         KVCacheExternalModuleMapperTransform,
     ]
-    _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
+    _onnx_transforms = [OnnxSlimTransform, FP16ClipTransform, SplitTensorsTransform]

     def __init__(
         self,
         model: nn.Module,
         continuous_batching: bool = False,
         qaic_config: Optional[dict] = None,
+        onnx_slim_transform: bool = False,
         **kwargs,
     ):
         model_class_name = model.__class__.__name__
@@ -1414,6 +1478,7 @@ def __init__(
         self.pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path", None)
         self.model, transformed = SpDTransform.apply(self.model, qaic_config, **kwargs)
         self.is_tlm = transformed
+        self.onnx_slim_transform = onnx_slim_transform

         # ---Sampling---
         # Note: SamplerTransform should be applied after all other transforms
@@ -1440,6 +1505,7 @@ def from_pretrained(
         pretrained_model_name_or_path,
         continuous_batching: bool = False,
         qaic_config: Optional[dict] = None,
+        onnx_slim_transform: bool = False,
         *args,
         **kwargs,
     ):
@@ -1509,6 +1575,7 @@ def from_pretrained(
             continuous_batching=continuous_batching,
             qaic_config=qaic_config,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
+            onnx_slim_transform=onnx_slim_transform,
             **kwargs,
         )

@@ -1614,6 +1681,7 @@ def export(self, export_dir: Optional[str] = None) -> str:
             output_names,
             dynamic_axes,
             export_dir=export_dir,
+            onnx_slim_transform=self.onnx_slim_transform,
         )

     def get_sampling_inputs_and_outputs(
@@ -1958,9 +2026,9 @@ class QEFFAutoModelForSpeechSeq2Seq(QEFFTransformersBase, MultimodalUtilityMixin

     _hf_auto_class = AutoModelForSpeechSeq2Seq
     _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, KVCacheTransform]
-    _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
+    _onnx_transforms = [OnnxSlimTransform, FP16ClipTransform, SplitTensorsTransform]

-    def __init__(self, model: nn.Module, **kwargs):
+    def __init__(self, model: nn.Module, onnx_slim_transform: bool = False, **kwargs):
         model_class_name = model.__class__.__name__
         if not (model_class_name.endswith("ForConditionalGeneration")):
             raise TypeError(f"Required pytorch module with ForConditionalGeneration, got {model_class_name}")
@@ -1969,6 +2037,7 @@ def __init__(self, model: nn.Module, **kwargs):
         self.model.config.use_cache = True
         self.num_layers = model.config.num_hidden_layers
         self.pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path", None)
+        self.onnx_slim_transform = onnx_slim_transform

     @property
     def model_hash(self) -> str:
@@ -2003,7 +2072,9 @@ def export(self, export_dir: Optional[str] = None) -> str:
         inputs = self.model.get_dummy_inputs()
         dynamic_axes = self.model.get_onnx_dynamic_axes()
         output_names = self.model.get_output_names()
-        return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir)
+        return self._export(
+            inputs, output_names, dynamic_axes, export_dir=export_dir, onnx_slim_transform=self.onnx_slim_transform
+        )

     def compile(
         self,
diff --git a/pyproject.toml b/pyproject.toml
index 479736c22..deccd7499 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,6 +33,7 @@ dependencies = [
     "numpy==1.26.4",
     "protobuf==6.31.0",
     "onnxscript==0.2.5",
+    "onnxslim==0.1.64",
     "pillow===10.4.0",
     "sympy",
     "tensorboard",
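
A minimal usage sketch of the flag this patch introduces (not part of the diff itself; the model card and compile options below are illustrative placeholders, assuming the flag is forwarded as in the patch):

    from QEfficient import QEFFAutoModelForCausalLM

    # onnx_slim_transform=True makes _export() run OnnxSlimTransform (onnxslim.slim)
    # on the exported graph before the FP16 clip and tensor-splitting transforms.
    model = QEFFAutoModelForCausalLM.from_pretrained(
        "gpt2",                    # placeholder model card
        onnx_slim_transform=True,
    )
    onnx_path = model.export()     # ONNX export + onnx-slim + remaining _onnx_transforms
    qpc_path = model.compile(num_cores=16)  # compile the slimmed ONNX for Cloud AI 100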