diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index be4b86321..8e824b488 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -8,19 +8,50 @@ import os import warnings -from QEfficient.utils import custom_format_warning - # For faster downloads via hf_transfer # This code is put above import statements as this needs to be executed before -# hf_transfer is imported (will happen on line 15 via leading imports) +# hf_transfer is imported (will happen on line 14 via leading imports) os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" + # Placeholder for all non-transformer models registered in QEfficient import QEfficient.utils.model_registery # noqa: F401 +from QEfficient.base import ( + QEFFAutoModel, + QEFFAutoModelForCausalLM, + QEFFAutoModelForImageTextToText, + QEFFAutoModelForSpeechSeq2Seq, + QEFFCommonLoader, +) +from QEfficient.compile.compile_helper import compile +from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter +from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv +from QEfficient.peft import QEffAutoPeftModelForCausalLM +from QEfficient.transformers.transform import transform +from QEfficient.utils import custom_format_warning from QEfficient.utils.logging_utils import logger # custom warning for the better logging experience warnings.formatwarning = custom_format_warning +# Package version +__version__ = "0.0.1.dev0" + +# Users can use QEfficient.export for exporting models to ONNX +export = qualcomm_efficient_converter + +__all__ = [ + "transform", + "export", + "compile", + "cloud_ai_100_exec_kv", + "QEFFAutoModel", + "QEFFAutoModelForCausalLM", + "QEffAutoPeftModelForCausalLM", + "QEFFAutoModelForImageTextToText", + "QEFFAutoModelForSpeechSeq2Seq", + "QEFFCommonLoader", +] + def check_qaic_sdk(): """Check if QAIC SDK is installed""" @@ -36,38 +67,5 @@ def check_qaic_sdk(): return False -# Conditionally import 
QAIC-related modules if the SDK is installed -__version__ = "0.0.1.dev0" - -if check_qaic_sdk(): - from QEfficient.base import ( - QEFFAutoModel, - QEFFAutoModelForCausalLM, - QEFFAutoModelForImageTextToText, - QEFFAutoModelForSpeechSeq2Seq, - QEFFCommonLoader, - ) - from QEfficient.compile.compile_helper import compile - from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter - from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv - from QEfficient.peft import QEffAutoPeftModelForCausalLM - from QEfficient.transformers.transform import transform - - # Users can use QEfficient.export for exporting models to ONNX - export = qualcomm_efficient_converter - - __all__ = [ - "transform", - "export", - "compile", - "cloud_ai_100_exec_kv", - "QEFFAutoModel", - "QEFFAutoModelForCausalLM", - "QEffAutoPeftModelForCausalLM", - "QEFFAutoModelForImageTextToText", - "QEFFAutoModelForSpeechSeq2Seq", - "QEFFCommonLoader", - ] - -else: +if not check_qaic_sdk(): logger.warning("QAIC SDK is not installed, eager mode features won't be available!") diff --git a/QEfficient/generation/cloud_infer.py b/QEfficient/generation/cloud_infer.py index 8519d824c..351d7f014 100644 --- a/QEfficient/generation/cloud_infer.py +++ b/QEfficient/generation/cloud_infer.py @@ -5,43 +5,34 @@ # # ----------------------------------------------------------------------------- +import importlib +import platform +import sys +from functools import cached_property from pathlib import Path from typing import Dict, List, Optional, Union from warnings import warn import numpy as np -try: - import qaicrt -except ImportError: - import platform - import sys - - sys.path.append(f"/opt/qti-aic/dev/lib/{platform.machine()}") - import qaicrt - -try: - import QAicApi_pb2 as aicapi -except ImportError: - import sys - - sys.path.append("/opt/qti-aic/dev/python") - import QAicApi_pb2 as aicapi - -aic_to_np_dtype_mapping = { - aicapi.FLOAT_TYPE: np.dtype(np.float32), - 
aicapi.FLOAT_16_TYPE: np.dtype(np.float16), - aicapi.INT8_Q_TYPE: np.dtype(np.int8), - aicapi.UINT8_Q_TYPE: np.dtype(np.uint8), - aicapi.INT16_Q_TYPE: np.dtype(np.int16), - aicapi.INT32_Q_TYPE: np.dtype(np.int32), - aicapi.INT32_I_TYPE: np.dtype(np.int32), - aicapi.INT64_I_TYPE: np.dtype(np.int64), - aicapi.INT8_TYPE: np.dtype(np.int8), -} - class QAICInferenceSession: + @cached_property + def qaicrt(self): + try: + return importlib.import_module("qaicrt") + except ImportError: + sys.path.append(f"/opt/qti-aic/dev/lib/{platform.machine()}") + return importlib.import_module("qaicrt") + + @cached_property + def aicapi(self): + try: + return importlib.import_module("QAicApi_pb2") + except ImportError: + sys.path.append("/opt/qti-aic/dev/python") + return importlib.import_module("QAicApi_pb2") + def __init__( self, qpc_path: Union[Path, str], @@ -51,66 +42,86 @@ def __init__( ): """ Initialise for QAIC inference Session - --------- - :qpc_path: str. Path to the save generated binary file after compilation. - :device_ids: List[int]. Device Ids to be used for compilation. if devices > 1, it enables multiple card setup. - :activate: bool. If false, activation will be disabled. Default=True. - :enable_debug_logs: bool. If True, It will enable debug logs. Default=False. + :param qpc_path: Path to the saved compiled QPC binary. + :param device_ids: Device IDs to be used; if > 1, enables multi-card setup. + :param activate: If False, activation will be skipped. Default=True. + :param enable_debug_logs: If True, enable debug logs. Default=False. 
""" + + # Build dtype mapping once (depends on self.aicapi constants) + self.aic_to_np_dtype_mapping = { + self.aicapi.FLOAT_TYPE: np.dtype(np.float32), + self.aicapi.FLOAT_16_TYPE: np.dtype(np.float16), + self.aicapi.INT8_Q_TYPE: np.dtype(np.int8), + self.aicapi.UINT8_Q_TYPE: np.dtype(np.uint8), + self.aicapi.INT16_Q_TYPE: np.dtype(np.int16), + self.aicapi.INT32_Q_TYPE: np.dtype(np.int32), + self.aicapi.INT32_I_TYPE: np.dtype(np.int32), + self.aicapi.INT64_I_TYPE: np.dtype(np.int64), + self.aicapi.INT8_TYPE: np.dtype(np.int8), + } # Load QPC if device_ids is not None: - devices = qaicrt.QIDList(device_ids) - self.context = qaicrt.Context(devices) - self.queue = qaicrt.Queue(self.context, device_ids[0]) + devices = self.qaicrt.QIDList(device_ids) + self.context = self.qaicrt.Context(devices) + self.queue = self.qaicrt.Queue(self.context, device_ids[0]) else: - self.context = qaicrt.Context() - self.queue = qaicrt.Queue(self.context, 0) # Async API + self.context = self.qaicrt.Context() + self.queue = self.qaicrt.Queue(self.context, 0) # Async API + if enable_debug_logs: - if self.context.setLogLevel(qaicrt.QLogLevel.QL_DEBUG) != qaicrt.QStatus.QS_SUCCESS: + if self.context.setLogLevel(self.qaicrt.QLogLevel.QL_DEBUG) != self.qaicrt.QStatus.QS_SUCCESS: raise RuntimeError("Failed to setLogLevel") - qpc = qaicrt.Qpc(str(qpc_path)) + + qpc = self.qaicrt.Qpc(str(qpc_path)) + # Load IO Descriptor - iodesc = aicapi.IoDesc() + iodesc = self.aicapi.IoDesc() status, iodesc_data = qpc.getIoDescriptor() - if status != qaicrt.QStatus.QS_SUCCESS: + if status != self.qaicrt.QStatus.QS_SUCCESS: raise RuntimeError("Failed to getIoDescriptor") iodesc.ParseFromString(bytes(iodesc_data)) + self.allowed_shapes = [ - [(aic_to_np_dtype_mapping[x.type].itemsize, list(x.dims)) for x in allowed_shape.shapes] + [(self.aic_to_np_dtype_mapping[x.type].itemsize, list(x.dims)) for x in allowed_shape.shapes] for allowed_shape in iodesc.allowed_shapes ] self.bindings = iodesc.selected_set.bindings 
self.binding_index_map = {binding.name: binding.index for binding in self.bindings} + # Create and load Program - prog_properties = qaicrt.QAicProgramProperties() + prog_properties = self.qaicrt.QAicProgramProperties() prog_properties.SubmitRetryTimeoutMs = 60_000 if device_ids and len(device_ids) > 1: prog_properties.devMapping = ":".join(map(str, device_ids)) - self.program = qaicrt.Program(self.context, None, qpc, prog_properties) - if self.program.load() != qaicrt.QStatus.QS_SUCCESS: + + self.program = self.qaicrt.Program(self.context, None, qpc, prog_properties) + if self.program.load() != self.qaicrt.QStatus.QS_SUCCESS: raise RuntimeError("Failed to load program") + if activate: self.activate() + # Create input qbuffers and buf_dims - self.qbuffers = [qaicrt.QBuffer(bytes(binding.size)) for binding in self.bindings] - self.buf_dims = qaicrt.BufferDimensionsVecRef( - [(aic_to_np_dtype_mapping[binding.type].itemsize, list(binding.dims)) for binding in self.bindings] + self.qbuffers = [self.qaicrt.QBuffer(bytes(binding.size)) for binding in self.bindings] + self.buf_dims = self.qaicrt.BufferDimensionsVecRef( + [(self.aic_to_np_dtype_mapping[binding.type].itemsize, list(binding.dims)) for binding in self.bindings] ) @property def input_names(self) -> List[str]: - return [binding.name for binding in self.bindings if binding.dir == aicapi.BUFFER_IO_TYPE_INPUT] + return [binding.name for binding in self.bindings if binding.dir == self.aicapi.BUFFER_IO_TYPE_INPUT] @property def output_names(self) -> List[str]: - return [binding.name for binding in self.bindings if binding.dir == aicapi.BUFFER_IO_TYPE_OUTPUT] + return [binding.name for binding in self.bindings if binding.dir == self.aicapi.BUFFER_IO_TYPE_OUTPUT] def activate(self): """Activate qpc""" self.program.activate() - self.execObj = qaicrt.ExecObj(self.context, self.program) + self.execObj = self.qaicrt.ExecObj(self.context, self.program) def deactivate(self): """Deactivate qpc""" @@ -131,7 +142,7 @@ def 
set_buffers(self, buffers: Dict[str, np.ndarray]): warn(f'Buffer: "{buffer_name}" not found') continue buffer_index = self.binding_index_map[buffer_name] - self.qbuffers[buffer_index] = qaicrt.QBuffer(buffer.tobytes()) + self.qbuffers[buffer_index] = self.qaicrt.QBuffer(buffer.tobytes()) self.buf_dims[buffer_index] = ( buffer.itemsize, buffer.shape if len(buffer.shape) > 0 else (1,), @@ -157,21 +168,19 @@ def run(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: Return: :Dict[str, np.ndarray]: """ - # Set inputs + self.set_buffers(inputs) - if self.execObj.setData(self.qbuffers, self.buf_dims) != qaicrt.QStatus.QS_SUCCESS: + if self.execObj.setData(self.qbuffers, self.buf_dims) != self.qaicrt.QStatus.QS_SUCCESS: raise MemoryError("Failed to setData") - # # Run with sync API - # if self.execObj.run(self.qbuffers) != qaicrt.QStatus.QS_SUCCESS: - # Run with async API - if self.queue.enqueue(self.execObj) != qaicrt.QStatus.QS_SUCCESS: + + if self.queue.enqueue(self.execObj) != self.qaicrt.QStatus.QS_SUCCESS: raise MemoryError("Failed to enqueue") - if self.execObj.waitForCompletion() != qaicrt.QStatus.QS_SUCCESS: + + if self.execObj.waitForCompletion() != self.qaicrt.QStatus.QS_SUCCESS: error_message = "Failed to run" - # Print additional error messages for unmatched dimension error + if self.allowed_shapes: - error_message += "\n\n" - error_message += '(Only if "No matching dimension found" error is present above)' + error_message += "\n\n(Only if 'No matching dimension found' error is present above)" error_message += "\nAllowed shapes:" for i, allowed_shape in enumerate(self.allowed_shapes): error_message += f"\n{i}\n" @@ -189,11 +198,11 @@ def run(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: continue error_message += f"{binding.name}:\t{elemsize}\t{shape}\n" raise ValueError(error_message) - # Get output buffers + status, output_qbuffers = self.execObj.getData() - if status != qaicrt.QStatus.QS_SUCCESS: + if status != 
self.qaicrt.QStatus.QS_SUCCESS: raise MemoryError("Failed to getData") - # Build output + outputs = {} for output_name in self.output_names: buffer_index = self.binding_index_map[output_name] @@ -201,6 +210,6 @@ def run(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: continue outputs[output_name] = np.frombuffer( bytes(output_qbuffers[buffer_index]), - aic_to_np_dtype_mapping[self.bindings[buffer_index].type], + self.aic_to_np_dtype_mapping[self.bindings[buffer_index].type], ).reshape(self.buf_dims[buffer_index][1]) return outputs