Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 35 additions & 37 deletions QEfficient/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,50 @@
import os
import warnings

from QEfficient.utils import custom_format_warning

# For faster downloads via hf_transfer
# This code is put above import statements as this needs to be executed before
# hf_transfer is imported (will happen on line 15 via leading imports)
# hf_transfer is imported (will happen on line 14 via leading imports)
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Placeholder for all non-transformer models registered in QEfficient
import QEfficient.utils.model_registery # noqa: F401
from QEfficient.base import (
QEFFAutoModel,
QEFFAutoModelForCausalLM,
QEFFAutoModelForImageTextToText,
QEFFAutoModelForSpeechSeq2Seq,
QEFFCommonLoader,
)
from QEfficient.compile.compile_helper import compile
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
from QEfficient.peft import QEffAutoPeftModelForCausalLM
from QEfficient.transformers.transform import transform
from QEfficient.utils import custom_format_warning
from QEfficient.utils.logging_utils import logger

# custom warning for the better logging experience
warnings.formatwarning = custom_format_warning

# Package version
__version__ = "0.0.1.dev0"

# Users can use QEfficient.export for exporting models to ONNX
export = qualcomm_efficient_converter

__all__ = [
"transform",
"export",
"compile",
"cloud_ai_100_exec_kv",
"QEFFAutoModel",
"QEFFAutoModelForCausalLM",
"QEffAutoPeftModelForCausalLM",
"QEFFAutoModelForImageTextToText",
"QEFFAutoModelForSpeechSeq2Seq",
"QEFFCommonLoader",
]


def check_qaic_sdk():
"""Check if QAIC SDK is installed"""
Expand All @@ -36,38 +67,5 @@ def check_qaic_sdk():
return False


# Conditionally import QAIC-related modules if the SDK is installed
__version__ = "0.0.1.dev0"

if check_qaic_sdk():
from QEfficient.base import (
QEFFAutoModel,
QEFFAutoModelForCausalLM,
QEFFAutoModelForImageTextToText,
QEFFAutoModelForSpeechSeq2Seq,
QEFFCommonLoader,
)
from QEfficient.compile.compile_helper import compile
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
from QEfficient.peft import QEffAutoPeftModelForCausalLM
from QEfficient.transformers.transform import transform

# Users can use QEfficient.export for exporting models to ONNX
export = qualcomm_efficient_converter

__all__ = [
"transform",
"export",
"compile",
"cloud_ai_100_exec_kv",
"QEFFAutoModel",
"QEFFAutoModelForCausalLM",
"QEffAutoPeftModelForCausalLM",
"QEFFAutoModelForImageTextToText",
"QEFFAutoModelForSpeechSeq2Seq",
"QEFFCommonLoader",
]

else:
if not check_qaic_sdk():
logger.warning("QAIC SDK is not installed, eager mode features won't be available!")
145 changes: 77 additions & 68 deletions QEfficient/generation/cloud_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,43 +5,34 @@
#
# -----------------------------------------------------------------------------

import importlib
import platform
import sys
from functools import cached_property
from pathlib import Path
from typing import Dict, List, Optional, Union
from warnings import warn

import numpy as np

try:
import qaicrt
except ImportError:
import platform
import sys

sys.path.append(f"/opt/qti-aic/dev/lib/{platform.machine()}")
import qaicrt

try:
import QAicApi_pb2 as aicapi
except ImportError:
import sys

sys.path.append("/opt/qti-aic/dev/python")
import QAicApi_pb2 as aicapi

# Lookup table from the dtype constants exposed by ``aicapi`` (QAicApi_pb2) to
# the NumPy dtype used for buffers of that type. Several constants share a
# NumPy dtype: INT8_Q_TYPE and INT8_TYPE both map to int8, and INT32_Q_TYPE and
# INT32_I_TYPE both map to int32.
aic_to_np_dtype_mapping = {
aicapi.FLOAT_TYPE: np.dtype(np.float32),
aicapi.FLOAT_16_TYPE: np.dtype(np.float16),
aicapi.INT8_Q_TYPE: np.dtype(np.int8),
aicapi.UINT8_Q_TYPE: np.dtype(np.uint8),
aicapi.INT16_Q_TYPE: np.dtype(np.int16),
aicapi.INT32_Q_TYPE: np.dtype(np.int32),
aicapi.INT32_I_TYPE: np.dtype(np.int32),
aicapi.INT64_I_TYPE: np.dtype(np.int64),
aicapi.INT8_TYPE: np.dtype(np.int8),
}


class QAICInferenceSession:
@cached_property
def qaicrt(self):
    """Import and return the ``qaicrt`` module, cached per instance.

    The module is first looked up on the current ``sys.path``. If that fails,
    the fallback directory ``/opt/qti-aic/dev/lib/<machine>`` (where
    ``<machine>`` is ``platform.machine()``) is added to ``sys.path`` and the
    import is retried once.

    :return: The imported ``qaicrt`` module.
    :raises ImportError: If the module still cannot be imported after the
        fallback directory has been added.
    """
    try:
        return importlib.import_module("qaicrt")
    except ImportError:
        fallback_dir = f"/opt/qti-aic/dev/lib/{platform.machine()}"
        # Register the fallback directory only once: without this guard every
        # failed lookup (e.g. each new session on a host where the module is
        # missing) would append another duplicate entry to sys.path.
        if fallback_dir not in sys.path:
            sys.path.append(fallback_dir)
        return importlib.import_module("qaicrt")

@cached_property
def aicapi(self):
    """Import and return the ``QAicApi_pb2`` module, cached per instance.

    The module is first looked up on the current ``sys.path``. If that fails,
    the fallback directory ``/opt/qti-aic/dev/python`` is added to
    ``sys.path`` and the import is retried once.

    :return: The imported ``QAicApi_pb2`` module.
    :raises ImportError: If the module still cannot be imported after the
        fallback directory has been added.
    """
    try:
        return importlib.import_module("QAicApi_pb2")
    except ImportError:
        fallback_dir = "/opt/qti-aic/dev/python"
        # Register the fallback directory only once: without this guard every
        # failed lookup would append another duplicate entry to sys.path.
        if fallback_dir not in sys.path:
            sys.path.append(fallback_dir)
        return importlib.import_module("QAicApi_pb2")

Comment on lines +20 to +35
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Python does this automatically, so we don't need to do this, IMO.
Is this required?

def __init__(
self,
qpc_path: Union[Path, str],
Expand All @@ -51,66 +42,86 @@ def __init__(
):
"""
Initialise for QAIC inference Session
---------

:qpc_path: str. Path to the save generated binary file after compilation.
:device_ids: List[int]. Device Ids to be used for compilation. if devices > 1, it enables multiple card setup.
:activate: bool. If false, activation will be disabled. Default=True.
:enable_debug_logs: bool. If True, It will enable debug logs. Default=False.
:param qpc_path: Path to the saved compiled QPC binary.
:param device_ids: Device IDs to be used; if > 1, enables multi-card setup.
:param activate: If False, activation will be skipped. Default=True.
:param enable_debug_logs: If True, enable debug logs. Default=False.
"""

# Build dtype mapping once (depends on self.aicapi constants)
self.aic_to_np_dtype_mapping = {
self.aicapi.FLOAT_TYPE: np.dtype(np.float32),
self.aicapi.FLOAT_16_TYPE: np.dtype(np.float16),
self.aicapi.INT8_Q_TYPE: np.dtype(np.int8),
self.aicapi.UINT8_Q_TYPE: np.dtype(np.uint8),
self.aicapi.INT16_Q_TYPE: np.dtype(np.int16),
self.aicapi.INT32_Q_TYPE: np.dtype(np.int32),
self.aicapi.INT32_I_TYPE: np.dtype(np.int32),
self.aicapi.INT64_I_TYPE: np.dtype(np.int64),
self.aicapi.INT8_TYPE: np.dtype(np.int8),
}
# Load QPC
if device_ids is not None:
devices = qaicrt.QIDList(device_ids)
self.context = qaicrt.Context(devices)
self.queue = qaicrt.Queue(self.context, device_ids[0])
devices = self.qaicrt.QIDList(device_ids)
self.context = self.qaicrt.Context(devices)
self.queue = self.qaicrt.Queue(self.context, device_ids[0])
else:
self.context = qaicrt.Context()
self.queue = qaicrt.Queue(self.context, 0) # Async API
self.context = self.qaicrt.Context()
self.queue = self.qaicrt.Queue(self.context, 0) # Async API

if enable_debug_logs:
if self.context.setLogLevel(qaicrt.QLogLevel.QL_DEBUG) != qaicrt.QStatus.QS_SUCCESS:
if self.context.setLogLevel(self.qaicrt.QLogLevel.QL_DEBUG) != self.qaicrt.QStatus.QS_SUCCESS:
raise RuntimeError("Failed to setLogLevel")
qpc = qaicrt.Qpc(str(qpc_path))

qpc = self.qaicrt.Qpc(str(qpc_path))

# Load IO Descriptor
iodesc = aicapi.IoDesc()
iodesc = self.aicapi.IoDesc()
status, iodesc_data = qpc.getIoDescriptor()
if status != qaicrt.QStatus.QS_SUCCESS:
if status != self.qaicrt.QStatus.QS_SUCCESS:
raise RuntimeError("Failed to getIoDescriptor")
iodesc.ParseFromString(bytes(iodesc_data))

self.allowed_shapes = [
[(aic_to_np_dtype_mapping[x.type].itemsize, list(x.dims)) for x in allowed_shape.shapes]
[(self.aic_to_np_dtype_mapping[x.type].itemsize, list(x.dims)) for x in allowed_shape.shapes]
for allowed_shape in iodesc.allowed_shapes
]
self.bindings = iodesc.selected_set.bindings
self.binding_index_map = {binding.name: binding.index for binding in self.bindings}

# Create and load Program
prog_properties = qaicrt.QAicProgramProperties()
prog_properties = self.qaicrt.QAicProgramProperties()
prog_properties.SubmitRetryTimeoutMs = 60_000
if device_ids and len(device_ids) > 1:
prog_properties.devMapping = ":".join(map(str, device_ids))
self.program = qaicrt.Program(self.context, None, qpc, prog_properties)
if self.program.load() != qaicrt.QStatus.QS_SUCCESS:

self.program = self.qaicrt.Program(self.context, None, qpc, prog_properties)
if self.program.load() != self.qaicrt.QStatus.QS_SUCCESS:
raise RuntimeError("Failed to load program")

if activate:
self.activate()

# Create input qbuffers and buf_dims
self.qbuffers = [qaicrt.QBuffer(bytes(binding.size)) for binding in self.bindings]
self.buf_dims = qaicrt.BufferDimensionsVecRef(
[(aic_to_np_dtype_mapping[binding.type].itemsize, list(binding.dims)) for binding in self.bindings]
self.qbuffers = [self.qaicrt.QBuffer(bytes(binding.size)) for binding in self.bindings]
self.buf_dims = self.qaicrt.BufferDimensionsVecRef(
[(self.aic_to_np_dtype_mapping[binding.type].itemsize, list(binding.dims)) for binding in self.bindings]
)

@property
def input_names(self) -> List[str]:
"""Names of the bindings in ``self.bindings`` whose ``dir`` is ``BUFFER_IO_TYPE_INPUT``, in binding order."""
return [binding.name for binding in self.bindings if binding.dir == aicapi.BUFFER_IO_TYPE_INPUT]
return [binding.name for binding in self.bindings if binding.dir == self.aicapi.BUFFER_IO_TYPE_INPUT]

@property
def output_names(self) -> List[str]:
"""Names of the bindings in ``self.bindings`` whose ``dir`` is ``BUFFER_IO_TYPE_OUTPUT``, in binding order."""
return [binding.name for binding in self.bindings if binding.dir == aicapi.BUFFER_IO_TYPE_OUTPUT]
return [binding.name for binding in self.bindings if binding.dir == self.aicapi.BUFFER_IO_TYPE_OUTPUT]

def activate(self):
"""Activate ``self.program`` and create the execution object ``self.execObj`` from the context and program."""

self.program.activate()
self.execObj = qaicrt.ExecObj(self.context, self.program)
self.execObj = self.qaicrt.ExecObj(self.context, self.program)

def deactivate(self):
"""Deactivate qpc"""
Expand All @@ -131,7 +142,7 @@ def set_buffers(self, buffers: Dict[str, np.ndarray]):
warn(f'Buffer: "{buffer_name}" not found')
continue
buffer_index = self.binding_index_map[buffer_name]
self.qbuffers[buffer_index] = qaicrt.QBuffer(buffer.tobytes())
self.qbuffers[buffer_index] = self.qaicrt.QBuffer(buffer.tobytes())
self.buf_dims[buffer_index] = (
buffer.itemsize,
buffer.shape if len(buffer.shape) > 0 else (1,),
Expand All @@ -157,21 +168,19 @@ def run(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
Return:
:Dict[str, np.ndarray]:
"""
# Set inputs

self.set_buffers(inputs)
if self.execObj.setData(self.qbuffers, self.buf_dims) != qaicrt.QStatus.QS_SUCCESS:
if self.execObj.setData(self.qbuffers, self.buf_dims) != self.qaicrt.QStatus.QS_SUCCESS:
raise MemoryError("Failed to setData")
# # Run with sync API
# if self.execObj.run(self.qbuffers) != qaicrt.QStatus.QS_SUCCESS:
# Run with async API
if self.queue.enqueue(self.execObj) != qaicrt.QStatus.QS_SUCCESS:

if self.queue.enqueue(self.execObj) != self.qaicrt.QStatus.QS_SUCCESS:
raise MemoryError("Failed to enqueue")
if self.execObj.waitForCompletion() != qaicrt.QStatus.QS_SUCCESS:

if self.execObj.waitForCompletion() != self.qaicrt.QStatus.QS_SUCCESS:
error_message = "Failed to run"
# Print additional error messages for unmatched dimension error

if self.allowed_shapes:
error_message += "\n\n"
error_message += '(Only if "No matching dimension found" error is present above)'
error_message += "\n\n(Only if 'No matching dimension found' error is present above)"
error_message += "\nAllowed shapes:"
for i, allowed_shape in enumerate(self.allowed_shapes):
error_message += f"\n{i}\n"
Expand All @@ -189,18 +198,18 @@ def run(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
continue
error_message += f"{binding.name}:\t{elemsize}\t{shape}\n"
raise ValueError(error_message)
# Get output buffers

status, output_qbuffers = self.execObj.getData()
if status != qaicrt.QStatus.QS_SUCCESS:
if status != self.qaicrt.QStatus.QS_SUCCESS:
raise MemoryError("Failed to getData")
# Build output

outputs = {}
for output_name in self.output_names:
buffer_index = self.binding_index_map[output_name]
if self.qbuffers[buffer_index].size == 0:
continue
outputs[output_name] = np.frombuffer(
bytes(output_qbuffers[buffer_index]),
aic_to_np_dtype_mapping[self.bindings[buffer_index].type],
self.aic_to_np_dtype_mapping[self.bindings[buffer_index].type],
).reshape(self.buf_dims[buffer_index][1])
return outputs
Loading