21 commits
203605d Add support for AArch64 JIT-based 3D Deconvolution and Convolution Ex… (allnes, Oct 16, 2025)
f77cee0 Optimize AArch64 JIT Conv3D executor by adding vector fast path for `… (allnes, Oct 16, 2025)
3248751 Add FP32 support to AArch64 JIT-based 3D Deconvolution and Convolutio… (allnes, Oct 16, 2025)
b9cee94 Refactor AArch64 JIT 3D Deconvolution and Convolution Executors to us… (allnes, Oct 19, 2025)
6e7c413 Refactor AArch64 JIT 3D Deconvolution Executor for improved readabili… (allnes, Oct 19, 2025)
b4da434 Refactor AArch64 JIT 3D Deconvolution and Convolution Executors for i… (allnes, Oct 19, 2025)
787e3f0 Add early weight preparation and alternative packing for S=2 in AArch… (allnes, Oct 21, 2025)
09483f1 Refactor AArch64 JIT 3D Deconvolution Executor by introducing `pack_i… (allnes, Oct 21, 2025)
61a4af7 Remove unused environment variables, redundant code paths, and obsole… (allnes, Oct 22, 2025)
c8d4547 Remove unused helper functions, redundant conditions, and raw weight … (allnes, Oct 22, 2025)
49ef0c7 Introduce early weight preparation in AArch64 JIT 3D Convolution and … (allnes, Oct 22, 2025)
ca892ff Remove unused fast paths, redundant logic, and obsolete conditions in… (allnes, Oct 22, 2025)
3653f29 Simplify and clean up AArch64 JIT 3D Convolution and Deconvolution Ex… (allnes, Oct 22, 2025)
f822a7b Refactor AArch64 JIT 3D Convolution and Deconvolution Executors by co… (allnes, Oct 22, 2025)
d8ed2c3 Refactor AArch64 JIT 3D Convolution Executor by consolidating repetit… (allnes, Oct 22, 2025)
7e0d960 Remove JitConv3DExecutorF32 implementation, associated helper functio… (allnes, Oct 22, 2025)
7b7588d Remove JitConv3DKernelF32 implementation, associated helper functions… (allnes, Oct 23, 2025)
99d361f Refactor AArch64 JIT 3D Convolution and Deconvolution Executors by in… (allnes, Oct 23, 2025)
c69e4a1 Refactor AArch64 JIT 3D Convolution Executor by introducing reusable … (allnes, Oct 23, 2025)
1cba4b1 Refactor Deconvolution node by replacing `execPtrDeconvACL` with fact… (allnes, Oct 23, 2025)
7e6fa47 Merge branch 'master' into an/unet-aarch64-jit-opt (allnes, Oct 27, 2025)
110 changes: 90 additions & 20 deletions src/plugins/intel_cpu/src/nodes/deconv.cpp
@@ -39,6 +39,10 @@
#include "nodes/common/blocked_desc_creator.h"
#include "nodes/common/dnnl_executor.h"
#include "nodes/executors/deconv_list.hpp"
#include "utils/arch_macros.h"
#if defined(OPENVINO_ARCH_ARM64)
# include "nodes/executors/aarch64/jit_deconv3d.hpp"
#endif
#include "nodes/executors/executor.hpp"
#include "nodes/node_config.h"
#include "onednn/dnnl.h"
@@ -634,8 +638,8 @@ void Deconvolution::getSupportedDescriptors() {

return AclDeconvExecutorBuilder::customIsSupported(deconvAttrs, srcMemoryDescs, dstMemoryDescs);
};
useACL = checkDesc(LayoutType::nspc) || checkDesc(LayoutType::ncsp);
if (useACL) {

if (checkDesc(LayoutType::nspc) || checkDesc(LayoutType::ncsp)) {
return;
}
#endif
@@ -788,22 +792,18 @@ VectorDims Deconvolution::shapeInferInternal(const VectorDims& inDims, std::vect
}

void Deconvolution::execute(const dnnl::stream& strm) {
if (useACL) {
if (execPtrFactory) {
std::vector<MemoryCPtr> srcMemory;
for (size_t i = 0; i < getOriginalInputsNumber(); i++) {
for (size_t i = 0; i < getOriginalInputsNumber(); i++)
srcMemory.push_back(getSrcMemoryAtPort(i));
}
std::vector<MemoryPtr> dstMemory;
for (size_t i = 0; i < getOriginalOutputsNumber(); i++) {
for (size_t i = 0; i < getOriginalOutputsNumber(); i++)
dstMemory.push_back(getDstMemoryAtPort(i));
}
// TODO: need to pass post ops data
execPtrDeconvACL->exec(srcMemory, dstMemory, nullptr);
execPtrFactory->exec(srcMemory, dstMemory, nullptr);
return;
}

CPU_NODE_ASSERT(execPtr, "executor is not compiled");

execPtr->exec(primArgs, strm);

if (externOutShape) {
@@ -965,7 +965,9 @@ void Deconvolution::prepareParams() {
auto* selected_pd = getSelectedPrimitiveDescriptor();
CPU_NODE_ASSERT(selected_pd, "Preferable primitive descriptor is not set.");

if (useACL) {
// Minimal integration: always try factory path (ACL/JIT) with early-packing ctor;
// fall back to oneDNN path if factory does not provide an executor.
{
if (isDynamicNode()) {
initPaddingR(getParentEdgeAt(0)->getMemory().getDescPtr()->getShape(),
getChildEdgeAt(0)->getMemory().getDescPtr()->getShape());
@@ -979,12 +981,24 @@
dstMemoryDescs.push_back(getChildEdgeAt(i)->getMemory().getDescWithType<DnnlMemoryDesc>());
}

execPtrDeconvACL = selected_pd->getExecutorFactoryAs<DeconvExecutorFactory>()->makeExecutor(deconvAttrs,
srcMemoryDescs,
dstMemoryDescs,
*attr);
selected_pd->setImplementationType(execPtrDeconvACL->getImplType());
return;
std::vector<MemoryCPtr> srcMemoriesEarly;
for (size_t i = 0; i < getOriginalInputsNumber(); i++) {
srcMemoriesEarly.push_back(getSrcMemoryAtPort(i));
}

try {
auto factory = selected_pd->getExecutorFactoryAs<DeconvExecutorFactory>();
if (factory) {
auto exec = factory->makeExecutorWithMem(deconvAttrs, srcMemoryDescs, dstMemoryDescs, *attr, srcMemoriesEarly);
if (exec) {
execPtrFactory = exec;
selected_pd->setImplementationType(execPtrFactory->getImplType());
return;
}
}
} catch (...) {
// Fallback to oneDNN path when factory isn't applicable
}
}
auto inMemoryDesc = getParentEdgeAt(0)->getMemory().getDescWithType<DnnlMemoryDesc>();
auto outMemoryDesc = getChildEdgeAt(0)->getMemory().getDescWithType<DnnlMemoryDesc>();
@@ -1296,10 +1310,66 @@ bool Deconvolution::canFuseBias() const {
}

void Deconvolution::initSupportedPrimitiveDescriptors() {
if (!useACL) {
Node::initSupportedPrimitiveDescriptors();
return;
// Prefer AArch64 JIT deconv for 5D FP16/FP32 on ARM64 regardless of ACL
#if defined(OPENVINO_ARCH_ARM64)
{
const auto rank = getInputShapeAtPort(0).getRank();
const bool is5D = (rank == 5);
const bool fp16_ok = getOriginalInputPrecisionAtPort(0) == ov::element::f16 &&
getOriginalInputPrecisionAtPort(1) == ov::element::f16 &&
getOriginalOutputPrecisionAtPort(0) == ov::element::f16;
const bool fp32_ok = getOriginalInputPrecisionAtPort(0) == ov::element::f32 &&
getOriginalInputPrecisionAtPort(1) == ov::element::f32 &&
getOriginalOutputPrecisionAtPort(0) == ov::element::f32;
if (is5D && (fp16_ok || fp32_ok)) {
auto [inDims, outDims] = makeDummyInOutShape();
auto tmpInShape = Shape(inDims);
auto tmpOutShape = Shape(outDims);
initPaddingR(tmpInShape, tmpOutShape);

const auto& creatorsMap = BlockedDescCreator::getCommonCreators();
NodeConfig config;
config.inConfs.resize(getParentEdges().size());
config.outConfs.resize(getOriginalOutputsNumber());

auto setDesc = [&](size_t port, bool isInput) {
const auto prec =
isInput ? getOriginalInputPrecisionAtPort(port) : getOriginalOutputPrecisionAtPort(port);
const auto& shp = isInput ? getInputShapeAtPort(port) : getOutputShapeAtPort(port);
auto d = creatorsMap.at(LayoutType::ncsp)->createSharedDesc(prec, shp);
if (isInput)
config.inConfs[port].setMemDesc(d);
else
config.outConfs[port].setMemDesc(d);
};
setDesc(0, true);
setDesc(1, true);
for (size_t i = 2; i < getParentEdges().size(); ++i)
setDesc(i, true);
setDesc(0, false);

std::vector<MemoryDescPtr> srcMemoryDescs;
srcMemoryDescs.push_back(config.inConfs[0].getMemDesc()->cloneWithNewDims(tmpInShape.getDims()));
for (size_t i = 1; i < config.inConfs.size(); i++)
srcMemoryDescs.push_back(config.inConfs[i].getMemDesc()->clone());
std::vector<MemoryDescPtr> dstMemoryDescs;
dstMemoryDescs.push_back(config.outConfs[0].getMemDesc()->cloneWithNewDims(tmpOutShape.getDims()));
for (size_t i = 1; i < config.outConfs.size(); i++)
dstMemoryDescs.push_back(config.outConfs[i].getMemDesc()->clone());

auto factory =
std::make_shared<DeconvExecutorFactory>(deconvAttrs,
srcMemoryDescs,
dstMemoryDescs,
std::make_shared<ExecutorContext>(context, getImplPriority()));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::jit_asimd, factory);
return;
}
}
#endif

Node::initSupportedPrimitiveDescriptors();
return;

auto [inDims, outDims] = makeDummyInOutShape();
auto tmpInShape = Shape(inDims);
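The initSupportedPrimitiveDescriptors() hunk above gates the JIT path on tensor rank and uniform precision. A minimal sketch of that eligibility check as a standalone predicate (the helper name is hypothetical; the conditions mirror the diff):

#include "openvino/core/type/element_type.hpp"

// Hypothetical helper, not part of the PR: restates the 5-D FP16/FP32
// gating applied before the JIT deconv factory is built.
static bool isJitDeconv3DEligible(ov::element::Type src,
                                  ov::element::Type wei,
                                  ov::element::Type dst,
                                  size_t rank) {
    const bool fp16_ok = src == ov::element::f16 && wei == ov::element::f16 && dst == ov::element::f16;
    const bool fp32_ok = src == ov::element::f32 && wei == ov::element::f32 && dst == ov::element::f32;
    return rank == 5 && (fp16_ok || fp32_ok);  // rank 5 == NCDHW, i.e. 3 spatial dims
}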
4 changes: 2 additions & 2 deletions src/plugins/intel_cpu/src/nodes/deconv.h
@@ -73,7 +73,8 @@ class Deconvolution : public Node {
AttrPtr initPrimitiveAttr() override;
AttrPtr makePrimitiveAttr(const VectorDims& dims);
std::vector<dnnl::memory::format_tag> getAvailableFormatsForDims(const Shape& dims) const override;
std::shared_ptr<DeconvExecutor> execPtrDeconvACL = nullptr;
// Factory-based executor (JIT/ACL), created via DeconvExecutorFactory
std::shared_ptr<DeconvExecutor> execPtrFactory = nullptr;

private:
using executorPtr = std::shared_ptr<DnnlExecutorLegacy>;
@@ -101,7 +102,6 @@ class Deconvolution : public Node {
VectorDims dnnlCompatibleWeiDims;
VectorDims expectedBiasDims;

bool useACL = false;
DeconvAttrs deconvAttrs;

Shape inShape, outShape;
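Only the call site of makeExecutorWithMem() appears in this diff. A hedged sketch of the factory-side declaration it implies (parameter names and the exact attr type are assumptions inferred from the prepareParams() hunk):

// Assumed declaration, inferred from the call in prepareParams(); srcMemories
// carries the live input tensors (weights at port 1) so an executor can pack
// weights once at construction instead of on first exec().
class DeconvExecutorFactory {
public:
    std::shared_ptr<DeconvExecutor> makeExecutorWithMem(const DeconvAttrs& deconvAttrs,
                                                        const std::vector<MemoryDescPtr>& srcDescs,
                                                        const std::vector<MemoryDescPtr>& dstDescs,
                                                        const dnnl::primitive_attr& attr,
                                                        const std::vector<MemoryCPtr>& srcMemories);
};

This is what the commit series calls "early weight preparation": the returned executor is cached in execPtrFactory, so weights are repacked once rather than on every inference.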