-
Notifications
You must be signed in to change notification settings - Fork 2.7k
[Snippets][CPU] Support static and dynamic offsets in JIT Gemm and GemmCopyB emitters #31375
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
aobolensk
merged 18 commits into
openvinotoolkit:master
from
aobolensk:snippets-arm-matmul-offsets
Jul 31, 2025
Merged
Changes from all commits
Commits
Show all changes
18 commits
Select commit
Hold shift + click to select a range
c1ddb94
[Snippets][CPU] Support static and dynamic offsets in JIT Gemm and Ge…
aobolensk 36c75d7
Address review comments
aobolensk 7dc7b92
gpr length
aobolensk 8031b12
Remove redundant comments
aobolensk 343629e
Revert "gpr length"
aobolensk 2d3d563
Move utils
aobolensk 6ce8366
Address comments
aobolensk 9cef4e0
fmt & tidy
aobolensk f35a155
tidy 2
aobolensk 6aac927
x64 tidy
aobolensk 71c987e
Address utils related review comments
aobolensk 35bc548
Merge remote-tracking branch 'origin/master' into snippets-arm-matmul…
aobolensk 22d8db2
Address Vladislav's comments
aobolensk 30d1211
Merge branch 'master' into snippets-arm-matmul-offsets
aobolensk aa908ec
Address Alexandra's comments
aobolensk 0fa72c8
Fix Alexandra's follow-up comments
aobolensk 3c8325b
Merge remote-tracking branch 'origin/master' into snippets-arm-matmul…
aobolensk d9006b8
comment + tidy
aobolensk File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
chenhu-wang marked this conversation as resolved.
Show resolved
Hide resolved
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
181 changes: 181 additions & 0 deletions
181
src/plugins/intel_cpu/src/emitters/snippets/aarch64/utils.cpp
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,181 @@ | ||
// Copyright (C) 2025 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#include "utils.hpp" | ||
|
||
#include <xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_adr.h> | ||
#include <xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_reg.h> | ||
|
||
#include <cpu/aarch64/jit_generator.hpp> | ||
#include <cstddef> | ||
#include <cstdint> | ||
#include <set> | ||
#include <unordered_set> | ||
#include <vector> | ||
|
||
#include "emitters/snippets/jit_snippets_call_args.hpp" | ||
#include "emitters/utils.hpp" | ||
#include "openvino/core/except.hpp" | ||
#include "snippets/emitter.hpp" | ||
#include "snippets/utils/utils.hpp" | ||
#include "utils/general_utils.h" | ||
|
||
using namespace dnnl::impl::cpu::aarch64; | ||
|
||
namespace ov::intel_cpu::aarch64::utils { | ||
|
||
std::vector<Xbyak_aarch64::XReg> get_aux_gprs(const std::vector<size_t>& used_gpr_idxs, size_t count) { | ||
// X0 and X1 - runtime parameter registers in the kernel | ||
// X18 - platform register should not be used | ||
// SP - stack pointer should be preserved | ||
static const std::unordered_set<size_t> blacklist_gpr_idxs = { | ||
0, // abi_param1 (X0) | ||
1, // abi_param2 (X1) | ||
18, // Platform register (X18) | ||
31, // Stack pointer (SP) | ||
}; | ||
|
||
OPENVINO_ASSERT(count <= 32 - blacklist_gpr_idxs.size(), | ||
"Cannot allocate more than ", | ||
32 - blacklist_gpr_idxs.size(), | ||
" auxiliary registers"); | ||
|
||
// Convert used_gpr_idxs to unordered_set for O(1) lookups | ||
const std::unordered_set<size_t> used_set(used_gpr_idxs.begin(), used_gpr_idxs.end()); | ||
|
||
std::vector<Xbyak_aarch64::XReg> aux_regs; | ||
aux_regs.reserve(count); | ||
|
||
// Iterate from X30 down to X0 (allocate from the end) | ||
for (size_t idx = 30; idx != SIZE_MAX; --idx) { | ||
if (used_set.count(idx) || blacklist_gpr_idxs.count(idx)) { | ||
continue; | ||
} | ||
aux_regs.emplace_back(idx); | ||
if (aux_regs.size() == count) { | ||
break; | ||
} | ||
} | ||
|
||
OPENVINO_ASSERT(aux_regs.size() == count, "Expected ", count, " auxiliary registers, but got ", aux_regs.size()); | ||
return aux_regs; | ||
} | ||
|
||
/**
 * @brief Convenience wrapper around get_aux_gprs() for the single-register case.
 *
 * @param used_gpr_idxs indices of registers that must not be returned
 * @return the first (and only) register produced by get_aux_gprs(used_gpr_idxs, 1)
 */
Xbyak_aarch64::XReg get_aux_gpr(const std::vector<size_t>& used_gpr_idxs) {
    const auto single_reg = get_aux_gprs(used_gpr_idxs, 1);
    return single_reg.front();
}
|
||
/**
 * @brief Returns the auxiliary register to use, taking it from `aux_gpr_idxs` when one is provided.
 *
 * When `aux_gpr_idxs` is empty, a register is obtained through get_aux_gpr() and recorded in
 * `regs_to_spill`; otherwise the first listed index is used and `regs_to_spill` is left untouched.
 *
 * @param used_gpr_reg_idxs indices that must not be chosen when a register has to be obtained
 * @param aux_gpr_idxs already available auxiliary register indices (only the first one is read)
 * @param regs_to_spill receives the newly obtained register as a (gpr, index) entry
 * @return the selected register
 */
Xbyak_aarch64::XReg init_memory_access_aux_gpr(const std::vector<size_t>& used_gpr_reg_idxs,
                                               const std::vector<size_t>& aux_gpr_idxs,
                                               std::set<snippets::Reg>& regs_to_spill) {
    if (aux_gpr_idxs.empty()) {
        // Nothing was provided by the caller: pick a free register and record it in regs_to_spill.
        const auto picked_reg = get_aux_gpr(used_gpr_reg_idxs);
        regs_to_spill.emplace(snippets::RegType::gpr, picked_reg.getIdx());
        return picked_reg;
    }

    const auto provided_idx = static_cast<int>(aux_gpr_idxs.front());
    return Xbyak_aarch64::XReg(provided_idx);
}
|
||
/**
 * @brief Emits code that stores `ptr_reg + <value read at X0 + runtime_offset>` at `[sp + stack_offset]`.
 *
 * Emitted sequence: copy `ptr_reg` into aux_regs[0], compute `X0 + runtime_offset` into aux_regs[1]
 * (aux_regs[2] is handed to add_imm as its helper register), load the value at that address,
 * add it to the copied pointer and store the sum on the stack. `ptr_reg` itself is not written.
 *
 * @param h generator used to emit the instructions
 * @param stack_offset offset from sp at which the adjusted pointer is stored
 * @param ptr_reg register holding the base pointer; must not be any of `aux_regs`
 * @param aux_regs scratch registers; at least 3 are required and the first 3 are overwritten
 * @param runtime_offset offset from X0 of the location holding the value to add
 */
void push_ptr_with_runtime_offset_on_stack(dnnl::impl::cpu::aarch64::jit_generator* h,
                                           int32_t stack_offset,
                                           const Xbyak_aarch64::XReg& ptr_reg,
                                           const std::vector<Xbyak_aarch64::XReg>& aux_regs,
                                           size_t runtime_offset) {
    OV_CPU_JIT_EMITTER_ASSERT(aux_regs.size() >= 3, "aux_regs must contain at least 3 registers");

    // The base pointer must stay intact, so it may not alias any scratch register.
    for (size_t k = 0; k < aux_regs.size(); ++k) {
        OV_CPU_JIT_EMITTER_ASSERT(aux_regs[k].getIdx() != ptr_reg.getIdx(), "ptr_reg must not be in aux_regs");
    }

    const Xbyak_aarch64::XReg& adjusted_ptr = aux_regs[0];   // accumulates base pointer + offset
    const Xbyak_aarch64::XReg& offset_value = aux_regs[1];   // address of the offset, then the offset itself
    const Xbyak_aarch64::XReg& add_imm_helper = aux_regs[2]; // helper register passed to add_imm

    // Runtime parameters are addressed relative to X0 (abi_param1).
    const Xbyak_aarch64::XReg runtime_args(0);

    h->mov(adjusted_ptr, ptr_reg);

    // offset_value = *(X0 + runtime_offset)
    h->add_imm(offset_value, runtime_args, runtime_offset, add_imm_helper);
    h->ldr(offset_value, Xbyak_aarch64::ptr(offset_value));

    h->add(adjusted_ptr, adjusted_ptr, offset_value);

    // Place the adjusted pointer into its stack slot.
    h->str(adjusted_ptr, Xbyak_aarch64::ptr(h->sp, stack_offset));
}
|
||
/**
 * @brief Emits code that stores `ptr_reg + ptr_offset` at `[sp + stack_offset]`.
 *
 * With a zero offset `ptr_reg` is stored directly and `aux_regs` is neither validated nor used.
 * Otherwise the sum is computed into aux_regs[0] (aux_regs[1] is handed to add_imm as its helper
 * register) and that register is stored. `ptr_reg` itself is not written.
 *
 * @param h generator used to emit the instructions
 * @param stack_offset offset from sp at which the pointer is stored
 * @param ptr_reg register holding the base pointer; must not be any of `aux_regs` when the offset is non-zero
 * @param aux_regs scratch registers; at least 2 are required when the offset is non-zero
 * @param ptr_offset compile-time constant added to the pointer
 */
void push_ptr_with_static_offset_on_stack(dnnl::impl::cpu::aarch64::jit_generator* h,
                                          int32_t stack_offset,
                                          const Xbyak_aarch64::XReg& ptr_reg,
                                          const std::vector<Xbyak_aarch64::XReg>& aux_regs,
                                          size_t ptr_offset) {
    const auto stack_slot = Xbyak_aarch64::ptr(h->sp, stack_offset);

    if (ptr_offset != 0) {
        OV_CPU_JIT_EMITTER_ASSERT(aux_regs.size() >= 2, "aux_regs must contain at least 2 registers");

        // The base pointer must stay intact, so it may not alias any scratch register.
        for (size_t k = 0; k < aux_regs.size(); ++k) {
            OV_CPU_JIT_EMITTER_ASSERT(aux_regs[k].getIdx() != ptr_reg.getIdx(), "ptr_reg must not be in aux_regs");
        }

        const Xbyak_aarch64::XReg& shifted_ptr = aux_regs[0];     // receives ptr_reg + ptr_offset
        const Xbyak_aarch64::XReg& add_imm_helper = aux_regs[1];  // helper register passed to add_imm

        h->add_imm(shifted_ptr, ptr_reg, ptr_offset, add_imm_helper);
        h->str(shifted_ptr, stack_slot);
        return;
    }

    // No offset to apply: the pointer goes to the stack unchanged.
    h->str(ptr_reg, stack_slot);
}
|
||
/**
 * @brief Emits code that applies a static or runtime offset to each pointer and loads the results
 *        into `load_regs`, using a temporary stack area as intermediate storage.
 *
 * Emitted sequence: reserve `mem_ptrs.size() * 8` bytes (rounded up to 16) on the stack, store each
 * adjusted pointer into its 8-byte slot, load the first `min(load_regs.size(), mem_ptrs.size())`
 * slots into `load_regs`, then release the stack area.
 *
 * For entry `i`, a dynamic `memory_offsets[i]` makes the offset be read at runtime from
 * `GET_OFF(buffer_offsets) + buffer_ids[i] * sizeof(size_t)` relative to the runtime-arguments
 * register; otherwise `memory_offsets[i]` is added as a compile-time constant.
 *
 * @param h generator used to emit the instructions
 * @param mem_ptrs registers holding the base pointers
 * @param memory_offsets per-pointer offsets; must provide an entry for every pointer
 * @param buffer_ids per-pointer buffer ids; an entry is required for every pointer with a dynamic offset
 * @param aux_regs scratch registers forwarded to the push helpers (they are overwritten)
 * @param load_regs destination registers for the adjusted pointers
 */
void push_and_load_ptrs_with_offsets(dnnl::impl::cpu::aarch64::jit_generator* h,
                                     const std::vector<Xbyak_aarch64::XReg>& mem_ptrs,
                                     const std::vector<size_t>& memory_offsets,
                                     const std::vector<size_t>& buffer_ids,
                                     const std::vector<Xbyak_aarch64::XReg>& aux_regs,
                                     const std::vector<Xbyak_aarch64::XReg>& load_regs) {
    const size_t gpr_length = 8;     // 64-bit register length
    const size_t sp_alignment = 16;  // AArch64 stack alignment requirement

    // Every pointer is indexed into memory_offsets below; reject short vectors up front
    // instead of reading out of bounds.
    OV_CPU_JIT_EMITTER_ASSERT(memory_offsets.size() >= mem_ptrs.size(),
                              "memory_offsets must contain an entry for each memory pointer");

    // Allocate stack space for all pointers
    const auto sp_size = rnd_up(mem_ptrs.size() * gpr_length, sp_alignment);
    h->sub(h->sp, h->sp, sp_size);

    // Push all pointers with offsets onto stack
    for (size_t i = 0; i < mem_ptrs.size(); i++) {
        const auto& ptr_reg = mem_ptrs[i];
        const auto stack_offset = static_cast<int32_t>(i * gpr_length);

        if (ov::snippets::utils::is_dynamic_value(memory_offsets[i])) {
            // Dynamic offset: read from runtime parameters
            OV_CPU_JIT_EMITTER_ASSERT(i < buffer_ids.size(),
                                      "buffer_ids must contain an entry for each dynamic memory offset");
            const size_t runtime_offset = GET_OFF(buffer_offsets) + buffer_ids[i] * sizeof(size_t);
            push_ptr_with_runtime_offset_on_stack(h, stack_offset, ptr_reg, aux_regs, runtime_offset);
        } else {
            // Static offset: add compile-time constant
            push_ptr_with_static_offset_on_stack(h, stack_offset, ptr_reg, aux_regs, memory_offsets[i]);
        }
    }

    // Load back the adjusted pointers to specified registers
    for (size_t i = 0; i < load_regs.size() && i < mem_ptrs.size(); i++) {
        h->ldr(load_regs[i], Xbyak_aarch64::ptr(h->sp, static_cast<int32_t>(i * gpr_length)));
    }

    // Restore stack pointer
    h->add(h->sp, h->sp, sp_size);
}
|
||
} // namespace ov::intel_cpu::aarch64::utils |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.