diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.hpp index f8b0026213eb4a..dd77a14388d67e 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.hpp @@ -70,6 +70,8 @@ class jit_emitter : public ov::snippets::Emitter { static std::set> get_supported_precisions( const std::shared_ptr& node = nullptr); + static constexpr int sp_alignment = 16; + protected: static size_t get_max_vecs_count(); static int32_t get_vec_length(); @@ -155,6 +157,10 @@ class jit_emitter : public ov::snippets::Emitter { } } + int32_t get_gpr_length() const { + return h->x0.getBit() / 8; + } + private: mutable std::vector preserved_vec_idxs; mutable std::vector preserved_gpr_idxs; @@ -179,10 +185,6 @@ class jit_emitter : public ov::snippets::Emitter { return 32; } - int32_t get_gpr_length() const { - return h->x0.getBit() / 8; - } - void store_context(const std::vector& gpr_regs, const std::vector& vec_regs, const std::unordered_set& ignore_vec_regs = {}) const; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_copy_b_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_copy_b_emitter.cpp index 3e5eecfbeee61d..c722d5e9bd34b8 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_copy_b_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_copy_b_emitter.cpp @@ -17,6 +17,9 @@ #include #include "emitters/snippets/aarch64/kernel_executors/gemm_copy_b.hpp" +#include "emitters/snippets/aarch64/utils.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" +#include "emitters/snippets/utils/utils.hpp" #include "emitters/utils.hpp" #include "openvino/core/node.hpp" #include "openvino/core/type.hpp" @@ -53,6 +56,13 @@ jit_gemm_copy_b_emitter::jit_gemm_copy_b_emitter(jit_generator* h, OV_CPU_JIT_EMITTER_ASSERT(n_blk_size > 0, 
"n_blk_size of gemm_repack is expected to be greater than 0."); GemmCopyBKernelKaiConfig kernel_config(n_blk_size); m_kernel_executor = kernel_table->register_kernel(expr, kernel_config); + + // Initialize memory offsets similar to x64 brgemm_copy_b implementation + m_memory_offsets = {gemm_repack->get_offset_in(), gemm_repack->get_offset_out()}; + + // Initialize buffer IDs using the utils function + m_buffer_ids = {ov::intel_cpu::utils::get_buffer_cluster_id(expr->get_input_port(0)), + ov::intel_cpu::utils::get_buffer_cluster_id(expr->get_output_port(0))}; } std::set> jit_gemm_copy_b_emitter::get_supported_precisions( @@ -64,6 +74,8 @@ std::set> jit_gemm_copy_b_emitter::get_supported_prec void jit_gemm_copy_b_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(in.size() == 1, "Expects 1 input reg, got", in.size()); OV_CPU_JIT_EMITTER_ASSERT(out.size() == 1, "Expects 1 output reg, got", out.size()); + OV_CPU_JIT_EMITTER_ASSERT(m_memory_offsets.size() == 2, "Expected 2 memory offsets for input and output"); + OV_CPU_JIT_EMITTER_ASSERT(m_buffer_ids.size() == 2, "Expected 2 buffer IDs for input and output"); } void jit_gemm_copy_b_emitter::emit_impl(const std::vector& in, const std::vector& out) const { @@ -75,10 +87,17 @@ void jit_gemm_copy_b_emitter::emit_impl(const std::vector& in, const std Xbyak_aarch64::XReg x0(0); Xbyak_aarch64::XReg x1(1); Xbyak_aarch64::XReg x2(2); - h->str(Xbyak_aarch64::XReg(in[0]), pre_ptr(h->sp, -get_vec_length())); - h->str(Xbyak_aarch64::XReg(out[0]), pre_ptr(h->sp, -get_vec_length())); - h->ldr(x2, post_ptr(h->sp, get_vec_length())); - h->ldr(x1, post_ptr(h->sp, get_vec_length())); + Xbyak_aarch64::XReg aux_reg(3); + + // Prepare memory pointers with offsets + std::vector mem_ptrs_idxs{in[0], out[0]}; + const auto& mem_ptrs = utils::transform_idxs_to_regs(mem_ptrs_idxs); + + // Apply memory offsets and load adjusted pointers + std::vector load_regs{x1, x2}; + 
utils::push_and_load_ptrs_with_offsets(h, mem_ptrs, m_memory_offsets, m_buffer_ids, aux_reg, load_regs); + + // Set up executor pointer as first argument const auto& compiled_kernel = get_compiled_kernel_ptr(); h->mov(x0, compiled_kernel); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_copy_b_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_copy_b_emitter.hpp index c8d3e95f38808e..7a08c4c8fa9566 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_copy_b_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_copy_b_emitter.hpp @@ -30,6 +30,8 @@ class jit_gemm_copy_b_emitter : public jit_emitter { const uintptr_t get_compiled_kernel_ptr() const; std::shared_ptr m_kernel_executor = nullptr; + std::vector m_memory_offsets; + std::vector m_buffer_ids; }; } // namespace ov::intel_cpu::aarch64 \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_emitter.cpp index f7e017c7b6d35e..11eedca8a56181 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_emitter.cpp @@ -17,11 +17,16 @@ #include #include "emitters/snippets/aarch64/kernel_executors/gemm.hpp" +#include "emitters/snippets/aarch64/utils.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" +#include "emitters/snippets/utils/utils.hpp" #include "emitters/utils.hpp" #include "openvino/core/node.hpp" #include "openvino/core/type/element_type.hpp" #include "snippets/kernel_executor_table.hpp" #include "snippets/lowered/expression.hpp" +#include "snippets/utils/utils.hpp" +#include "transformations/snippets/aarch64/op/gemm_cpu.hpp" using namespace Xbyak_aarch64; @@ -39,6 +44,17 @@ jit_gemm_emitter::jit_gemm_emitter(jit_generator* h, in_out_type_ = emitter_in_out_map::gpr_to_gpr; GemmKernelKaiConfig 
kernel_config; m_kernel_executor_kai = kernel_table->register_kernel(expr, kernel_config); + + const auto gemm_node = as_type_ptr(expr->get_node()); + OV_CPU_JIT_EMITTER_ASSERT(gemm_node, "Expected GemmCPU node"); + + // Initialize memory offsets similar to x64 brgemm implementation + m_memory_offsets = {gemm_node->get_offset_a(), gemm_node->get_offset_b(), gemm_node->get_offset_c()}; + + // Initialize buffer IDs using the utils function + m_buffer_ids = {ov::intel_cpu::utils::get_buffer_cluster_id(expr->get_input_port(0)), + ov::intel_cpu::utils::get_buffer_cluster_id(expr->get_input_port(1)), + ov::intel_cpu::utils::get_buffer_cluster_id(expr->get_output_port(0))}; } std::set> jit_gemm_emitter::get_supported_precisions( @@ -50,6 +66,8 @@ std::set> jit_gemm_emitter::get_supported_precisions( void jit_gemm_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(in.size() == 2, "Expects 2 input regs, got", in.size()); OV_CPU_JIT_EMITTER_ASSERT(out.size() == 1, "Expects 1 output reg, got", out.size()); + OV_CPU_JIT_EMITTER_ASSERT(m_memory_offsets.size() == 3, "Expected 3 memory offsets for A, B, C"); + OV_CPU_JIT_EMITTER_ASSERT(m_buffer_ids.size() == 3, "Expected 3 buffer IDs for A, B, C"); } void jit_gemm_emitter::emit_impl(const std::vector& in, const std::vector& out) const { @@ -62,12 +80,17 @@ void jit_gemm_emitter::emit_impl(const std::vector& in, const std::vecto Xbyak_aarch64::XReg x1(1); Xbyak_aarch64::XReg x2(2); Xbyak_aarch64::XReg x3(3); - h->str(Xbyak_aarch64::XReg(in[0]), pre_ptr(h->sp, -get_vec_length())); - h->str(Xbyak_aarch64::XReg(in[1]), pre_ptr(h->sp, -get_vec_length())); - h->str(Xbyak_aarch64::XReg(out[0]), pre_ptr(h->sp, -get_vec_length())); - h->ldr(x3, post_ptr(h->sp, get_vec_length())); - h->ldr(x2, post_ptr(h->sp, get_vec_length())); - h->ldr(x1, post_ptr(h->sp, get_vec_length())); + Xbyak_aarch64::XReg aux_reg(5); + + // Prepare memory pointers with offsets + std::vector 
mem_ptrs_idxs{in[0], in[1], out[0]}; + const auto& mem_ptrs = utils::transform_idxs_to_regs(mem_ptrs_idxs); + + // Apply memory offsets and load adjusted pointers + std::vector load_regs{x1, x2, x3}; + utils::push_and_load_ptrs_with_offsets(h, mem_ptrs, m_memory_offsets, m_buffer_ids, aux_reg, load_regs); + + // Set up executor pointer as first argument const auto& compiled_kernel = get_compiled_kernel_ptr(); h->mov(x0, compiled_kernel); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_emitter.hpp index a51734c2ac79f8..9321024d507b30 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_emitter.hpp @@ -32,6 +32,8 @@ class jit_gemm_emitter : public jit_emitter { const uintptr_t get_compiled_kernel_ptr() const; std::shared_ptr m_kernel_executor_kai = nullptr; + std::vector m_memory_offsets; + std::vector m_buffer_ids; }; } // namespace ov::intel_cpu::aarch64
diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/utils.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/utils.cpp
new file mode 100644
index 00000000000000..cd6c161b481c36
--- /dev/null
+++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/utils.cpp
@@ -0,0 +1,162 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "utils.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "emitters/snippets/jit_snippets_call_args.hpp"
+#include "emitters/utils.hpp"
+#include "openvino/core/except.hpp"
+#include "openvino/core/type.hpp"
+#include "snippets/emitter.hpp"
+#include "snippets/lowered/expression_port.hpp"
+#include "snippets/lowered/expressions/buffer_expression.hpp"
+#include "snippets/op/loop.hpp"
+#include "snippets/op/memory_access.hpp"
+#include "snippets/utils/utils.hpp"
+
+using namespace dnnl::impl::cpu::aarch64;
+
+namespace ov::intel_cpu::aarch64::utils {
+
+Xbyak_aarch64::XReg get_aux_gpr(const std::vector<size_t>& used_gpr_idxs) {
+    // SP - stack pointer should be preserved, X0 and X1 - runtime parameter registers in the kernel
+    // X18 - platform register should not be used
+    static const std::unordered_set<size_t> blacklist_gpr_idxs = {
+        31,  // Stack pointer (SP)
+        0,   // abi_param1 (X0)
+        1,   // abi_param2 (X1)
+        18   // Platform register (X18)
+    };
+
+    // Iterate through available GPR registers (X0-X30, excluding X31 which is SP)
+    for (size_t gpr_idx = 0; gpr_idx <= 30; ++gpr_idx) {
+        size_t _idx = 30 - gpr_idx;  // we allocate from the end
+        if (std::find(used_gpr_idxs.cbegin(), used_gpr_idxs.cend(), _idx) != used_gpr_idxs.cend()) {
+            continue;
+        }
+        if (blacklist_gpr_idxs.count(_idx) > 0) {
+            continue;
+        }
+        return Xbyak_aarch64::XReg(_idx);
+    }
+    OV_CPU_JIT_EMITTER_THROW("Failed to allocate aux GPR");
+}
+
+Xbyak_aarch64::XReg init_memory_access_aux_gpr(const std::vector<size_t>& used_gpr_reg_idxs,
+                                               const std::vector<size_t>& aux_gpr_idxs,
+                                               std::set<snippets::Reg>& regs_to_spill) {
+    if (!aux_gpr_idxs.empty()) {
+        return Xbyak_aarch64::XReg(static_cast<int>(aux_gpr_idxs[0]));
+    }
+    const auto aux_reg = get_aux_gpr(used_gpr_reg_idxs);
+    regs_to_spill.emplace(snippets::RegType::gpr, aux_reg.getIdx());
+    return aux_reg;
+}
+
+void push_ptr_with_runtime_offset_on_stack(dnnl::impl::cpu::aarch64::jit_generator* h,
+                                           int32_t stack_offset,
+                                           const Xbyak_aarch64::XReg& ptr_reg,
+                                           const Xbyak_aarch64::XReg& aux_reg,
+                                           size_t runtime_offset) {
+    Xbyak_aarch64::XReg abi_param1(0);
+    Xbyak_aarch64::XReg temp_reg(h->X_TMP_0);
+    Xbyak_aarch64::XReg imm_reg(h->X_TMP_1);
+
+    // `aux_reg` has to survive the offset load below, so it must not alias any register that the load
+    // reads (abi_param1) or overwrites (X_TMP_0, X_TMP_1)
+    OV_CPU_JIT_EMITTER_ASSERT(aux_reg.getIdx() != abi_param1.getIdx() && aux_reg.getIdx() != temp_reg.getIdx() &&
+                                  aux_reg.getIdx() != imm_reg.getIdx(),
+                              "aux_reg must differ from abi_param1, X_TMP_0 and X_TMP_1");
+
+    // Copy pointer to aux register
+    h->mov(aux_reg, ptr_reg);
+
+    // Load the offset value from the runtime parameter location: temp_reg = *(abi_param1 + runtime_offset)
+    h->add_imm(temp_reg, abi_param1, runtime_offset, imm_reg);
+    h->ldr(temp_reg, Xbyak_aarch64::ptr(temp_reg));
+
+    // Add the runtime offset to the pointer
+    h->add(aux_reg, aux_reg, temp_reg);
+
+    // Store the adjusted pointer on stack
+    h->str(aux_reg, Xbyak_aarch64::ptr(h->sp, stack_offset));
+}
+
+void push_ptr_with_static_offset_on_stack(dnnl::impl::cpu::aarch64::jit_generator* h,
+                                          int32_t stack_offset,
+                                          const Xbyak_aarch64::XReg& ptr_reg,
+                                          size_t ptr_offset) {
+    // If there's no static offset, just store the pointer
+    if (ptr_offset == 0) {
+        h->str(ptr_reg, Xbyak_aarch64::ptr(h->sp, stack_offset));
+        return;
+    }
+
+    // For non-zero offsets, apply the offset and then store
+    Xbyak_aarch64::XReg temp_reg(h->X_TMP_0);
+    h->add_imm(temp_reg, ptr_reg, ptr_offset, Xbyak_aarch64::XReg(h->X_TMP_1));
+
+    // Store the adjusted pointer on stack
+    h->str(temp_reg, Xbyak_aarch64::ptr(h->sp, stack_offset));
+}
+
+void push_and_load_ptrs_with_offsets(dnnl::impl::cpu::aarch64::jit_generator* h,
+                                     const std::vector<Xbyak_aarch64::XReg>& mem_ptrs,
+                                     const std::vector<size_t>& memory_offsets,
+                                     const std::vector<size_t>& buffer_ids,
+                                     const Xbyak_aarch64::XReg& aux_reg,
+                                     const std::vector<Xbyak_aarch64::XReg>& load_regs) {
+    // Both vectors are indexed with the pointer index below
+    OV_CPU_JIT_EMITTER_ASSERT(memory_offsets.size() == mem_ptrs.size(), "Expected one memory offset per pointer");
+    OV_CPU_JIT_EMITTER_ASSERT(buffer_ids.size() == mem_ptrs.size(), "Expected one buffer ID per pointer");
+
+    const size_t gpr_length = 8;     // 64-bit register length
+    const size_t sp_alignment = 16;  // AArch64 stack alignment requirement
+
+    // Allocate stack space for all pointers
+    const auto sp_size = rnd_up(mem_ptrs.size() * gpr_length, sp_alignment);
+    h->sub(h->sp, h->sp, sp_size);
+
+    // Spill every original pointer before any scratch register is written: the offset arithmetic below
+    // overwrites aux_reg, X_TMP_0 and X_TMP_1, and a pointer that has not been pushed yet may live in one of them
+    for (size_t i = 0; i < mem_ptrs.size(); i++) {
+        h->str(mem_ptrs[i], Xbyak_aarch64::ptr(h->sp, static_cast<int32_t>(i * gpr_length)));
+    }
+
+    // Patch the spilled pointers in place; pointers with zero static offset are already final
+    const Xbyak_aarch64::XReg static_ptr_reg(h->X_TMP_0);
+    for (size_t i = 0; i < mem_ptrs.size(); i++) {
+        const auto stack_offset = static_cast<int32_t>(i * gpr_length);
+
+        if (ov::snippets::utils::is_dynamic_value(memory_offsets[i])) {
+            // Dynamic offset: read from runtime parameters
+            size_t runtime_offset = GET_OFF(buffer_offsets) + buffer_ids[i] * sizeof(size_t);
+            h->ldr(aux_reg, Xbyak_aarch64::ptr(h->sp, stack_offset));
+            push_ptr_with_runtime_offset_on_stack(h, stack_offset, aux_reg, aux_reg, runtime_offset);
+        } else if (memory_offsets[i] != 0) {
+            // Static offset: add compile-time constant
+            h->ldr(static_ptr_reg, Xbyak_aarch64::ptr(h->sp, stack_offset));
+            push_ptr_with_static_offset_on_stack(h, stack_offset, static_ptr_reg, memory_offsets[i]);
+        }
+    }
+
+    // Load back the adjusted pointers to specified registers
+    for (size_t i = 0; i < load_regs.size() && i < mem_ptrs.size(); i++) {
+        h->ldr(load_regs[i], Xbyak_aarch64::ptr(h->sp, static_cast<int32_t>(i * gpr_length)));
+    }
+
+    // Restore stack pointer
+    h->add(h->sp, h->sp, sp_size);
+}
+
+}  // namespace ov::intel_cpu::aarch64::utils
diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/utils.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/utils.hpp
new file mode 100644
index 00000000000000..f9ac1c92b3a729
--- /dev/null
+++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/utils.hpp
@@ -0,0 +1,90 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "cpu/aarch64/jit_generator.hpp"
+#include "snippets/emitter.hpp"
+#include "snippets/lowered/expression_port.hpp"
+
+namespace ov::intel_cpu::aarch64::utils {
+
+inline static std::vector<Xbyak_aarch64::XReg> transform_idxs_to_regs(const std::vector<size_t>& idxs) {
+    std::vector<Xbyak_aarch64::XReg> regs;
+    regs.reserve(idxs.size());
+    std::transform(idxs.begin(), idxs.end(), std::back_inserter(regs), [](size_t idx) {
+        return Xbyak_aarch64::XReg(static_cast<int>(idx));
+    });
+    return regs;
+}
+
+/**
+ * @brief Find the available register from the pool excepting: abi_param1, abi_param2, SP, X18 and `used_gpr_idxs`
+ * @param used_gpr_idxs current used gpr register indexes
+ * @return register
+ */
+Xbyak_aarch64::XReg get_aux_gpr(const std::vector<size_t>& used_gpr_idxs);
+
+/**
+ * @brief Returns an auxiliary GPR register. Returns a register from `aux_gpr_idxs`.
+ *        If it's empty, then choose a register that is not in `used_gpr_reg_idxs` and add it to `regs_to_spill`.
+ * @param used_gpr_reg_idxs register indexes reserved to store memory pointers in this emitter
+ * @param aux_gpr_idxs pool of available gp register indexes
+ * @param regs_to_spill set of live registers to be spilled before ABI call
+ */
+Xbyak_aarch64::XReg init_memory_access_aux_gpr(const std::vector<size_t>& used_gpr_reg_idxs,
+                                               const std::vector<size_t>& aux_gpr_idxs,
+                                               std::set<snippets::Reg>& regs_to_spill);
+
+/**
+ * @brief Push data pointer on stack adding offset. The offset is taken from runtime params `abi_param1`
+ * @param h generator
+ * @param stack_offset stack offset
+ * @param ptr_reg register containing data pointer
+ * @param aux_reg aux register; it is overwritten and must not alias `abi_param1`, X_TMP_0 or X_TMP_1
+ * @param runtime_offset offset in runtime params `abi_param1`
+ */
+void push_ptr_with_runtime_offset_on_stack(dnnl::impl::cpu::aarch64::jit_generator* h,
+                                           int32_t stack_offset,
+                                           const Xbyak_aarch64::XReg& ptr_reg,
+                                           const Xbyak_aarch64::XReg& aux_reg,
+                                           size_t runtime_offset);
+
+/**
+ * @brief Push data pointer on stack adding static offset `ptr_offset`
+ * Note: This helper doesn't allocate stack space - the user should guarantee allocated space on stack
+ * @param h generator
+ * @param stack_offset stack offset
+ * @param ptr_reg register containing data pointer
+ * @param ptr_offset offset which will be added to data pointer
+ */
+void push_ptr_with_static_offset_on_stack(dnnl::impl::cpu::aarch64::jit_generator* h,
+                                          int32_t stack_offset,
+                                          const Xbyak_aarch64::XReg& ptr_reg,
+                                          size_t ptr_offset);
+
+/**
+ * @brief Push multiple data pointers on stack with offsets and load them back to specified registers
+ * @param h generator
+ * @param mem_ptrs vector of registers containing data pointers
+ * @param memory_offsets vector of memory offsets (can be dynamic or static)
+ * @param buffer_ids vector of buffer IDs for runtime offset calculation
+ * @param aux_reg auxiliary register for calculations
+ * @param load_regs vector of registers to load the adjusted pointers back to
+ */
+void push_and_load_ptrs_with_offsets(dnnl::impl::cpu::aarch64::jit_generator* h,
+                                     const std::vector<Xbyak_aarch64::XReg>& mem_ptrs,
+                                     const std::vector<size_t>& memory_offsets,
+                                     const std::vector<size_t>& buffer_ids,
+                                     const Xbyak_aarch64::XReg& aux_reg,
+                                     const std::vector<Xbyak_aarch64::XReg>& load_regs);
+
+}  // namespace ov::intel_cpu::aarch64::utils
diff --git a/src/plugins/intel_cpu/src/emitters/snippets/utils/utils.cpp b/src/plugins/intel_cpu/src/emitters/snippets/utils/utils.cpp
new file mode 100644
index 00000000000000..67a7ae636c03ee
--- /dev/null
+++ b/src/plugins/intel_cpu/src/emitters/snippets/utils/utils.cpp
@@ -0,0 +1,56 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "utils.hpp"
+
+#include
+#include
+#include
+
+#include "emitters/utils.hpp"
+#include "openvino/core/except.hpp"
+#include "openvino/core/type.hpp"
+#include "snippets/lowered/expression_port.hpp"
+#include "snippets/lowered/expressions/buffer_expression.hpp"
+#include "snippets/op/loop.hpp"
+#include "snippets/op/memory_access.hpp"
+#include "snippets/utils/utils.hpp"
+#include "utils/general_utils.h"
+
+namespace ov::intel_cpu::utils {
+
+size_t get_buffer_cluster_id(const ov::snippets::lowered::ExpressionPort& port) {
+    // Cluster ID of the Buffer expression that owns `p`, or SIZE_MAX when `p` doesn't belong to a Buffer
+    auto get_cluster_id = [](const ov::snippets::lowered::ExpressionPort& p) {
+        const auto buffer = ov::as_type_ptr<ov::snippets::lowered::BufferExpression>(p.get_expr());
+        return buffer ? buffer->get_cluster_id() : SIZE_MAX;
+    };
+    const auto& ma_op = std::dynamic_pointer_cast<ov::snippets::modifier::MemoryAccess>(port.get_expr()->get_node());
+    OPENVINO_ASSERT(ma_op, "Expected MemoryAccess op!");
+    auto offset = ov::snippets::utils::get_dynamic_value<size_t>();
+    size_t id = SIZE_MAX;
+    switch (port.get_type()) {
+    case ov::snippets::lowered::ExpressionPort::Type::Input:
+        offset = ma_op->get_input_offset(port.get_index());
+        id = get_cluster_id(port.get_port_connector_ptr()->get_source());
+        break;
+    case ov::snippets::lowered::ExpressionPort::Type::Output:
+        offset = ma_op->get_output_offset(port.get_index());
+        // LoopEnd consumers are skipped; the last remaining consumer determines the ID
+        for (const auto& child : port.get_connected_ports()) {
+            if (!ov::is_type<ov::snippets::op::LoopEnd>(child.get_expr()->get_node())) {
+                id = get_cluster_id(child);
+            }
+        }
+        break;
+    default:
+        OV_CPU_JIT_EMITTER_THROW("Unknown type of expression port!");
+    }
+    // A dynamic offset can only be resolved through the Buffer cluster ID, so the ID is mandatory in that case
+    OV_CPU_JIT_EMITTER_ASSERT(implication(ov::snippets::utils::is_dynamic_value(offset), id != SIZE_MAX),
+                              "In dynamic case Buffer Cluster ID must be known!");
+    return id;
+}
+
+}  // namespace ov::intel_cpu::utils
diff --git a/src/plugins/intel_cpu/src/emitters/snippets/utils/utils.hpp b/src/plugins/intel_cpu/src/emitters/snippets/utils/utils.hpp
new file mode 100644
index 00000000000000..f1399b13d89fcc
--- /dev/null
+++ b/src/plugins/intel_cpu/src/emitters/snippets/utils/utils.hpp
@@ -0,0 +1,21 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include
+
+#include "snippets/lowered/expression_port.hpp"
+
+namespace ov::intel_cpu::utils {
+
+/**
+ * @brief If the passed `port` is connected to a Buffer, return its cluster ID.
+ *        Otherwise returns SIZE_MAX
+ * @param port expression port of memory access op
+ * @return cluster ID of the connected Buffer or SIZE_MAX
+ */
+size_t get_buffer_cluster_id(const ov::snippets::lowered::ExpressionPort& port);
+
+}  // namespace ov::intel_cpu::utils
diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp
index df3041cac21d92..4463b5c946e04d 100644
--- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp
+++
b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp @@ -16,6 +16,7 @@ #include "emitters/plugin/x64/jit_emitter.hpp" #include "emitters/plugin/x64/utils.hpp" #include "emitters/snippets/jit_snippets_call_args.hpp" +#include "emitters/snippets/utils/utils.hpp" #include "emitters/snippets/x64/jit_binary_call_emitter.hpp" #include "emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp" #include "emitters/snippets/x64/utils.hpp" @@ -57,11 +58,11 @@ jit_brgemm_copy_b_emitter::jit_brgemm_copy_b_emitter(jit_generator_t* h, m_kernel_executor = kernel_table->register_kernel(expr, compiled_kernel_cache, config); m_memory_offsets = {brgemm_repack->get_offset_in(), brgemm_repack->get_offset_out()}; - m_buffer_ids = {utils::get_buffer_cluster_id(expr->get_input_port(0)), - utils::get_buffer_cluster_id(expr->get_output_port(0))}; + m_buffer_ids = {ov::intel_cpu::utils::get_buffer_cluster_id(expr->get_input_port(0)), + ov::intel_cpu::utils::get_buffer_cluster_id(expr->get_output_port(0))}; if (m_with_comp) { m_memory_offsets.push_back(brgemm_repack->get_offset_compensations()); - m_buffer_ids.push_back(utils::get_buffer_cluster_id(expr->get_output_port(1))); + m_buffer_ids.push_back(ov::intel_cpu::utils::get_buffer_cluster_id(expr->get_output_port(1))); } } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp index b09c0285a649a6..d2e0572278e5bd 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp @@ -19,6 +19,7 @@ #include "emitters/plugin/x64/jit_emitter.hpp" #include "emitters/plugin/x64/utils.hpp" #include "emitters/snippets/jit_snippets_call_args.hpp" +#include "emitters/snippets/utils/utils.hpp" #include "emitters/snippets/x64/jit_binary_call_emitter.hpp" #include "emitters/snippets/x64/kernel_executors/brgemm.hpp" #include 
"emitters/snippets/x64/kernel_executors/brgemm_amx.hpp" @@ -70,13 +71,13 @@ jit_brgemm_emitter::jit_brgemm_emitter(jit_generator_t* h, "Jit emitter is called when the shapes are unknown"); m_memory_offsets = {brgemm_node->get_offset_a(), brgemm_node->get_offset_b(), brgemm_node->get_offset_c()}; - m_buffer_ids = {utils::get_buffer_cluster_id(expr->get_input_port(0)), - utils::get_buffer_cluster_id(expr->get_input_port(1)), - utils::get_buffer_cluster_id(expr->get_output_port(0))}; + m_buffer_ids = {ov::intel_cpu::utils::get_buffer_cluster_id(expr->get_input_port(0)), + ov::intel_cpu::utils::get_buffer_cluster_id(expr->get_input_port(1)), + ov::intel_cpu::utils::get_buffer_cluster_id(expr->get_output_port(0))}; m_with_scratchpad = brgemm_config.with_scratchpad(); if (m_with_scratchpad) { m_memory_offsets.push_back(brgemm_node->get_offset_scratch()); - m_buffer_ids.push_back(utils::get_buffer_cluster_id(expr->get_input_port(2))); + m_buffer_ids.push_back(ov::intel_cpu::utils::get_buffer_cluster_id(expr->get_input_port(2))); } m_gemm_inputs_count = brgemm_node->get_gemm_inputs_count(); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/utils.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/utils.cpp index 1edebe30e2af46..a2a580416c5ab9 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/utils.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/utils.cpp @@ -7,59 +7,19 @@ #include #include -#include #include #include -#include -#include #include #include #include #include "emitters/utils.hpp" -#include "openvino/core/except.hpp" -#include "openvino/core/type.hpp" #include "snippets/emitter.hpp" -#include "snippets/lowered/expression_port.hpp" -#include "snippets/lowered/expressions/buffer_expression.hpp" -#include "snippets/op/loop.hpp" -#include "snippets/op/memory_access.hpp" -#include "snippets/utils/utils.hpp" using namespace dnnl::impl::cpu::x64; namespace ov::intel_cpu::utils { -size_t get_buffer_cluster_id(const 
ov::snippets::lowered::ExpressionPort& port) { - auto get_cluster_id = [](const snippets::lowered::ExpressionPort& p) { - const auto buffer = ov::as_type_ptr(p.get_expr()); - return buffer ? buffer->get_cluster_id() : SIZE_MAX; - }; - const auto& ma_op = std::dynamic_pointer_cast(port.get_expr()->get_node()); - OPENVINO_ASSERT(ma_op, "Expected MemoryAccess op!"); - auto offset = ov::snippets::utils::get_dynamic_value(); - size_t id = SIZE_MAX; - switch (port.get_type()) { - case ov::snippets::lowered::ExpressionPort::Type::Input: - offset = ma_op->get_input_offset(port.get_index()); - id = get_cluster_id(port.get_port_connector_ptr()->get_source()); - break; - case ov::snippets::lowered::ExpressionPort::Type::Output: - offset = ma_op->get_output_offset(port.get_index()); - for (const auto& child : port.get_connected_ports()) { - if (!ov::is_type(child.get_expr()->get_node())) { - id = get_cluster_id(child); - } - } - break; - default: - OV_CPU_JIT_EMITTER_THROW("Uknown type of expression port!"); - } - OV_CPU_JIT_EMITTER_ASSERT(IMPLICATION(ov::snippets::utils::is_dynamic_value(offset), id != SIZE_MAX), - "In dynamic case Buffer Cluster ID must be known!"); - return id; -} - Xbyak::Reg64 get_aux_gpr(const std::vector& used_gpr_idxs) { // RSP - stack pointer should be preserved, abi_param1 and abi_param2 - runtime parameter register in the kernel static std::unordered_set blacklist_gpr_idxs = {Xbyak::Operand::RSP, diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/utils.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/utils.hpp index e353a96241780b..eda5d38c36ec50 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/utils.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/utils.hpp @@ -25,14 +25,6 @@ inline static std::vector transform_idxs_to_regs(const std::vector return regs; } -/** - * @brief If the passed `port` is connected to a Buffer, return its cluster ID. 
- * Otherwise returns SIZE_MAX - * @param port expression port of memory access op - * @return cluster ID of the connected Buffer or SIZE_MAX - */ -size_t get_buffer_cluster_id(const ov::snippets::lowered::ExpressionPort& port); - /** * @brief Find the available register from the pool excepting: abi_param1, abi_param2, RSP and `used_gpr_idxs` * @param used_gpr_idxs current used gpr register indexes @@ -55,7 +47,7 @@ Xbyak::Reg64 init_memory_access_aux_gpr(const std::vector& used_gpr_reg_ * @brief Push data pointer on stack adding offset. The offset is taken from runtime params `abi_param1` * @param h generator * @param stack_offset stack offset - * @param ptr_reg register contains data pointer + * @param ptr_reg register containing data pointer * @param aux_reg aux register * @param runtime_offset offset in runtime params `abi_param1` */ @@ -67,9 +59,10 @@ void push_ptr_with_runtime_offset_on_stack(dnnl::impl::cpu::x64::jit_generator_t /** * @brief Push data pointer on stack adding static offset `ptr_offset` + * Note: This helper doesn't allocate stack space - the user should guarantee allocated space on stack * @param h generator * @param stack_offset stack offset - * @param ptr_reg register contains data pointer + * @param ptr_reg register containing data pointer * @param ptr_offset offset which will be added to data pointer */ void push_ptr_with_static_offset_on_stack(dnnl::impl::cpu::x64::jit_generator_t* h, diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index c4bf1d0819d1f9..f8cd0a807b69b8 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -508,8 +508,10 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(smoke_Snippets.*)"); #endif // smoke_Snippets test cases are not 
supported on arm64 platforms, except for listed below - retVector.emplace_back(R"(smoke_Snippets(?!_Eltwise|_Convert|_Transpose|_FQDecomposition_|_MatMul/|_Reduce|_Softmax|_AddSoftmax).*)"); + retVector.emplace_back(R"(smoke_Snippets(?!_Eltwise|_Convert|_Transpose|_FQDecomposition_|_MatMul/|_MatMulBias|_Reduce|_Softmax|_AddSoftmax).*)"); retVector.emplace_back(R"(smoke_Snippets_TransposeMatMulBias.*)"); + // TODO: support for long offsets is required for MatMulBias tests with bigger shapes + retVector.emplace_back(R"(smoke_Snippets_MatMulBias.*1023.*)"); #endif #if defined(_WIN32) retVector.emplace_back(R"(.*smoke_QuantizedConvolutionBatchNormTransposeOnWeights/QuantizedConvolutionBatchNorm.CompareWithRefs/conv_type=convolution_quantize_type=fake_quantize_intervals_type=per_(tensor|channel)_transpose_on_weights=true_device=CPU.*)");