|
| 1 | +// Copyright (C) 2025 Intel Corporation |
| 2 | +// SPDX-License-Identifier: Apache-2.0 |
| 3 | +// |
| 4 | + |
| 5 | +#include "utils.hpp" |
| 6 | + |
| 7 | +#include <algorithm> |
| 8 | +#include <common/utils.hpp> |
| 9 | +#include <cstddef> |
| 10 | +#include <cstdint> |
| 11 | +#include <memory> |
| 12 | +#include <set> |
| 13 | +#include <unordered_set> |
| 14 | +#include <vector> |
| 15 | + |
| 16 | +#include "emitters/utils.hpp" |
| 17 | +#include "openvino/core/except.hpp" |
| 18 | +#include "openvino/core/type.hpp" |
| 19 | +#include "snippets/emitter.hpp" |
| 20 | +#include "snippets/lowered/expression_port.hpp" |
| 21 | +#include "snippets/lowered/expressions/buffer_expression.hpp" |
| 22 | +#include "snippets/op/loop.hpp" |
| 23 | +#include "snippets/op/memory_access.hpp" |
| 24 | +#include "snippets/utils/utils.hpp" |
| 25 | + |
| 26 | +using namespace dnnl::impl::cpu::aarch64; |
| 27 | + |
| 28 | +namespace ov::intel_cpu::aarch64::utils { |
| 29 | + |
| 30 | +size_t get_buffer_cluster_id(const ov::snippets::lowered::ExpressionPort& port) { |
| 31 | + auto get_cluster_id = [](const snippets::lowered::ExpressionPort& p) { |
| 32 | + const auto buffer = ov::as_type_ptr<ov::snippets::lowered::BufferExpression>(p.get_expr()); |
| 33 | + return buffer ? buffer->get_cluster_id() : SIZE_MAX; |
| 34 | + }; |
| 35 | + const auto& ma_op = std::dynamic_pointer_cast<ov::snippets::modifier::MemoryAccess>(port.get_expr()->get_node()); |
| 36 | + OPENVINO_ASSERT(ma_op, "Expected MemoryAccess op!"); |
| 37 | + auto offset = ov::snippets::utils::get_dynamic_value<size_t>(); |
| 38 | + size_t id = SIZE_MAX; |
| 39 | + switch (port.get_type()) { |
| 40 | + case ov::snippets::lowered::ExpressionPort::Type::Input: |
| 41 | + offset = ma_op->get_input_offset(port.get_index()); |
| 42 | + id = get_cluster_id(port.get_port_connector_ptr()->get_source()); |
| 43 | + break; |
| 44 | + case ov::snippets::lowered::ExpressionPort::Type::Output: |
| 45 | + offset = ma_op->get_output_offset(port.get_index()); |
| 46 | + for (const auto& child : port.get_connected_ports()) { |
| 47 | + if (!ov::is_type<snippets::op::LoopEnd>(child.get_expr()->get_node())) { |
| 48 | + id = get_cluster_id(child); |
| 49 | + } |
| 50 | + } |
| 51 | + break; |
| 52 | + default: |
| 53 | + OV_CPU_JIT_EMITTER_THROW("Uknown type of expression port!"); |
| 54 | + } |
| 55 | + OV_CPU_JIT_EMITTER_ASSERT(IMPLICATION(ov::snippets::utils::is_dynamic_value(offset), id != SIZE_MAX), |
| 56 | + "In dynamic case Buffer Cluster ID must be known!"); |
| 57 | + return id; |
| 58 | +} |
| 59 | + |
| 60 | +Xbyak_aarch64::XReg get_aux_gpr(const std::vector<size_t>& used_gpr_idxs) { |
| 61 | + // SP - stack pointer should be preserved, X0 and X1 - runtime parameter registers in the kernel |
| 62 | + // X18 - platform register should not be used |
| 63 | + static std::unordered_set<size_t> blacklist_gpr_idxs = { |
| 64 | + 31, // Stack pointer (SP) |
| 65 | + 0, // abi_param1 (X0) |
| 66 | + 1, // abi_param2 (X1) |
| 67 | + 18 // Platform register (X18) |
| 68 | + }; |
| 69 | + |
| 70 | + // Iterate through available GPR registers (X0-X30, excluding X31 which is SP) |
| 71 | + for (size_t gpr_idx = 0; gpr_idx <= 30; ++gpr_idx) { |
| 72 | + size_t _idx = 30 - gpr_idx; // we allocate from the end |
| 73 | + if (std::find(used_gpr_idxs.cbegin(), used_gpr_idxs.cend(), _idx) != used_gpr_idxs.cend()) { |
| 74 | + continue; |
| 75 | + } |
| 76 | + if (blacklist_gpr_idxs.count(_idx) > 0) { |
| 77 | + continue; |
| 78 | + } |
| 79 | + return Xbyak_aarch64::XReg(_idx); |
| 80 | + } |
| 81 | + OV_CPU_JIT_EMITTER_THROW("Failed to allocate aux GPR"); |
| 82 | +} |
| 83 | + |
| 84 | +Xbyak_aarch64::XReg init_memory_access_aux_gpr(const std::vector<size_t>& used_gpr_reg_idxs, |
| 85 | + const std::vector<size_t>& aux_gpr_idxs, |
| 86 | + std::set<snippets::Reg>& regs_to_spill) { |
| 87 | + if (!aux_gpr_idxs.empty()) { |
| 88 | + return Xbyak_aarch64::XReg(static_cast<int>(aux_gpr_idxs[0])); |
| 89 | + } |
| 90 | + const auto aux_reg = ov::intel_cpu::aarch64::utils::get_aux_gpr(used_gpr_reg_idxs); |
| 91 | + regs_to_spill.emplace(snippets::RegType::gpr, aux_reg.getIdx()); |
| 92 | + return aux_reg; |
| 93 | +} |
| 94 | + |
| 95 | +void push_ptr_with_runtime_offset_on_stack(dnnl::impl::cpu::aarch64::jit_generator* h, |
| 96 | + int32_t stack_offset, |
| 97 | + const Xbyak_aarch64::XReg& ptr_reg, |
| 98 | + const Xbyak_aarch64::XReg& aux_reg, |
| 99 | + size_t runtime_offset) { |
| 100 | + // Copy pointer to aux register |
| 101 | + h->mov(aux_reg, ptr_reg); |
| 102 | + |
| 103 | + // Load the runtime offset from abi_param1 (X0) and add it to the pointer |
| 104 | + Xbyak_aarch64::XReg abi_param1(0); |
| 105 | + Xbyak_aarch64::XReg offset_reg(4); |
| 106 | + |
| 107 | + // Handle large runtime offsets by using a temporary register |
| 108 | + if (runtime_offset > 4095) { |
| 109 | + Xbyak_aarch64::XReg temp_offset_reg(6); |
| 110 | + h->mov(temp_offset_reg, static_cast<uint64_t>(runtime_offset)); |
| 111 | + h->add(temp_offset_reg, abi_param1, temp_offset_reg); |
| 112 | + h->ldr(offset_reg, Xbyak_aarch64::ptr(temp_offset_reg)); |
| 113 | + } else { |
| 114 | + h->ldr(offset_reg, Xbyak_aarch64::ptr(abi_param1, static_cast<int32_t>(runtime_offset))); |
| 115 | + } |
| 116 | + |
| 117 | + h->add(aux_reg, aux_reg, offset_reg); |
| 118 | + |
| 119 | + // Store the adjusted pointer on stack |
| 120 | + h->str(aux_reg, Xbyak_aarch64::ptr(h->sp, stack_offset)); |
| 121 | +} |
| 122 | + |
| 123 | +void push_ptr_with_static_offset_on_stack(dnnl::impl::cpu::aarch64::jit_generator* h, |
| 124 | + int32_t stack_offset, |
| 125 | + const Xbyak_aarch64::XReg& ptr_reg, |
| 126 | + size_t ptr_offset) { |
| 127 | + // If there's no static offset, just store the pointer |
| 128 | + if (ptr_offset == 0) { |
| 129 | + h->str(ptr_reg, Xbyak_aarch64::ptr(h->sp, stack_offset)); |
| 130 | + return; |
| 131 | + } |
| 132 | + |
| 133 | + // For non-zero offsets, apply the offset and then store |
| 134 | + Xbyak_aarch64::XReg temp_reg(4); |
| 135 | + h->mov(temp_reg, ptr_reg); |
| 136 | + |
| 137 | + // For large offsets, use a register to hold the offset value |
| 138 | + if (ptr_offset > 4095) { // 12-bit immediate limit for add instruction |
| 139 | + Xbyak_aarch64::XReg offset_reg(6); |
| 140 | + h->mov(offset_reg, static_cast<uint64_t>(ptr_offset)); |
| 141 | + h->add(temp_reg, temp_reg, offset_reg); |
| 142 | + } else { |
| 143 | + h->add(temp_reg, temp_reg, static_cast<int32_t>(ptr_offset)); |
| 144 | + } |
| 145 | + |
| 146 | + // Store the adjusted pointer on stack |
| 147 | + h->str(temp_reg, Xbyak_aarch64::ptr(h->sp, stack_offset)); |
| 148 | +} |
| 149 | + |
| 150 | +} // namespace ov::intel_cpu::aarch64::utils |
0 commit comments