Skip to content

Commit 8a86c5a

Browse files
author
sidart
committed
Summary: Initial CMSIS-NN custom kernels port (Take #2)
Test Plan: Reviewers: Subscribers: Tasks: Tags:
1 parent b440e82 commit 8a86c5a

File tree

8 files changed

+268
-19
lines changed

8 files changed

+268
-19
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,7 @@ endif()
530530

531531
if(EXECUTORCH_BUILD_CORTEX_M)
532532
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m)
533+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m/cmsis-nn/ops)
533534
endif()
534535

535536
if(EXECUTORCH_BUILD_DEVTOOLS)

backends/cortex_m/cmsis-nn/cmsis.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Kernel registration schema for the CMSIS-NN backed cortex_m operators.
# Maps the ATen softmax out-variant onto the custom kernel
# cortex_m::aten_softmax (implemented in op_aten_softmax.cpp in this tree).
# arg_meta: null registers the kernel as a catch-all for any dtype/dim-order.
- op: aten::_softmax.out
  kernels:
    - arg_meta: null
      kernel_name: cortex_m::aten_softmax
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# Builds the CMSIS-NN backed cortex_m kernel library
# (cortex_m_cmsis_kernels) and its generated operator-registration library
# (cortex_m_cmsis_nn_ops_lib).

cmake_minimum_required(VERSION 3.19)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if(NOT CMAKE_CXX_STANDARD)
  set(CMAKE_CXX_STANDARD 17)
endif()
set(CMAKE_VERBOSE_MAKEFILE ON)

# Source root directory for executorch.
if(NOT EXECUTORCH_ROOT)
  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../)
endif()

include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)

set(EXECUTORCH_ENABLE_LOGGING ON CACHE BOOL "Enable ExecuTorch logging")
set(EXECUTORCH_LOG_LEVEL "DEBUG" CACHE STRING "ExecuTorch log level")

# Path to a CMSIS-NN checkout containing a prebuilt build/libcmsis-nn.a.
# NOTE(review): this was hardcoded to a developer's home directory; it is now
# a user-overridable cache variable (-DCMSIS_NN_ROOT=<path>) so other
# machines and CI can build. The old value is kept as the default.
set(CMSIS_NN_ROOT "/home/sidart/working/CMSIS-NN" CACHE PATH
    "Path to the CMSIS-NN source tree (expects build/libcmsis-nn.a)")
if(NOT EXISTS "${CMSIS_NN_ROOT}/Include/arm_nnfunctions.h")
  message(FATAL_ERROR
          "CMSIS_NN_ROOT ('${CMSIS_NN_ROOT}') does not look like a CMSIS-NN "
          "checkout; pass -DCMSIS_NN_ROOT=<path-to-CMSIS-NN>")
endif()

# Cortex-M CMSIS ops sources (explicit list; deliberately not globbed).
set(_cortex_m_kernels_cmsis__srcs
    "${EXECUTORCH_ROOT}/backends/cortex_m/cmsis-nn/ops/op_aten_add_tensor.cpp"
    "${EXECUTORCH_ROOT}/backends/cortex_m/cmsis-nn/ops/op_aten_softmax.cpp"
)

# Common include directories.
set(_common_include_directories
    ${EXECUTORCH_ROOT}/..
    ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
)

# Import the prebuilt CMSIS-NN static library as a target. Its public
# headers propagate to consumers via INTERFACE_INCLUDE_DIRECTORIES.
add_library(cmsis_nn STATIC IMPORTED)
set_target_properties(cmsis_nn PROPERTIES
  IMPORTED_LOCATION "${CMSIS_NN_ROOT}/build/libcmsis-nn.a"
  INTERFACE_INCLUDE_DIRECTORIES "${CMSIS_NN_ROOT}/Include"
)

# Build cortex_m_cmsis_kernels static library.
add_library(cortex_m_cmsis_kernels ${_cortex_m_kernels_cmsis__srcs})

target_include_directories(cortex_m_cmsis_kernels
  PRIVATE
    ${_common_include_directories}
    ${CMSIS_NN_ROOT} # for any CMake or config includes
)

# Section-per-function/data so the final link can garbage-collect unused
# code. Target-scoped instead of mutating the global CMAKE_C_FLAGS /
# CMAKE_EXE_LINKER_FLAGS strings (no executable is linked in this
# directory, so the old global -Wl,--gc-sections had no effect here).
target_compile_options(cortex_m_cmsis_kernels PRIVATE
  -ffunction-sections
  -fdata-sections
)

# Link libraries: executorch and the CMSIS-NN imported target.
target_link_libraries(cortex_m_cmsis_kernels
  PRIVATE
    cmsis_nn
    executorch
)

# Generate C++ bindings for kernels and operators. The same yaml is used for
# both steps (was inconsistently referenced via CMAKE_CURRENT_LIST_DIR and
# CMAKE_CURRENT_SOURCE_DIR before).
gen_selected_ops(
  LIB_NAME "cortex_m_cmsis_nn_ops_lib" OPS_SCHEMA_YAML
  "${CMAKE_CURRENT_LIST_DIR}/../cmsis.yaml" "" ""
)
generate_bindings_for_kernels(
  LIB_NAME "cortex_m_cmsis_nn_ops_lib" FUNCTIONS_YAML
  "${CMAKE_CURRENT_LIST_DIR}/../cmsis.yaml"
)

gen_operators_lib(
  LIB_NAME "cortex_m_cmsis_nn_ops_lib" KERNEL_LIBS cortex_m_cmsis_kernels DEPS executorch
)

# Install targets and headers.
install(
  TARGETS cortex_m_cmsis_kernels cortex_m_cmsis_nn_ops_lib
  DESTINATION lib
  PUBLIC_HEADER DESTINATION include/executorch/backends/cortex_m/cmsis-nn/ops/
)
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
#include <executorch/runtime/kernel/kernel_includes.h>
2+
#include <executorch/runtime/core/portable_type/tensor.h> // for torch::executor::Tensor
3+
#include <executorch/runtime/core/portable_type/scalar.h> // for torch::executor::Scalar
4+
5+
#include <vector>
6+
#include <algorithm>
7+
#include <cmath>
8+
#include <cstdint>
9+
10+
extern "C" {
11+
#include "Include/arm_nnfunctions.h"
12+
}
13+
14+
namespace cortex_m {
15+
namespace native {
16+
17+
using Tensor = torch::executor::Tensor;
18+
using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
19+
20+
// Determine a quantization scale from fp32 data by mapping the observed
// [min, max] range onto the 256 representable int8 values.
//
// data: pointer to `size` fp32 values; size must be > 0.
// Returns a strictly positive scale. Falls back to 1.0f when all values are
// equal (zero range) — the previous version returned 0 in that case, which
// caused a divide-by-zero in quantize_tensor().
float determine_input_scale(const float* data, int size) {
  const auto extremes = std::minmax_element(data, data + size);
  const float range = *extremes.second - *extremes.first;
  // 255 steps span the int8 range [-128, 127].
  const float scale = range / 255.0f;
  return (scale > 0.0f) ? scale : 1.0f;
}
// Quantize fp32 values to int8: q = clamp(round(x / scale) + zero_point,
// -128, 127), i.e. round-half-away-from-zero then saturate to int8_t.
//
// input:      `size` fp32 values to quantize.
// output:     destination for `size` int8 values.
// scale:      quantization step; must normally be > 0.
// zero_point: quantized value representing real 0.
//
// A scale <= 0 previously divided by zero; it now fills the output with the
// (clamped) zero point, the quantized representation of all-zero data.
void quantize_tensor(const float* input, int8_t* output, int size,
                     float scale, int32_t zero_point) {
  if (scale <= 0.0f) {
    const int8_t fill = static_cast<int8_t>(std::clamp(
        zero_point, static_cast<int32_t>(-128), static_cast<int32_t>(127)));
    std::fill(output, output + size, fill);
    return;
  }
  for (int i = 0; i < size; i++) {
    const int32_t quantized =
        static_cast<int32_t>(std::lround(input[i] / scale)) + zero_point;
    // Saturate to the int8_t limits [-128, 127].
    output[i] = static_cast<int8_t>(std::clamp(
        quantized, static_cast<int32_t>(-128), static_cast<int32_t>(127)));
  }
}
// Dequantize int8 values back to fp32: x = (q - zero_point) * scale.
//
// input:      `size` quantized int8 values.
// output:     destination for `size` fp32 values.
// scale:      quantization step used when the values were quantized.
// zero_point: quantized value representing real 0.
void dequantize_tensor(const int8_t* input, float* output, int size,
                       float scale, int32_t zero_point) {
  const int8_t* src = input;
  const int8_t* const src_end = input + size;
  float* dst = output;
  while (src != src_end) {
    *dst++ = (*src++ - zero_point) * scale;
  }
}
// Converts a floating-point scale to CMSIS-NN fixed-point multiplier and
// shift, decomposing scale = mantissa * 2^exponent via frexp and expressing
// the mantissa in Q31.
//
// scale:      the floating-point scale factor from quantization.
// multiplier: output fixed-point multiplier (Q31 format).
// shift:      output shift amount, set to -exponent.
//             NOTE(review): the comment below says CMSIS-NN expects a left
//             shift; confirm the sign convention against arm_softmax_s8's
//             input_left_shift parameter before relying on this.
// diff_min:   output minimum difference threshold (-128 for int8 softmax).
void convert_scale_to_cmsis_params(float scale, int32_t* multiplier, int32_t* shift, int32_t* diff_min) {
  if (scale == 0.0f) {
    // Degenerate scale: zero multiplier disables the scaling entirely.
    *multiplier = 0;
    *shift = 0;
    *diff_min = -128;
    return;
  }
  // Decompose scale into mantissa and exponent: scale = mantissa * 2^exponent.
  int exponent;
  float mantissa = std::frexp(scale, &exponent); // mantissa in [0.5, 1)
  // Convert mantissa to Q31 fixed-point format.
  int64_t q_fixed = static_cast<int64_t>(std::round(mantissa * (1ll << 31)));
  // Rounding can push the result to exactly 2^31, which overflows int32_t.
  // Renormalize (the standard QuantizeMultiplier fixup): halve the mantissa
  // and bump the exponent so the represented value is unchanged.
  if (q_fixed == (1ll << 31)) {
    q_fixed /= 2;
    ++exponent;
  }
  *multiplier = static_cast<int32_t>(q_fixed);
  // CMSIS-NN expects a left shift, so negate exponent to get shift.
  *shift = -exponent;
  // Typical diff_min for int8 softmax.
  *diff_min = -128;
}
68+
69+
torch::executor::Tensor& aten_softmax(
70+
KernelRuntimeContext& context,
71+
const Tensor& self,
72+
int64_t dim,
73+
bool half_to_float,
74+
Tensor& out) {
75+
76+
ET_LOG(Info, "CMSIS-NN quantized softmax kernel called");
77+
78+
// Step 1: Extract fp32 data
79+
const float* input_data_fp32 = self.data_ptr<float>();
80+
float* output_data_fp32 = out.data_ptr<float>();
81+
82+
// Step 2: Get tensor dimensions
83+
int rows = self.sizes()[0];
84+
int cols = self.sizes()[1];
85+
86+
// Step 3: Quantize input (fp32 -> int8)
87+
// Determine appropriate scale/zero_point
88+
float input_scale = determine_input_scale(input_data_fp32, rows * cols);
89+
90+
// '0' a reasonable default for symmetric quantization in int8,
91+
// especially if the input data is centered around zero else TBD
92+
int32_t input_zero_point = 0;
93+
94+
std::vector<int8_t> input_quantized(rows * cols);
95+
quantize_tensor(input_data_fp32, input_quantized.data(),
96+
rows * cols, input_scale, input_zero_point);
97+
98+
// Step 4: Convert to CMSIS-NN parameters
99+
int32_t input_mult, input_shift, diff_min;
100+
convert_scale_to_cmsis_params(input_scale, &input_mult, &input_shift, &diff_min);
101+
102+
// Step 5: Call CMSIS-NN kernel
103+
std::vector<int8_t> output_quantized(rows * cols);
104+
arm_softmax_s8(input_quantized.data(), rows, cols,
105+
input_mult, input_shift, diff_min,
106+
output_quantized.data());
107+
108+
// Step 6: Dequantize output (int8 -> fp32)
109+
dequantize_tensor(output_quantized.data(), output_data_fp32,
110+
rows * cols, input_scale, input_zero_point);
111+
112+
return out;
113+
}
114+
115+
} // namespace native
116+
} // namespace cortex_m

examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ elseif(
7777
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m4(\\+|$)"
7878
OR CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m7(\\+|$)"
7979
)
80-
set(FLOAT hard)
80+
set(FLOAT soft)
8181
set(FPU_CONFIG "fpv4-sp-d16")
8282
add_compile_options(-mfpu=${FPU_CONFIG})
8383
add_link_options(-mfpu=${FPU_CONFIG})

examples/arm/executor_runner/CMakeLists.txt

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ option(ET_ATOL "Set atol to use for BundleIO testing" OFF)
1313
option(ET_RTOL "Set rtol to use for BundleIO testing" OFF)
1414
option(ET_DUMP_INPUT "Dump input in log" OFF)
1515
option(ET_DUMP_OUTPUT "Dump output in log" ON)
16-
option(FETCH_ETHOS_U_CONTENT "Fetch ethos_u dependencies instead of relying on pre-downloads" ON)
16+
option(FETCH_ETHOS_U_CONTENT "Fetch ethos_u dependencies instead of relying on pre-downloads" OFF)
1717

1818
if(NOT DEFINED ET_PTE_FILE_PATH AND NOT ${SEMIHOSTING})
1919
message(
@@ -539,6 +539,26 @@ set_property(
539539
PROPERTY IMPORTED_LOCATION
540540
"${ET_BUILD_DIR_PATH}/backends/cortex_m/libcortex_m_kernels.a"
541541
)
# Imported CMSIS-NN operator/kernel libraries built by
# backends/cortex_m/cmsis-nn/ops.
add_library(cortex_m_cmsis_nn_ops_lib STATIC IMPORTED)
set_property(
  TARGET cortex_m_cmsis_nn_ops_lib
  PROPERTY IMPORTED_LOCATION
           "${ET_BUILD_DIR_PATH}/backends/cortex_m/cmsis-nn/ops/libcortex_m_cmsis_nn_ops_lib.a"
)
add_library(cortex_m_cmsis_kernels STATIC IMPORTED)
set_property(
  TARGET cortex_m_cmsis_kernels
  PROPERTY IMPORTED_LOCATION
           "${ET_BUILD_DIR_PATH}/backends/cortex_m/cmsis-nn/ops/libcortex_m_cmsis_kernels.a"
)

# Prebuilt CMSIS-NN static library.
# NOTE(review): this path was hardcoded to /home/sidart/working/CMSIS-NN;
# it is now a cache variable so other machines/CI can override it with
# -DCMSIS_NN_ROOT=<path>. The old value is kept as the default.
set(CMSIS_NN_ROOT "/home/sidart/working/CMSIS-NN" CACHE PATH
    "Path to a CMSIS-NN checkout containing build/libcmsis-nn.a")
add_library(cmsis_nn STATIC IMPORTED)
set_property(
  TARGET cmsis_nn
  PROPERTY IMPORTED_LOCATION
           "${CMSIS_NN_ROOT}/build/libcmsis-nn.a"
)
561+
542562
add_library(extension_runner_util STATIC IMPORTED)
543563
set_property(
544564
TARGET extension_runner_util
@@ -580,11 +600,14 @@ list(APPEND arm_executor_runner_link
580600
"-Wl,--whole-archive"
581601
executorch_delegate_ethos_u
582602
cortex_m_ops_lib
603+
cortex_m_cmsis_nn_ops_lib
583604
quantized_ops_lib
584605
portable_ops_lib
585606
quantized_kernels
586-
cortex_m_kernels
587607
portable_kernels
608+
cortex_m_kernels
609+
cortex_m_cmsis_kernels
610+
cmsis_nn
588611
"-Wl,--no-whole-archive"
589612
-Xlinker -Map=arm_executor_runner.map
590613
)
@@ -674,6 +697,10 @@ if(ET_DUMP_OUTPUT)
674697
target_compile_definitions(arm_executor_runner PUBLIC -DET_DUMP_OUTPUT)
675698
endif()
676699

700+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections -fdata-sections -fno-exceptions -fno-unwind-tables")
701+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffunction-sections -fdata-sections -fno-exceptions -fno-unwind-tables")
702+
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gc-sections")
703+
677704
# Fixup compilation of retarget.c
678705
if(SEMIHOSTING)
679706
# Remove this when MLBEDSW-8910 is closed.

examples/arm/run.sh

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ ethos_u_scratch_dir=$(realpath ${ethos_u_scratch_dir})
9898
setup_path_script=${ethos_u_scratch_dir}/setup_path.sh
9999
if [[ ${toolchain} == "arm-none-eabi-gcc" ]]; then
100100
toolchain_cmake=${et_root_dir}/examples/arm/ethos-u-setup/${toolchain}.cmake
101-
elif [[ ${toolchain} == "arm-zephyr-eabi-gcc" ]]; then
101+
elif [[ ${toolchain} == "arm-zephyr-eabi-gcc" ]]; then
102102
toolchain_cmake=${et_root_dir}/examples/zephyr/x86_64-linux-arm-zephyr-eabi-gcc.cmake
103103
else
104104
echo "Error: Invalid toolchain selection, provided: ${tolchain}"
@@ -198,21 +198,21 @@ if [[ -z "$model_name" ]]; then
198198
# the test models run, and whether to delegate
199199
test_model=(
200200
"softmax" # 0
201-
"add" # 1
202-
"add3" # 2
203-
"qadd" # 3
204-
"qadd2" # 4
205-
"qops" # 5
206-
"mv2" # 6
201+
#"add" # 1
202+
#"add3" # 2
203+
#"qadd" # 3
204+
#"qadd2" # 4
205+
#"qops" # 5
206+
#"mv2" # 6
207207
)
208208
model_compiler_flags=(
209209
"" # 0 softmax
210-
"--delegate" # 1 add
211-
"--delegate" # 2 add3
212-
"--delegate --quantize" # 3 qadd
213-
"--delegate --quantize" # 4 qadd2
214-
"--delegate --quantize" # 5 qops
215-
"--delegate --quantize" # 6 mv2
210+
#"--delegate" # 1 add
211+
#"--delegate" # 2 add3
212+
#"--delegate --quantize" # 3 qadd
213+
#"--delegate --quantize" # 4 qadd2
214+
#"--delegate --quantize" # 5 qops
215+
#"--delegate --quantize" # 6 mv2
216216
)
217217
else
218218
test_model=( "$model_name" )
@@ -277,6 +277,7 @@ for i in "${!test_model[@]}"; do
277277
set -x
278278
# Rebuild the application as the pte is imported as a header/c array
279279
backends/arm/scripts/build_executor_runner.sh --et_build_root="${et_build_root}" --pte="${pte_file}" --build_type=${build_type} --target=${target} --system_config=${system_config} --memory_mode=${memory_mode} ${bundleio_flag} ${et_dump_flag} --extra_build_flags="${extra_build_flags}" --ethosu_tools_dir="${ethos_u_scratch_dir}" --toolchain="${toolchain}"
280+
#echo "CALL ${cmd}" >&2
280281
if [ "$build_only" = false ] ; then
281282
# Execute the executor_runner on FVP Simulator
282283
elf_file="${output_folder}/${elf_folder}/cmake-out/arm_executor_runner"

runtime/kernel/operator_registry.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,17 +81,20 @@ Error register_kernels_internal(const Span<const Kernel> kernels) {
8181
et_pal_get_shared_library_name(kernels.data());
8282

8383
for (const auto& kernel : kernels) {
84+
bool duplicate = false;
8485
// Linear search. This is fine if the number of kernels is small.
8586
for (size_t i = 0; i < num_registered_kernels; i++) {
8687
Kernel k = registered_kernels[i];
8788
if (strcmp(kernel.name_, k.name_) == 0 &&
8889
kernel.kernel_key_ == k.kernel_key_) {
89-
ET_LOG(Error, "Re-registering %s, from %s", k.name_, lib_name);
90+
ET_LOG(Error, "! Re-registering %s, from %s", k.name_, lib_name);
9091
ET_LOG_KERNEL_KEY(k.kernel_key_);
91-
return Error::RegistrationAlreadyRegistered;
92+
//return Error::RegistrationAlreadyRegistered;
93+
duplicate = true;
9294
}
9395
}
94-
registered_kernels[num_registered_kernels++] = kernel;
96+
if (!duplicate)
97+
registered_kernels[num_registered_kernels++] = kernel;
9598
}
9699
ET_LOG(
97100
Debug,
@@ -238,9 +241,12 @@ Result<OpFunction> get_op_function_from_registry(
238241
return err;
239242
}
240243
KernelKey kernel_key = KernelKey(key_string.data());
244+
//ET_LOG(Debug, "get_op_function_from_registry: name %s", name);
245+
ET_LOG_TENSOR_META(meta_list);
241246

242247
int32_t fallback_idx = -1;
243248
for (size_t idx = 0; idx < num_registered_kernels; idx++) {
249+
ET_LOG(Info, "get_op_function_from_registry Checking kernel %s", registered_kernels[idx].name_);
244250
if (strcmp(registered_kernels[idx].name_, name) == 0) {
245251
if (registered_kernels[idx].kernel_key_ == kernel_key) {
246252
return registered_kernels[idx].op_;
@@ -250,7 +256,9 @@ Result<OpFunction> get_op_function_from_registry(
250256
}
251257
}
252258
}
259+
253260
if (fallback_idx != -1) {
261+
ET_LOG(Info, "get_op_function_from_registry: fallback kernel %s", registered_kernels[fallback_idx].name_);
254262
return registered_kernels[fallback_idx].op_;
255263
}
256264
ET_LOG(Error, "kernel '%s' not found.", name);

0 commit comments

Comments
 (0)