Skip to content

Commit 8a86c5a

Browse files
author
sidart
committed
Summary: Initial CMSIS-NN custom kernels port (Take #2)
Test Plan: Reviewers: Subscribers: Tasks: Tags:
1 parent b440e82 commit 8a86c5a

File tree

8 files changed

+268
-19
lines changed

8 files changed

+268
-19
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,7 @@ endif()
530530

531531
if(EXECUTORCH_BUILD_CORTEX_M)
532532
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m)
533+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m/cmsis-nn/ops)
533534
endif()
534535

535536
if(EXECUTORCH_BUILD_DEVTOOLS)

backends/cortex_m/cmsis-nn/cmsis.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Kernel registration schema for the CMSIS-NN backed cortex_m operators.
# Maps the ATen softmax out-variant onto the custom kernel
# cortex_m::aten_softmax (implemented in op_aten_softmax.cpp in this tree).
# arg_meta: null registers the kernel as a catch-all for any dtype/dim-order.
- op: aten::_softmax.out
  kernels:
    - arg_meta: null
      kernel_name: cortex_m::aten_softmax
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# Builds the CMSIS-NN backed cortex_m kernel library
# (cortex_m_cmsis_kernels) and its generated operator-registration library
# (cortex_m_cmsis_nn_ops_lib).

cmake_minimum_required(VERSION 3.19)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if(NOT CMAKE_CXX_STANDARD)
  set(CMAKE_CXX_STANDARD 17)
endif()
set(CMAKE_VERBOSE_MAKEFILE ON)

# Source root directory for executorch.
if(NOT EXECUTORCH_ROOT)
  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../)
endif()

include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)

set(EXECUTORCH_ENABLE_LOGGING ON CACHE BOOL "Enable ExecuTorch logging")
set(EXECUTORCH_LOG_LEVEL "DEBUG" CACHE STRING "ExecuTorch log level")

# Path to a CMSIS-NN checkout containing a prebuilt build/libcmsis-nn.a.
# NOTE(review): this was hardcoded to a developer's home directory; it is now
# a user-overridable cache variable (-DCMSIS_NN_ROOT=<path>) so other
# machines and CI can build. The old value is kept as the default.
set(CMSIS_NN_ROOT "/home/sidart/working/CMSIS-NN" CACHE PATH
    "Path to the CMSIS-NN source tree (expects build/libcmsis-nn.a)")
if(NOT EXISTS "${CMSIS_NN_ROOT}/Include/arm_nnfunctions.h")
  message(FATAL_ERROR
          "CMSIS_NN_ROOT ('${CMSIS_NN_ROOT}') does not look like a CMSIS-NN "
          "checkout; pass -DCMSIS_NN_ROOT=<path-to-CMSIS-NN>")
endif()

# Cortex-M CMSIS ops sources (explicit list; deliberately not globbed).
set(_cortex_m_kernels_cmsis__srcs
    "${EXECUTORCH_ROOT}/backends/cortex_m/cmsis-nn/ops/op_aten_add_tensor.cpp"
    "${EXECUTORCH_ROOT}/backends/cortex_m/cmsis-nn/ops/op_aten_softmax.cpp"
)

# Common include directories.
set(_common_include_directories
    ${EXECUTORCH_ROOT}/..
    ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
)

# Import the prebuilt CMSIS-NN static library as a target. Its public
# headers propagate to consumers via INTERFACE_INCLUDE_DIRECTORIES.
add_library(cmsis_nn STATIC IMPORTED)
set_target_properties(cmsis_nn PROPERTIES
  IMPORTED_LOCATION "${CMSIS_NN_ROOT}/build/libcmsis-nn.a"
  INTERFACE_INCLUDE_DIRECTORIES "${CMSIS_NN_ROOT}/Include"
)

# Build cortex_m_cmsis_kernels static library.
add_library(cortex_m_cmsis_kernels ${_cortex_m_kernels_cmsis__srcs})

target_include_directories(cortex_m_cmsis_kernels
  PRIVATE
    ${_common_include_directories}
    ${CMSIS_NN_ROOT} # for any CMake or config includes
)

# Section-per-function/data so the final link can garbage-collect unused
# code. Target-scoped instead of mutating the global CMAKE_C_FLAGS /
# CMAKE_EXE_LINKER_FLAGS strings (no executable is linked in this
# directory, so the old global -Wl,--gc-sections had no effect here).
target_compile_options(cortex_m_cmsis_kernels PRIVATE
  -ffunction-sections
  -fdata-sections
)

# Link libraries: executorch and the CMSIS-NN imported target.
target_link_libraries(cortex_m_cmsis_kernels
  PRIVATE
    cmsis_nn
    executorch
)

# Generate C++ bindings for kernels and operators. The same yaml is used for
# both steps (was inconsistently referenced via CMAKE_CURRENT_LIST_DIR and
# CMAKE_CURRENT_SOURCE_DIR before).
gen_selected_ops(
  LIB_NAME "cortex_m_cmsis_nn_ops_lib" OPS_SCHEMA_YAML
  "${CMAKE_CURRENT_LIST_DIR}/../cmsis.yaml" "" ""
)
generate_bindings_for_kernels(
  LIB_NAME "cortex_m_cmsis_nn_ops_lib" FUNCTIONS_YAML
  "${CMAKE_CURRENT_LIST_DIR}/../cmsis.yaml"
)

gen_operators_lib(
  LIB_NAME "cortex_m_cmsis_nn_ops_lib" KERNEL_LIBS cortex_m_cmsis_kernels DEPS executorch
)

# Install targets and headers.
install(
  TARGETS cortex_m_cmsis_kernels cortex_m_cmsis_nn_ops_lib
  DESTINATION lib
  PUBLIC_HEADER DESTINATION include/executorch/backends/cortex_m/cmsis-nn/ops/
)
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
#include <executorch/runtime/kernel/kernel_includes.h>
2+
#include <executorch/runtime/core/portable_type/tensor.h> // for torch::executor::Tensor
3+
#include <executorch/runtime/core/portable_type/scalar.h> // for torch::executor::Scalar
4+
5+
#include <vector>
6+
#include <algorithm>
7+
#include <cmath>
8+
#include <cstdint>
9+
10+
extern "C" {
11+
#include "Include/arm_nnfunctions.h"
12+
}
13+
14+
namespace cortex_m {
15+
namespace native {
16+
17+
using Tensor = torch::executor::Tensor;
18+
using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
19+
20+
// Determine a quantization scale from fp32 data by mapping the observed
// [min, max] range onto the 256 representable int8 values.
//
// data: pointer to `size` fp32 values; size must be > 0.
// Returns a strictly positive scale. Falls back to 1.0f when all values are
// equal (zero range) — the previous version returned 0 in that case, which
// caused a divide-by-zero in quantize_tensor().
float determine_input_scale(const float* data, int size) {
  const auto extremes = std::minmax_element(data, data + size);
  const float range = *extremes.second - *extremes.first;
  // 255 steps span the int8 range [-128, 127].
  const float scale = range / 255.0f;
  return (scale > 0.0f) ? scale : 1.0f;
}
// Quantize fp32 values to int8: q = clamp(round(x / scale) + zero_point,
// -128, 127), i.e. round-half-away-from-zero then saturate to int8_t.
//
// input:      `size` fp32 values to quantize.
// output:     destination for `size` int8 values.
// scale:      quantization step; must normally be > 0.
// zero_point: quantized value representing real 0.
//
// A scale <= 0 previously divided by zero; it now fills the output with the
// (clamped) zero point, the quantized representation of all-zero data.
void quantize_tensor(const float* input, int8_t* output, int size,
                     float scale, int32_t zero_point) {
  if (scale <= 0.0f) {
    const int8_t fill = static_cast<int8_t>(std::clamp(
        zero_point, static_cast<int32_t>(-128), static_cast<int32_t>(127)));
    std::fill(output, output + size, fill);
    return;
  }
  for (int i = 0; i < size; i++) {
    const int32_t quantized =
        static_cast<int32_t>(std::lround(input[i] / scale)) + zero_point;
    // Saturate to the int8_t limits [-128, 127].
    output[i] = static_cast<int8_t>(std::clamp(
        quantized, static_cast<int32_t>(-128), static_cast<int32_t>(127)));
  }
}
// Dequantize int8 values back to fp32: x = (q - zero_point) * scale.
//
// input:      `size` quantized int8 values.
// output:     destination for `size` fp32 values.
// scale:      quantization step used when the values were quantized.
// zero_point: quantized value representing real 0.
void dequantize_tensor(const int8_t* input, float* output, int size,
                       float scale, int32_t zero_point) {
  const int8_t* src = input;
  const int8_t* const src_end = input + size;
  float* dst = output;
  while (src != src_end) {
    *dst++ = (*src++ - zero_point) * scale;
  }
}
// Converts a floating-point scale to CMSIS-NN fixed-point multiplier and
// shift, decomposing scale = mantissa * 2^exponent via frexp and expressing
// the mantissa in Q31.
//
// scale:      the floating-point scale factor from quantization.
// multiplier: output fixed-point multiplier (Q31 format).
// shift:      output shift amount, set to -exponent.
//             NOTE(review): the comment below says CMSIS-NN expects a left
//             shift; confirm the sign convention against arm_softmax_s8's
//             input_left_shift parameter before relying on this.
// diff_min:   output minimum difference threshold (-128 for int8 softmax).
void convert_scale_to_cmsis_params(float scale, int32_t* multiplier, int32_t* shift, int32_t* diff_min) {
  if (scale == 0.0f) {
    // Degenerate scale: zero multiplier disables the scaling entirely.
    *multiplier = 0;
    *shift = 0;
    *diff_min = -128;
    return;
  }
  // Decompose scale into mantissa and exponent: scale = mantissa * 2^exponent.
  int exponent;
  float mantissa = std::frexp(scale, &exponent); // mantissa in [0.5, 1)
  // Convert mantissa to Q31 fixed-point format.
  int64_t q_fixed = static_cast<int64_t>(std::round(mantissa * (1ll << 31)));
  // Rounding can push the result to exactly 2^31, which overflows int32_t.
  // Renormalize (the standard QuantizeMultiplier fixup): halve the mantissa
  // and bump the exponent so the represented value is unchanged.
  if (q_fixed == (1ll << 31)) {
    q_fixed /= 2;
    ++exponent;
  }
  *multiplier = static_cast<int32_t>(q_fixed);
  // CMSIS-NN expects a left shift, so negate exponent to get shift.
  *shift = -exponent;
  // Typical diff_min for int8 softmax.
  *diff_min = -128;
}
68+
69+
torch::executor::Tensor& aten_softmax(
70+
KernelRuntimeContext& context,
71+
const Tensor& self,
72+
int64_t dim,
73+
bool half_to_float,
74+
Tensor& out) {
75+
76+
ET_LOG(Info, "CMSIS-NN quantized softmax kernel called");
77+
78+
// Step 1: Extract fp32 data
79+
const float* input_data_fp32 = self.data_ptr<float>();
80+
float* output_data_fp32 = out.data_ptr<float>();
81+
82+
// Step 2: Get tensor dimensions
83+
int rows = self.sizes()[0];
84+
int cols = self.sizes()[1];
85+
86+
// Step 3: Quantize input (fp32 -> int8)
87+
// Determine appropriate scale/zero_point
88+
float input_scale = determine_input_scale(input_data_fp32, rows * cols);
89+
90+
// '0' a reasonable default for symmetric quantization in int8,
91+
// especially if the input data is centered around zero else TBD
92+
int32_t input_zero_point = 0;
93+
94+
std::vector<int8_t> input_quantized(rows * cols);
95+
quantize_tensor(input_data_fp32, input_quantized.data(),
96+
rows * cols, input_scale, input_zero_point);
97+
98+
// Step 4: Convert to CMSIS-NN parameters
99+
int32_t input_mult, input_shift, diff_min;
100+
convert_scale_to_cmsis_params(input_scale, &input_mult, &input_shift, &diff_min);
101+
102+
// Step 5: Call CMSIS-NN kernel
103+
std::vector<int8_t> output_quantized(rows * cols);
104+
arm_softmax_s8(input_quantized.data(), rows, cols,
105+
input_mult, input_shift, diff_min,
106+
output_quantized.data());
107+
108+
// Step 6: Dequantize output (int8 -> fp32)
109+
dequantize_tensor(output_quantized.data(), output_data_fp32,
110+
rows * cols, input_scale, input_zero_point);
111+
112+
return out;
113+
}
114+
115+
} // namespace native
116+
} // namespace cortex_m

examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ elseif(
7777
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m4(\\+|$)"
7878
OR CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m7(\\+|$)"
7979
)
80-
set(FLOAT hard)
80+
set(FLOAT soft)
8181
set(FPU_CONFIG "fpv4-sp-d16")
8282
add_compile_options(-mfpu=${FPU_CONFIG})
8383
add_link_options(-mfpu=${FPU_CONFIG})

examples/arm/executor_runner/CMakeLists.txt

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ option(ET_ATOL "Set atol to use for BundleIO testing" OFF)
1313
option(ET_RTOL "Set rtol to use for BundleIO testing" OFF)
1414
option(ET_DUMP_INPUT "Dump input in log" OFF)
1515
option(ET_DUMP_OUTPUT "Dump output in log" ON)
16-
option(FETCH_ETHOS_U_CONTENT "Fetch ethos_u dependencies instead of relying on pre-downloads" ON)
16+
option(FETCH_ETHOS_U_CONTENT "Fetch ethos_u dependencies instead of relying on pre-downloads" OFF)
1717

1818
if(NOT DEFINED ET_PTE_FILE_PATH AND NOT ${SEMIHOSTING})
1919
message(
@@ -539,6 +539,26 @@ set_property(
539539
PROPERTY IMPORTED_LOCATION
540540
"${ET_BUILD_DIR_PATH}/backends/cortex_m/libcortex_m_kernels.a"
541541
)
# Imported CMSIS-NN operator/kernel libraries built by
# backends/cortex_m/cmsis-nn/ops.
add_library(cortex_m_cmsis_nn_ops_lib STATIC IMPORTED)
set_property(
  TARGET cortex_m_cmsis_nn_ops_lib
  PROPERTY IMPORTED_LOCATION
           "${ET_BUILD_DIR_PATH}/backends/cortex_m/cmsis-nn/ops/libcortex_m_cmsis_nn_ops_lib.a"
)
add_library(cortex_m_cmsis_kernels STATIC IMPORTED)
set_property(
  TARGET cortex_m_cmsis_kernels
  PROPERTY IMPORTED_LOCATION
           "${ET_BUILD_DIR_PATH}/backends/cortex_m/cmsis-nn/ops/libcortex_m_cmsis_kernels.a"
)

# Prebuilt CMSIS-NN static library.
# NOTE(review): this path was hardcoded to /home/sidart/working/CMSIS-NN;
# it is now a cache variable so other machines/CI can override it with
# -DCMSIS_NN_ROOT=<path>. The old value is kept as the default.
set(CMSIS_NN_ROOT "/home/sidart/working/CMSIS-NN" CACHE PATH
    "Path to a CMSIS-NN checkout containing build/libcmsis-nn.a")
add_library(cmsis_nn STATIC IMPORTED)
set_property(
  TARGET cmsis_nn
  PROPERTY IMPORTED_LOCATION
           "${CMSIS_NN_ROOT}/build/libcmsis-nn.a"
)
561+
542562
add_library(extension_runner_util STATIC IMPORTED)
543563
set_property(
544564
TARGET extension_runner_util
@@ -580,11 +600,14 @@ list(APPEND arm_executor_runner_link
580600
"-Wl,--whole-archive"
581601
executorch_delegate_ethos_u
582602
cortex_m_ops_lib
603+
cortex_m_cmsis_nn_ops_lib
583604
quantized_ops_lib
584605
portable_ops_lib
585606
quantized_kernels
586-
cortex_m_kernels
587607
portable_kernels
608+
cortex_m_kernels
609+
cortex_m_cmsis_kernels
610+
cmsis_nn
588611
"-Wl,--no-whole-archive"
589612
-Xlinker -Map=arm_executor_runner.map
590613
)
@@ -674,6 +697,10 @@ if(ET_DUMP_OUTPUT)
674697
target_compile_definitions(arm_executor_runner PUBLIC -DET_DUMP_OUTPUT)
675698
endif()
676699

700+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections -fdata-sections -fno-exceptions -fno-unwind-tables")
701+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffunction-sections -fdata-sections -fno-exceptions -fno-unwind-tables")
702+
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gc-sections")
703+
677704
# Fixup compilation of retarget.c
678705
if(SEMIHOSTING)
679706
# Remove this when MLBEDSW-8910 is closed.

examples/arm/run.sh

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ ethos_u_scratch_dir=$(realpath ${ethos_u_scratch_dir})
9898
setup_path_script=${ethos_u_scratch_dir}/setup_path.sh
9999
if [[ ${toolchain} == "arm-none-eabi-gcc" ]]; then
100100
toolchain_cmake=${et_root_dir}/examples/arm/ethos-u-setup/${toolchain}.cmake
101-
elif [[ ${toolchain} == "arm-zephyr-eabi-gcc" ]]; then
101+
elif [[ ${toolchain} == "arm-zephyr-eabi-gcc" ]]; then
102102
toolchain_cmake=${et_root_dir}/examples/zephyr/x86_64-linux-arm-zephyr-eabi-gcc.cmake
103103
else
104104
echo "Error: Invalid toolchain selection, provided: ${tolchain}"
@@ -198,21 +198,21 @@ if [[ -z "$model_name" ]]; then
198198
# the test models run, and whether to delegate
199199
test_model=(
200200
"softmax" # 0
201-
"add" # 1
202-
"add3" # 2
203-
"qadd" # 3
204-
"qadd2" # 4
205-
"qops" # 5
206-
"mv2" # 6
201+
#"add" # 1
202+
#"add3" # 2
203+
#"qadd" # 3
204+
#"qadd2" # 4
205+
#"qops" # 5
206+
#"mv2" # 6
207207
)
208208
model_compiler_flags=(
209209
"" # 0 softmax
210-
"--delegate" # 1 add
211-
"--delegate" # 2 add3
212-
"--delegate --quantize" # 3 qadd
213-
"--delegate --quantize" # 4 qadd2
214-
"--delegate --quantize" # 5 qops
215-
"--delegate --quantize" # 6 mv2
210+
#"--delegate" # 1 add
211+
#"--delegate" # 2 add3
212+
#"--delegate --quantize" # 3 qadd
213+
#"--delegate --quantize" # 4 qadd2
214+
#"--delegate --quantize" # 5 qops
215+
#"--delegate --quantize" # 6 mv2
216216
)
217217
else
218218
test_model=( "$model_name" )
@@ -277,6 +277,7 @@ for i in "${!test_model[@]}"; do
277277
set -x
278278
# Rebuild the application as the pte is imported as a header/c array
279279
backends/arm/scripts/build_executor_runner.sh --et_build_root="${et_build_root}" --pte="${pte_file}" --build_type=${build_type} --target=${target} --system_config=${system_config} --memory_mode=${memory_mode} ${bundleio_flag} ${et_dump_flag} --extra_build_flags="${extra_build_flags}" --ethosu_tools_dir="${ethos_u_scratch_dir}" --toolchain="${toolchain}"
280+
#echo "CALL ${cmd}" >&2
280281
if [ "$build_only" = false ] ; then
281282
# Execute the executor_runner on FVP Simulator
282283
elf_file="${output_folder}/${elf_folder}/cmake-out/arm_executor_runner"

runtime/kernel/operator_registry.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,17 +81,20 @@ Error register_kernels_internal(const Span<const Kernel> kernels) {
8181
et_pal_get_shared_library_name(kernels.data());
8282

8383
for (const auto& kernel : kernels) {
84+
bool duplicate = false;
8485
// Linear search. This is fine if the number of kernels is small.
8586
for (size_t i = 0; i < num_registered_kernels; i++) {
8687
Kernel k = registered_kernels[i];
8788
if (strcmp(kernel.name_, k.name_) == 0 &&
8889
kernel.kernel_key_ == k.kernel_key_) {
89-
ET_LOG(Error, "Re-registering %s, from %s", k.name_, lib_name);
90+
ET_LOG(Error, "! Re-registering %s, from %s", k.name_, lib_name);
9091
ET_LOG_KERNEL_KEY(k.kernel_key_);
91-
return Error::RegistrationAlreadyRegistered;
92+
//return Error::RegistrationAlreadyRegistered;
93+
duplicate = true;
9294
}
9395
}
94-
registered_kernels[num_registered_kernels++] = kernel;
96+
if (!duplicate)
97+
registered_kernels[num_registered_kernels++] = kernel;
9598
}
9699
ET_LOG(
97100
Debug,
@@ -238,9 +241,12 @@ Result<OpFunction> get_op_function_from_registry(
238241
return err;
239242
}
240243
KernelKey kernel_key = KernelKey(key_string.data());
244+
//ET_LOG(Debug, "get_op_function_from_registry: name %s", name);
245+
ET_LOG_TENSOR_META(meta_list);
241246

242247
int32_t fallback_idx = -1;
243248
for (size_t idx = 0; idx < num_registered_kernels; idx++) {
249+
ET_LOG(Info, "get_op_function_from_registry Checking kernel %s", registered_kernels[idx].name_);
244250
if (strcmp(registered_kernels[idx].name_, name) == 0) {
245251
if (registered_kernels[idx].kernel_key_ == kernel_key) {
246252
return registered_kernels[idx].op_;
@@ -250,7 +256,9 @@ Result<OpFunction> get_op_function_from_registry(
250256
}
251257
}
252258
}
259+
253260
if (fallback_idx != -1) {
261+
ET_LOG(Info, "get_op_function_from_registry: fallback kernel %s", registered_kernels[fallback_idx].name_);
254262
return registered_kernels[fallback_idx].op_;
255263
}
256264
ET_LOG(Error, "kernel '%s' not found.", name);

0 commit comments

Comments
 (0)