Commit f953921
author: sidart (committed)

Summary: Initial CMSIS-NN custom kernels port (Take #2)
Test Plan: Reviewers: Subscribers: Tasks: Tags:

1 parent b440e82 commit f953921
File tree: 8 files changed (+248 −4 lines changed)

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -530,6 +530,7 @@ endif()
 
 if(EXECUTORCH_BUILD_CORTEX_M)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m/cmsis-nn/ops)
 endif()
 
 if(EXECUTORCH_BUILD_DEVTOOLS)
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
import torch
8+
from executorch.exir.dialects._ops import (
9+
ops as exir_ops,
10+
) # To provide the implementation of the operators
11+
from torch.library import impl, Library, register_fake
12+
13+
# New operator library with a custom namespace to allow fusion etc.
14+
lib = Library("cortex_m", "DEF")
15+
16+
###
17+
# add.Tensor
18+
###
19+
20+
lib.define("aten_add_tensor(Tensor self, Tensor other, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)")
21+
22+
@impl(lib, "aten_add_tensor", "CompositeExplicitAutograd")
23+
def aten_add_tensor_impl(input1, input2, dtype, out):
24+
return exir_ops.edge.cortex_m.aten_add_tensor.default(input1, input2, dtype, dtype)
25+
26+
27+
###
28+
# add.out
29+
###
30+
31+
lib.define(
32+
"add.out(Tensor input1, Tensor input2, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)"
33+
)
34+
35+
@impl(lib, "add.out", "CompositeExplicitAutograd")
36+
def add_out_impl(
37+
input1: torch.Tensor,
38+
input2: torch.Tensor,
39+
dtype: torch.dtype,
40+
out: torch.Tensor,
41+
) -> torch.Tensor:
42+
"""
43+
The implementation of cmsis-nn add.out.
44+
"""
45+
46+
return exir_ops.edge.cortex_m.add.default(
47+
input1, input2, dtype, dtype
48+
)

backends/cortex_m/cmsis-nn/cmsis.yaml

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+- op: aten::add.out
+  kernels:
+    - arg_meta: null
+      kernel_name: cortex_m::aten_add_tensor
+
+- op: aten::_softmax.out
+  kernels:
+    - arg_meta: null
+      kernel_name: cortex_m::aten_softmax
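
For orientation, each kernel_name above is expected to resolve to a C++ function with ExecuTorch's out-variant signature. A sketch of the declarations these entries bind to, assuming the codegen resolves the functions in the library's native sub-namespace (signatures copied from the kernel sources added later in this commit):

// Sketch only: declarations the cmsis.yaml entries are assumed to bind to.
// The cortex_m::native namespace follows the convention used by the kernel
// sources in this commit; the exact resolution is performed by the codegen.
#include <executorch/runtime/kernel/kernel_includes.h>

namespace cortex_m {
namespace native {

// aten::add.out -> kernel_name cortex_m::aten_add_tensor
torch::executor::Tensor& aten_add_tensor(
    torch::executor::KernelRuntimeContext& ctx,
    const torch::executor::Tensor& input1,
    const torch::executor::Tensor& input2,
    const torch::executor::Scalar& alpha,
    torch::executor::Tensor& out);

// aten::_softmax.out -> kernel_name cortex_m::aten_softmax
torch::executor::Tensor& aten_softmax(
    torch::executor::KernelRuntimeContext& context,
    const torch::executor::Tensor& self,
    int64_t dim,
    bool half_to_float,
    torch::executor::Tensor& out);

} // namespace native
} // namespace cortex_m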
backends/cortex_m/cmsis-nn/ops/CMakeLists.txt

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+cmake_minimum_required(VERSION 3.19)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+if(NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../)
+endif()
+
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)
+
+set(EXECUTORCH_ENABLE_LOGGING ON CACHE BOOL "Enable ExecuTorch logging")
+set(EXECUTORCH_LOG_LEVEL "DEBUG" CACHE STRING "ExecuTorch log level")
+
+# Cortex-M CMSIS ops that are needed to run this model.
+set(_cortex_m_kernels_cmsis__srcs
+    "${EXECUTORCH_ROOT}/backends/cortex_m/cmsis-nn/ops/op_aten_add_tensor.cpp"
+    "${EXECUTORCH_ROOT}/backends/cortex_m/cmsis-nn/ops/op_aten_softmax.cpp"
+)
+
+# Let files say "include <executorch/path/to/header.h>".
+set(_common_include_directories ${EXECUTORCH_ROOT}/..
+    ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
+)
+
+add_library(cortex_m_cmsis_kernels ${_cortex_m_kernels_cmsis__srcs})
+target_link_libraries(cortex_m_cmsis_kernels PRIVATE executorch)
+target_compile_options(cortex_m_cmsis_kernels PUBLIC ${_common_compile_options})
+
+# Generate C++ bindings to register kernels into both PyTorch (for AOT) and
+# ExecuTorch (for runtime). Here we select all ops listed in cmsis.yaml.
+gen_selected_ops(
+  LIB_NAME "cortex_m_cmsis_nn_ops_lib" OPS_SCHEMA_YAML
+  "${CMAKE_CURRENT_LIST_DIR}/../cmsis.yaml" "" ""
+)
+generate_bindings_for_kernels(
+  LIB_NAME "cortex_m_cmsis_nn_ops_lib" FUNCTIONS_YAML
+  ${CMAKE_CURRENT_SOURCE_DIR}/../cmsis.yaml
+)
+message("Generated files ${gen_command_sources}")
+
+gen_operators_lib(
+  LIB_NAME "cortex_m_cmsis_nn_ops_lib" KERNEL_LIBS cortex_m_cmsis_kernels
+  DEPS executorch
+)
+
+install(
+  TARGETS cortex_m_cmsis_kernels cortex_m_cmsis_nn_ops_lib
+  DESTINATION lib
+  PUBLIC_HEADER DESTINATION include/executorch/backends/cortex_m/cmsis-nn/ops/
+)
backends/cortex_m/cmsis-nn/ops/op_aten_add_tensor.cpp

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <executorch/runtime/core/portable_type/tensor.h> // for torch::executor::Tensor
+#include <executorch/runtime/core/portable_type/scalar.h> // for torch::executor::Scalar
+#include <iostream>
+
+namespace cortex_m {
+namespace native {
+
+using Tensor = executorch::aten::Tensor;
+using ScalarType = executorch::aten::ScalarType;
+using Scalar = executorch::aten::Scalar;
+using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
+
+torch::executor::Tensor& aten_add_tensor(
+    torch::executor::KernelRuntimeContext& ctx,
+    const torch::executor::Tensor& input1,
+    const torch::executor::Tensor& input2,
+    const torch::executor::Scalar& alpha,
+    torch::executor::Tensor& out) {
+  // Your CMSIS-NN optimized implementation here.
+  // Return 'out' tensor as per the ExecuTorch kernel signature.
+  std::cout << "add_out kernel called" << std::endl;
+  ET_LOG(Info, "xxxxxxxxxx add_out kernel called");
+
+  assert(false);
+  assert(true);
+  return out;
+}
+
+torch::executor::Tensor& add_out(
+    torch::executor::KernelRuntimeContext& ctx,
+    const torch::executor::Tensor& input1,
+    const torch::executor::Tensor& input2,
+    const torch::executor::Scalar& alpha,
+    torch::executor::Tensor& out) {
+  std::cout << "add_out kernel called" << std::endl;
+  ET_LOG(Info, "xxxxxxxxxx add_out kernel called");
+
+  // Ensure inputs are char (int8) type.
+  ET_CHECK_MSG(
+      input1.scalar_type() == ScalarType::Char,
+      "input1.scalar_type() %" PRId8 " is not char type",
+      static_cast<int8_t>(input1.scalar_type()));
+
+  ET_CHECK_MSG(
+      input2.scalar_type() == ScalarType::Char,
+      "input2.scalar_type() %" PRId8 " is not char type",
+      static_cast<int8_t>(input2.scalar_type()));
+
+  // Check output dtype is float.
+  ET_CHECK_MSG(
+      out.scalar_type() == ScalarType::Float,
+      "out.scalar_type() %" PRId8 " is not float",
+      static_cast<int8_t>(out.scalar_type()));
+
+  // Check dtype is int8 (Char).
+  /*ET_CHECK_MSG(
+      dtype == ScalarType::Char,
+      "dtype %" PRId8 " is not int8 (Char)",
+      static_cast<int8_t>(dtype));*/
+
+  assert(false);
+
+  return out;
+}
+
+} // namespace native
+} // namespace cortex_m
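
The kernel bodies above are stubs that log and assert. For reference, a minimal sketch of what the CMSIS-NN-backed int8 path could look like, assuming same-shape contiguous int8 tensors, CMSIS-NN's arm_elementwise_add_s8 from arm_nnfunctions.h, and pre-computed requantization parameters (the offsets, multipliers, and shifts below are illustrative placeholders, not values from this commit):

// Sketch only: a possible CMSIS-NN int8 elementwise add, not the committed
// implementation. Assumes same-shape contiguous int8 tensors and that the
// requantization parameters were derived offline from the tensors' scales.
#include <arm_nnfunctions.h> // CMSIS-NN

torch::executor::Tensor& add_out_cmsis_sketch(
    torch::executor::KernelRuntimeContext& ctx,
    const torch::executor::Tensor& input1,
    const torch::executor::Tensor& input2,
    torch::executor::Tensor& out) {
  // Illustrative placeholder quantization parameters; real values come from
  // the tensors' quantization metadata at export time.
  const int32_t input1_offset = 0, input1_mult = 1073741824, input1_shift = 0;
  const int32_t input2_offset = 0, input2_mult = 1073741824, input2_shift = 0;
  const int32_t left_shift = 20;
  const int32_t out_offset = 0, out_mult = 1073741824, out_shift = -18;

  // Requantize both inputs, add, then requantize into the output tensor.
  arm_elementwise_add_s8(
      input1.const_data_ptr<int8_t>(),
      input2.const_data_ptr<int8_t>(),
      input1_offset, input1_mult, input1_shift,
      input2_offset, input2_mult, input2_shift,
      left_shift,
      out.mutable_data_ptr<int8_t>(),
      out_offset, out_mult, out_shift,
      /*out_activation_min=*/-128, /*out_activation_max=*/127,
      static_cast<int32_t>(out.numel()));
  return out;
}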
backends/cortex_m/cmsis-nn/ops/op_aten_softmax.cpp

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <executorch/runtime/core/portable_type/tensor.h> // for torch::executor::Tensor
+#include <executorch/runtime/core/portable_type/scalar.h> // for torch::executor::Scalar
+#include <iostream>
+
+namespace cortex_m {
+namespace native {
+
+using Tensor = executorch::aten::Tensor;
+using ScalarType = executorch::aten::ScalarType;
+using Scalar = executorch::aten::Scalar;
+using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
+
+torch::executor::Tensor& aten_softmax(
+    torch::executor::KernelRuntimeContext& context,
+    const torch::executor::Tensor& self,
+    int64_t dim,
+    bool half_to_float,
+    torch::executor::Tensor& out) {
+  // Your CMSIS-NN optimized implementation here.
+  // Return 'out' tensor as per the ExecuTorch kernel signature.
+  // std::cout << "softmax kernel called" << std::endl;
+  ET_LOG(Info, "xxxxxxxxxx softmax kernel called");
+
+  // assert(false);
+  // assert(true);
+  return out;
+}
+
+} // namespace native
+} // namespace cortex_m
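
As with add, the softmax body is a stub. A minimal sketch of a CMSIS-NN-backed int8 softmax over the last dimension, assuming a contiguous int8 tensor and offline-computed scaling parameters (mult, shift, and diff_min below are illustrative placeholders derived from the input scale at export time), using CMSIS-NN's arm_softmax_s8:

// Sketch only: a possible CMSIS-NN int8 softmax, not the committed
// implementation. Assumes a contiguous int8 tensor, softmax over the last
// dimension, and offline-computed scaling parameters.
#include <arm_nnfunctions.h> // CMSIS-NN

torch::executor::Tensor& aten_softmax_cmsis_sketch(
    torch::executor::KernelRuntimeContext& context,
    const torch::executor::Tensor& self,
    torch::executor::Tensor& out) {
  // Treat the tensor as a batch of rows; softmax runs row by row.
  const int32_t row_size = static_cast<int32_t>(self.size(self.dim() - 1));
  const int32_t num_rows = static_cast<int32_t>(self.numel()) / row_size;

  // Illustrative placeholders; real values are derived from the input's
  // quantization scale when the model is exported.
  const int32_t mult = 1077952576;
  const int32_t shift = 23;
  const int32_t diff_min = -248;

  arm_softmax_s8(
      self.const_data_ptr<int8_t>(),
      num_rows,
      row_size,
      mult,
      shift,
      diff_min,
      out.mutable_data_ptr<int8_t>());
  return out;
}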

examples/arm/executor_runner/CMakeLists.txt

Lines changed: 15 additions & 1 deletion
@@ -539,6 +539,18 @@ set_property(
   PROPERTY IMPORTED_LOCATION
     "${ET_BUILD_DIR_PATH}/backends/cortex_m/libcortex_m_kernels.a"
 )
+add_library(cortex_m_cmsis_nn_ops_lib STATIC IMPORTED)
+set_property(
+  TARGET cortex_m_cmsis_nn_ops_lib
+  PROPERTY IMPORTED_LOCATION
+    "${ET_BUILD_DIR_PATH}/backends/cortex_m/cmsis-nn/ops/libcortex_m_cmsis_nn_ops_lib.a"
+)
+add_library(cortex_m_cmsis_kernels STATIC IMPORTED)
+set_property(
+  TARGET cortex_m_cmsis_kernels
+  PROPERTY IMPORTED_LOCATION
+    "${ET_BUILD_DIR_PATH}/backends/cortex_m/cmsis-nn/ops/libcortex_m_cmsis_kernels.a"
+)
 add_library(extension_runner_util STATIC IMPORTED)
 set_property(
   TARGET extension_runner_util

@@ -580,11 +592,13 @@ list(APPEND arm_executor_runner_link
   "-Wl,--whole-archive"
   executorch_delegate_ethos_u
   cortex_m_ops_lib
+  cortex_m_cmsis_nn_ops_lib
   quantized_ops_lib
   portable_ops_lib
   quantized_kernels
-  cortex_m_kernels
   portable_kernels
+  cortex_m_kernels
+  cortex_m_cmsis_kernels
   "-Wl,--no-whole-archive"
   -Xlinker -Map=arm_executor_runner.map
 )

runtime/kernel/operator_registry.cpp

Lines changed: 11 additions & 3 deletions
@@ -81,17 +81,20 @@ Error register_kernels_internal(const Span<const Kernel> kernels) {
       et_pal_get_shared_library_name(kernels.data());
 
   for (const auto& kernel : kernels) {
+    bool duplicate = false;
     // Linear search. This is fine if the number of kernels is small.
     for (size_t i = 0; i < num_registered_kernels; i++) {
       Kernel k = registered_kernels[i];
       if (strcmp(kernel.name_, k.name_) == 0 &&
           kernel.kernel_key_ == k.kernel_key_) {
-        ET_LOG(Error, "Re-registering %s, from %s", k.name_, lib_name);
+        ET_LOG(Error, "! Re-registering %s, from %s", k.name_, lib_name);
         ET_LOG_KERNEL_KEY(k.kernel_key_);
-        return Error::RegistrationAlreadyRegistered;
+        // return Error::RegistrationAlreadyRegistered;
+        duplicate = true;
       }
     }
-    registered_kernels[num_registered_kernels++] = kernel;
+    if (!duplicate)
+      registered_kernels[num_registered_kernels++] = kernel;
   }
   ET_LOG(
       Debug,

@@ -238,9 +241,12 @@ Result<OpFunction> get_op_function_from_registry(
     return err;
   }
   KernelKey kernel_key = KernelKey(key_string.data());
+  // ET_LOG(Debug, "get_op_function_from_registry: name %s", name);
+  ET_LOG_TENSOR_META(meta_list);
 
   int32_t fallback_idx = -1;
   for (size_t idx = 0; idx < num_registered_kernels; idx++) {
+    ET_LOG(Info, "get_op_function_from_registry Checking kernel %s", registered_kernels[idx].name_);
     if (strcmp(registered_kernels[idx].name_, name) == 0) {
       if (registered_kernels[idx].kernel_key_ == kernel_key) {
         return registered_kernels[idx].op_;

@@ -250,7 +256,9 @@
       }
     }
   }
+
   if (fallback_idx != -1) {
+    ET_LOG(Info, "get_op_function_from_registry: fallback kernel %s", registered_kernels[fallback_idx].name_);
     return registered_kernels[fallback_idx].op_;
   }
   ET_LOG(Error, "kernel '%s' not found.", name);
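
This change makes duplicate registrations tolerated instead of fatal: when a kernel with the same name and key is already present, the new entry is skipped, so whichever library registers first wins. A standalone sketch of that first-wins behavior (simplified types, not ExecuTorch code):

// Sketch only: illustrates the first-wins duplicate handling introduced above,
// using simplified stand-in types rather than the real Kernel/KernelKey.
#include <cstdio>
#include <cstring>

struct Kernel { const char* name_; int key_; };

static Kernel registered_kernels[16];
static size_t num_registered_kernels = 0;

void register_kernel(const Kernel& kernel) {
  bool duplicate = false;
  for (size_t i = 0; i < num_registered_kernels; i++) {
    const Kernel& k = registered_kernels[i];
    if (std::strcmp(kernel.name_, k.name_) == 0 && kernel.key_ == k.key_) {
      // Previously this path returned RegistrationAlreadyRegistered.
      std::printf("! Re-registering %s (skipped)\n", k.name_);
      duplicate = true;
    }
  }
  if (!duplicate) {
    registered_kernels[num_registered_kernels++] = kernel;
  }
}

int main() {
  register_kernel({"aten::add.out", 1}); // e.g. CMSIS-NN lib, registered first: wins
  register_kernel({"aten::add.out", 1}); // e.g. portable lib: logged and skipped
  std::printf("registered: %zu\n", num_registered_kernels); // prints 1
  return 0;
}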
