Commit 00b50b2

Fix missing things from OSS kernels updates
Differential Revision: D81863822
Pull Request resolved: pytorch#14067
1 parent 855c083 commit 00b50b2

7 files changed: +54 additions, −70 deletions
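All of the touched kernels follow the same dequantize → fp32 compute → requantize pattern; the commit mainly switches call sites from qualified `kernels::` calls to unqualified calls introduced by `using` declarations. For orientation, here is a minimal sketch of what the scalar `dequantize`/`quantize` helpers compute, assuming the standard affine mapping with round-to-nearest and saturation (these implementation details are assumptions; the actual Cadence/HiFi kernels may round and clamp differently):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>

// Sketch of the scalar helpers, matching the call shapes seen in the diffs below:
//   dequantize<T>(value, scale, zero_point)   -> float
//   quantize<T>(value, inv_scale, zero_point) -> T
template <typename T>
float dequantize(T value, float scale, int32_t zero_point) {
  // Affine dequantization: real = scale * (q - zero_point).
  return scale * (static_cast<int32_t>(value) - zero_point);
}

template <typename T>
T quantize(float value, float inv_scale, int32_t zero_point) {
  // The kernels pass the precomputed inverse output scale (1 / out_scale)
  // so the hot loop multiplies instead of divides.
  float q = std::nearbyint(value * inv_scale) + static_cast<float>(zero_point);
  // Assumed saturation to the representable range of T.
  q = std::min(q, static_cast<float>(std::numeric_limits<T>::max()));
  q = std::max(q, static_cast<float>(std::numeric_limits<T>::min()));
  return static_cast<T>(q);
}

Each op below dequantizes its inputs with the input scale and zero point, does the arithmetic in float, and requantizes the result with the output's (inverse) scale and zero point.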

backends/cadence/hifi/operators/op_quantized_add_asym8sxasym8s_asym8s_per_tensor_out.cpp

Lines changed: 11 additions & 18 deletions
@@ -16,6 +16,8 @@ namespace native {
 
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
+using ::impl::reference::kernels::dequantize;
+using ::impl::reference::kernels::quantize;
 
 void quantized_add_asym8sxasym8s_asym8s_per_tensor_out(
     KernelRuntimeContext& ctx,
@@ -61,25 +63,19 @@ void quantized_add_asym8sxasym8s_asym8s_per_tensor_out(
     }
   } /* if Y is a scalar Tensor */
   else if (Y_numel == 1) {
-    float y =
-        kernels::dequantize<int8_t>(Y_data[0], Y_scale_f, Y_zero_point_i32);
+    float y = dequantize<int8_t>(Y_data[0], Y_scale_f, Y_zero_point_i32);
     for (size_t i = 0; i < X_numel; ++i) {
-      float x =
-          kernels::dequantize<int8_t>(X_data[i], X_scale_f, X_zero_point_i32);
+      float x = dequantize<int8_t>(X_data[i], X_scale_f, X_zero_point_i32);
       float z = x + y;
-      out_data[i] =
-          kernels::quantize<int8_t>(z, inv_out_scale, out_zero_point_i32);
+      out_data[i] = quantize<int8_t>(z, inv_out_scale, out_zero_point_i32);
     }
   } /* if X is a scalar Tensor */
   else if (X_numel == 1) {
-    float x =
-        kernels::dequantize<int8_t>(X_data[0], X_scale_f, X_zero_point_i32);
+    float x = dequantize<int8_t>(X_data[0], X_scale_f, X_zero_point_i32);
     for (size_t i = 0; i < Y_numel; ++i) {
-      float y =
-          kernels::dequantize<int8_t>(Y_data[i], Y_scale_f, Y_zero_point_i32);
+      float y = dequantize<int8_t>(Y_data[i], Y_scale_f, Y_zero_point_i32);
       float z = x + y;
-      out_data[i] =
-          kernels::quantize<int8_t>(z, inv_out_scale, out_zero_point_i32);
+      out_data[i] = quantize<int8_t>(z, inv_out_scale, out_zero_point_i32);
     }
   } /* other broadcasting cases */
   else {
@@ -162,13 +158,10 @@ void quantized_add_asym8sxasym8s_asym8s_per_tensor_out(
       }
 
       /* Apply the operation */
-      float x = kernels::dequantize<int8_t>(
-          X_data[X_idx], X_scale_f, X_zero_point_i32);
-      float y = kernels::dequantize<int8_t>(
-          Y_data[Y_idx], Y_scale_f, Y_zero_point_i32);
+      float x = dequantize<int8_t>(X_data[X_idx], X_scale_f, X_zero_point_i32);
+      float y = dequantize<int8_t>(Y_data[Y_idx], Y_scale_f, Y_zero_point_i32);
       float z = x + y;
-      out_data[i] =
-          kernels::quantize<int8_t>(z, inv_out_scale, out_zero_point_i32);
+      out_data[i] = quantize<int8_t>(z, inv_out_scale, out_zero_point_i32);
     }
   }
 }

backends/cadence/hifi/operators/op_quantized_add_asym8uxasym8u_asym8u_per_tensor_out.cpp

Lines changed: 11 additions & 18 deletions
@@ -16,6 +16,8 @@ namespace native {
 
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
+using ::impl::reference::kernels::dequantize;
+using ::impl::reference::kernels::quantize;
 
 void quantized_add_asym8uxasym8u_asym8u_per_tensor_out(
     KernelRuntimeContext& ctx,
@@ -61,25 +63,19 @@ void quantized_add_asym8uxasym8u_asym8u_per_tensor_out(
     }
   } /* if Y is a scalar Tensor */
   else if (Y_numel == 1) {
-    float y =
-        kernels::dequantize<uint8_t>(Y_data[0], Y_scale_f, Y_zero_point_i32);
+    float y = dequantize<uint8_t>(Y_data[0], Y_scale_f, Y_zero_point_i32);
     for (size_t i = 0; i < X_numel; ++i) {
-      float x =
-          kernels::dequantize<uint8_t>(X_data[i], X_scale_f, X_zero_point_i32);
+      float x = dequantize<uint8_t>(X_data[i], X_scale_f, X_zero_point_i32);
       float z = x + y;
-      out_data[i] =
-          kernels::quantize<uint8_t>(z, inv_out_scale, out_zero_point_i32);
+      out_data[i] = quantize<uint8_t>(z, inv_out_scale, out_zero_point_i32);
     }
   } /* if X is a scalar Tensor */
   else if (X_numel == 1) {
-    float x =
-        kernels::dequantize<uint8_t>(X_data[0], X_scale_f, X_zero_point_i32);
+    float x = dequantize<uint8_t>(X_data[0], X_scale_f, X_zero_point_i32);
     for (size_t i = 0; i < Y_numel; ++i) {
-      float y =
-          kernels::dequantize<uint8_t>(Y_data[i], Y_scale_f, Y_zero_point_i32);
+      float y = dequantize<uint8_t>(Y_data[i], Y_scale_f, Y_zero_point_i32);
       float z = x + y;
-      out_data[i] =
-          kernels::quantize<uint8_t>(z, inv_out_scale, out_zero_point_i32);
+      out_data[i] = quantize<uint8_t>(z, inv_out_scale, out_zero_point_i32);
     }
   } /* other broadcasting cases */
   else {
@@ -162,13 +158,10 @@ void quantized_add_asym8uxasym8u_asym8u_per_tensor_out(
      }
 
      /* Apply the operation */
-      float x = kernels::dequantize<uint8_t>(
-          X_data[X_idx], X_scale_f, X_zero_point_i32);
-      float y = kernels::dequantize<uint8_t>(
-          Y_data[Y_idx], Y_scale_f, Y_zero_point_i32);
+      float x = dequantize<uint8_t>(X_data[X_idx], X_scale_f, X_zero_point_i32);
+      float y = dequantize<uint8_t>(Y_data[Y_idx], Y_scale_f, Y_zero_point_i32);
       float z = x + y;
-      out_data[i] =
-          kernels::quantize<uint8_t>(z, inv_out_scale, out_zero_point_i32);
+      out_data[i] = quantize<uint8_t>(z, inv_out_scale, out_zero_point_i32);
     }
   }
 }

backends/cadence/hifi/operators/op_quantized_layer_norm.cpp

Lines changed: 4 additions & 4 deletions
@@ -13,6 +13,8 @@
 #include <cmath>
 #include <tuple>
 
+using ::cadence::impl::HiFi::kernels::dequantize;
+using ::cadence::impl::HiFi::kernels::quantize;
 using ::executorch::aten::IntArrayRef;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
@@ -80,11 +82,9 @@ void quantized_layer_norm_per_tensor_(
     for (size_t j = 0; j < last_dim; ++j) {
       // Since X is quantized, we dequantize it, compute fp32 result, and
       // quantize the result to an int8/uint8 value.
-      float val = ::cadence::impl::HiFi::kernels::dequantize<T>(
-          x[j], input_scale, input_zero_point);
+      float val = dequantize<T>(x[j], input_scale, input_zero_point);
       val = (val - mean) * inv_std * weight_data[j] + bias_data[j];
-      y[j] = ::cadence::impl::HiFi::kernels::quantize<T>(
-          val, output_inv_scale, output_zero_point);
+      y[j] = quantize<T>(val, output_inv_scale, output_zero_point);
     }
   }
 }

backends/cadence/reference/operators/dequantize_per_tensor.cpp

Lines changed: 8 additions & 11 deletions
@@ -13,9 +13,10 @@ namespace impl {
 namespace reference {
 namespace native {
 
-using executorch::aten::ScalarType;
-using executorch::aten::Tensor;
-using executorch::runtime::KernelRuntimeContext;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::KernelRuntimeContext;
+using ::impl::reference::kernels::dequantize;
 
 void dequantize_per_tensor_out(
     KernelRuntimeContext& context,
@@ -31,22 +32,18 @@ void dequantize_per_tensor_out(
 
   if (input.scalar_type() == ScalarType::Byte) {
     const uint8_t* input_data = input.const_data_ptr<uint8_t>();
-    impl::reference::kernels::dequantize<uint8_t>(
-        out_data, input_data, scale, zero_point, numel);
+    dequantize<uint8_t>(out_data, input_data, scale, zero_point, numel);
   } else if (input.scalar_type() == ScalarType::Char) {
     const int8_t* input_data = input.const_data_ptr<int8_t>();
-    impl::reference::kernels::dequantize<int8_t>(
-        out_data, input_data, scale, zero_point, numel);
+    dequantize<int8_t>(out_data, input_data, scale, zero_point, numel);
   } else if (
       input.scalar_type() == ScalarType::Bits16 ||
       input.scalar_type() == ScalarType::UInt16) {
     const uint16_t* input_data = input.const_data_ptr<uint16_t>();
-    impl::reference::kernels::dequantize<uint16_t>(
-        out_data, input_data, scale, zero_point, numel);
+    dequantize<uint16_t>(out_data, input_data, scale, zero_point, numel);
   } else if (input.scalar_type() == ScalarType::Short) {
     const int16_t* input_data = input.const_data_ptr<int16_t>();
-    impl::reference::kernels::dequantize<int16_t>(
-        out_data, input_data, scale, zero_point, numel);
+    dequantize<int16_t>(out_data, input_data, scale, zero_point, numel);
   } else {
     ET_CHECK_MSG(
         false,
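The `dequantize_per_tensor_out` diff above dispatches on the input dtype and calls a buffer-form `dequantize` overload with the call shape `dequantize<T>(out, in, scale, zero_point, numel)`. A plausible sketch of that overload, assuming it simply applies the affine mapping element by element (parameter types and the scalar loop are assumptions; the real reference kernel may differ):

#include <cstddef>
#include <cstdint>

// Buffer-form dequantize matching the call shape above:
//   dequantize<T>(out_data, input_data, scale, zero_point, numel)
// Sketch only, shown to make the dispatch code self-explanatory.
template <typename T>
void dequantize(
    float* out,
    const T* in,
    float scale,
    int32_t zero_point,
    size_t numel) {
  for (size_t i = 0; i < numel; ++i) {
    out[i] = scale * (static_cast<int32_t>(in[i]) - zero_point);
  }
}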

backends/cadence/reference/operators/op_requantize_out.cpp

Lines changed: 0 additions & 1 deletion
@@ -95,7 +95,6 @@ Tensor& requantize_out(
     out_data[i] = \
         kernels::quantize<dtype>(dequant, 1 / out_scale, out_zero_point); \
   };
-
 #define typed_requantize_in(ctype) \
   switch (out_dtype) { \
     case ScalarType::Byte: { \

backends/cadence/reference/operators/quantized_add_out.cpp

Lines changed: 16 additions & 16 deletions
@@ -14,8 +14,10 @@ namespace impl {
 namespace reference {
 namespace native {
 
-using executorch::aten::Tensor;
-using executorch::runtime::KernelRuntimeContext;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::KernelRuntimeContext;
+using ::impl::reference::kernels::dequantize;
+using ::impl::reference::kernels::quantize;
 
 template <typename T>
 void quantized_add_per_tensor_impl(
@@ -48,28 +50,28 @@ void quantized_add_per_tensor_impl(
   // Simple case: tensors have the same shape, no broadcasting
   if (X_numel == Y_numel && Y_numel == out_numel) {
     for (size_t i = 0; i < X_numel; ++i) {
-      float x = kernels::dequantize<T>(X_data[i], X_scale_f, X_zero_point_i32);
-      float y = kernels::dequantize<T>(Y_data[i], Y_scale_f, Y_zero_point_i32);
+      float x = dequantize<T>(X_data[i], X_scale_f, X_zero_point_i32);
+      float y = dequantize<T>(Y_data[i], Y_scale_f, Y_zero_point_i32);
       float z = x + y;
-      out_data[i] = kernels::quantize<T>(z, inv_out_scale, out_zero_point_i32);
+      out_data[i] = quantize<T>(z, inv_out_scale, out_zero_point_i32);
     }
   }
   // Y is a scalar tensor
   else if (Y_numel == 1) {
-    float y = kernels::dequantize<T>(Y_data[0], Y_scale_f, Y_zero_point_i32);
+    float y = dequantize<T>(Y_data[0], Y_scale_f, Y_zero_point_i32);
     for (size_t i = 0; i < X_numel; ++i) {
-      float x = kernels::dequantize<T>(X_data[i], X_scale_f, X_zero_point_i32);
+      float x = dequantize<T>(X_data[i], X_scale_f, X_zero_point_i32);
       float z = x + y;
-      out_data[i] = kernels::quantize<T>(z, inv_out_scale, out_zero_point_i32);
+      out_data[i] = quantize<T>(z, inv_out_scale, out_zero_point_i32);
     }
   }
   // X is a scalar tensor
   else if (X_numel == 1) {
-    float x = kernels::dequantize<T>(X_data[0], X_scale_f, X_zero_point_i32);
+    float x = dequantize<T>(X_data[0], X_scale_f, X_zero_point_i32);
     for (size_t i = 0; i < Y_numel; ++i) {
-      float y = kernels::dequantize<T>(Y_data[i], Y_scale_f, Y_zero_point_i32);
+      float y = dequantize<T>(Y_data[i], Y_scale_f, Y_zero_point_i32);
       float z = x + y;
-      out_data[i] = kernels::quantize<T>(z, inv_out_scale, out_zero_point_i32);
+      out_data[i] = quantize<T>(z, inv_out_scale, out_zero_point_i32);
     }
   }
   // General broadcasting case - simplified implementation
@@ -79,12 +81,10 @@ void quantized_add_per_tensor_impl(
      size_t x_idx = (X_numel == 1) ? 0 : i % X_numel;
      size_t y_idx = (Y_numel == 1) ? 0 : i % Y_numel;
 
-      float x =
-          kernels::dequantize<T>(X_data[x_idx], X_scale_f, X_zero_point_i32);
-      float y =
-          kernels::dequantize<T>(Y_data[y_idx], Y_scale_f, Y_zero_point_i32);
+      float x = dequantize<T>(X_data[x_idx], X_scale_f, X_zero_point_i32);
+      float y = dequantize<T>(Y_data[y_idx], Y_scale_f, Y_zero_point_i32);
       float z = x + y;
-      out_data[i] = kernels::quantize<T>(z, inv_out_scale, out_zero_point_i32);
+      out_data[i] = quantize<T>(z, inv_out_scale, out_zero_point_i32);
     }
   }
 }
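To make the per-element arithmetic in `quantized_add_per_tensor_impl` concrete, here is a small standalone example with made-up int8 quantization parameters (hypothetical values, not taken from the patch), walking through the same dequantize → add → requantize steps:

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical per-tensor quantization parameters, chosen for illustration.
  const float X_scale = 0.5f, Y_scale = 0.25f, out_scale = 0.5f;
  const int32_t X_zero_point = 0, Y_zero_point = 0, out_zero_point = 3;
  const float inv_out_scale = 1.0f / out_scale;  // precomputed, as in the kernel

  const int8_t X = 6;  // represents 0.5 * (6 - 0)  = 3.0
  const int8_t Y = 8;  // represents 0.25 * (8 - 0) = 2.0

  float x = X_scale * (X - X_zero_point);
  float y = Y_scale * (Y - Y_zero_point);
  float z = x + y;  // 5.0
  int8_t out =
      static_cast<int8_t>(std::nearbyint(z * inv_out_scale) + out_zero_point);
  // 5.0 * 2 = 10, plus the output zero point 3 -> quantized value 13,
  // which dequantizes back to 0.5 * (13 - 3) = 5.0.
  std::printf("quantized sum = %d\n", out);
  return 0;
}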

backends/cadence/reference/operators/quantized_layer_norm.cpp

Lines changed: 4 additions & 2 deletions
@@ -16,6 +16,8 @@ using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::getLeadingDims;
 using ::executorch::runtime::KernelRuntimeContext;
+using ::impl::reference::kernels::dequantize;
+using ::impl::reference::kernels::quantize;
 
 namespace impl {
 namespace reference {
@@ -74,10 +76,10 @@ void quantized_layer_norm_per_tensor_(
      // y[j] = (x[j] - mean) / std * kGamma + kBeta;
      // Since X is quantized, we dequantize it, compute fp32 result, and
      // quantize the result to an int8/uint8 value.
-      float val = kernels::dequantize<T>(x[j], input_scale, input_zero_point);
+      float val = dequantize<T>(x[j], input_scale, input_zero_point);
 
       val = (val - mean) * inv_std * weight_data[j] + bias_data[j];
-      y[j] = kernels::quantize<T>(val, output_inv_scale, output_zero_point);
+      y[j] = quantize<T>(val, output_inv_scale, output_zero_point);
     }
   }
 }
