[libclc] Optimize generic CLC fmin/fmax (#128506)

frasercrmck · web-flow · commit 586cacdbdd99 · 2025-07-29T13:21:42.000+01:00
With this commit, the CLC fmin/fmax builtins use clang's __builtin_elementwise_(min|max)imumnum which helps us generate LLVM minimumnum/maximumnum intrinsics directly. These intrinsics uniformly select the non-NaN input over the (quiet or signalling) NaN input, which corresponds to what the OpenCL CTS tests. These intrinsics maintain the vector types, as opposed to scalarizing, which was previously happening. This commit therefore helps to optimize codegen for those targets. Note that there is ongoing discussion regarding how these builtins should handle signalling NaNs in the OpenCL specification and whether they should be able to return a quiet NaN as per the IEEE behaviour. If the specification and/or CTS is ever updated to allow or mandate returning a qNAN, these builtins could/should be updated to use __builtin_elementwise_(min|max)num instead which would lower to LLVM minnum/maxnum intrinsics. The SPIR-V targets maintain the old implementations, as the LLVM -> SPIR-V translator can't currently handle the LLVM intrinsics. The implementation has been simplifies to consistently use clang builtins, as opposed to before where the half version was explicitly defined. [1] KhronosGroup/OpenCL-CTS#2285
diff --git a/libclc/clc/lib/amdgcn/SOURCES b/libclc/clc/lib/amdgcn/SOURCES
@@ -1,5 +1,3 @@
-math/clc_fmax.cl
-math/clc_fmin.cl
 math/clc_ldexp_override.cl
 workitem/clc_get_global_offset.cl
 workitem/clc_get_global_size.cl
diff --git a/libclc/clc/lib/generic/math/clc_fmax.cl b/libclc/clc/lib/generic/math/clc_fmax.cl
@@ -6,53 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <clc/clcmacro.h>
 #include <clc/internal/clc.h>
-#include <clc/relational/clc_isnan.h>
 
-#define __FLOAT_ONLY
-#define __CLC_MIN_VECSIZE 1
 #define FUNCTION __clc_fmax
-#define __IMPL_FUNCTION __builtin_fmaxf
-#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
-#include <clc/math/gentype.inc>
-#undef __CLC_MIN_VECSIZE
-#undef FUNCTION
-#undef __IMPL_FUNCTION
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-#define __DOUBLE_ONLY
-#define __CLC_MIN_VECSIZE 1
-#define FUNCTION __clc_fmax
-#define __IMPL_FUNCTION __builtin_fmax
-#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
-#include <clc/math/gentype.inc>
-#undef __CLC_MIN_VECSIZE
-#undef FUNCTION
-#undef __IMPL_FUNCTION
+#define __IMPL_FUNCTION(x) __builtin_elementwise_maximumnum
+#define __CLC_BODY <clc/shared/binary_def.inc>
 
-#endif
-
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-_CLC_DEF _CLC_OVERLOAD half __clc_fmax(half x, half y) {
-  if (__clc_isnan(x))
-    return y;
-  if (__clc_isnan(y))
-    return x;
-  return (x < y) ? y : x;
-}
-
-#define __HALF_ONLY
-#define __CLC_SUPPORTED_VECSIZE_OR_1 2
-#define FUNCTION __clc_fmax
-#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
 #include <clc/math/gentype.inc>
-#undef FUNCTION
-
-#endif
diff --git a/libclc/clc/lib/generic/math/clc_fmin.cl b/libclc/clc/lib/generic/math/clc_fmin.cl
@@ -6,52 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <clc/clcmacro.h>
 #include <clc/internal/clc.h>
-#include <clc/relational/clc_isnan.h>
 
-#define __FLOAT_ONLY
-#define __CLC_MIN_VECSIZE 1
 #define FUNCTION __clc_fmin
-#define __IMPL_FUNCTION __builtin_fminf
-#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
-#include <clc/math/gentype.inc>
-#undef __CLC_MIN_VECSIZE
-#undef FUNCTION
-#undef __IMPL_FUNCTION
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-#define __DOUBLE_ONLY
-#define __CLC_MIN_VECSIZE 1
-#define FUNCTION __clc_fmin
-#define __IMPL_FUNCTION __builtin_fmin
-#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
-#include <clc/math/gentype.inc>
-#undef __CLC_MIN_VECSIZE
-#undef FUNCTION
-#undef __IMPL_FUNCTION
+#define __IMPL_FUNCTION(x) __builtin_elementwise_minimumnum
+#define __CLC_BODY <clc/shared/binary_def.inc>
 
-#endif
-
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-_CLC_DEF _CLC_OVERLOAD half __clc_fmin(half x, half y) {
-  if (__clc_isnan(x))
-    return y;
-  if (__clc_isnan(y))
-    return x;
-  return (y < x) ? y : x;
-}
-
-#define __HALF_ONLY
-#define __CLC_SUPPORTED_VECSIZE_OR_1 2
-#define FUNCTION __clc_fmin
-#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
 #include <clc/math/gentype.inc>
-
-#endif
diff --git a/libclc/clc/lib/r600/SOURCES b/libclc/clc/lib/r600/SOURCES
@@ -1,4 +1,2 @@
-math/clc_fmax.cl
-math/clc_fmin.cl
 math/clc_native_rsqrt.cl
 math/clc_rsqrt_override.cl
diff --git a/libclc/clc/lib/r600/math/clc_fmax.cl b/libclc/clc/lib/r600/math/clc_fmax.cl
diff --git a/libclc/clc/lib/r600/math/clc_fmin.cl b/libclc/clc/lib/r600/math/clc_fmin.cl
diff --git a/libclc/clc/lib/spirv/SOURCES b/libclc/clc/lib/spirv/SOURCES
@@ -1 +1,3 @@
+math/clc_fmax.cl
+math/clc_fmin.cl
 math/clc_runtime_has_hw_fma32.cl
diff --git a/libclc/clc/lib/spirv/math/clc_fmax.cl b/libclc/clc/lib/spirv/math/clc_fmax.cl
@@ -8,40 +8,23 @@
 
 #include <clc/clcmacro.h>
 #include <clc/internal/clc.h>
-#include <clc/relational/clc_isnan.h>
 
 _CLC_DEF _CLC_OVERLOAD float __clc_fmax(float x, float y) {
-  // fcanonicalize removes sNaNs and flushes denormals if not enabled. Otherwise
-  // fmax instruction flushes the values for comparison, but outputs original
-  // denormal
-  x = __builtin_canonicalizef(x);
-  y = __builtin_canonicalizef(y);
   return __builtin_fmaxf(x, y);
 }
 
 #ifdef cl_khr_fp64
-
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
 _CLC_DEF _CLC_OVERLOAD double __clc_fmax(double x, double y) {
-  x = __builtin_canonicalize(x);
-  y = __builtin_canonicalize(y);
   return __builtin_fmax(x, y);
 }
-
 #endif
-#ifdef cl_khr_fp16
 
+#ifdef cl_khr_fp16
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
 _CLC_DEF _CLC_OVERLOAD half __clc_fmax(half x, half y) {
-  if (__clc_isnan(x))
-    return y;
-  if (__clc_isnan(y))
-    return x;
-  return (y < x) ? x : y;
+  return __builtin_fmaxf16(x, y);
 }
-
 #endif
 
 #define FUNCTION __clc_fmax
diff --git a/libclc/clc/lib/spirv/math/clc_fmin.cl b/libclc/clc/lib/spirv/math/clc_fmin.cl
@@ -8,41 +8,23 @@
 
 #include <clc/clcmacro.h>
 #include <clc/internal/clc.h>
-#include <clc/relational/clc_isnan.h>
 
 _CLC_DEF _CLC_OVERLOAD float __clc_fmin(float x, float y) {
-  // fcanonicalize removes sNaNs and flushes denormals if not enabled. Otherwise
-  // fmin instruction flushes the values for comparison, but outputs original
-  // denormal
-  x = __builtin_canonicalizef(x);
-  y = __builtin_canonicalizef(y);
   return __builtin_fminf(x, y);
 }
 
 #ifdef cl_khr_fp64
-
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
 _CLC_DEF _CLC_OVERLOAD double __clc_fmin(double x, double y) {
-  x = __builtin_canonicalize(x);
-  y = __builtin_canonicalize(y);
   return __builtin_fmin(x, y);
 }
-
 #endif
 
 #ifdef cl_khr_fp16
-
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
 _CLC_DEF _CLC_OVERLOAD half __clc_fmin(half x, half y) {
-  if (__clc_isnan(x))
-    return y;
-  if (__clc_isnan(y))
-    return x;
-  return (y < x) ? y : x;
+  return __builtin_fminf16(x, y);
 }
-
 #endif
 
 #define FUNCTION __clc_fmin

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,3 @@`
`1`		`-math/clc_fmax.cl`
`2`		`-math/clc_fmin.cl`
`3`	`1`	`math/clc_ldexp_override.cl`
`4`	`2`	`workitem/clc_get_global_offset.cl`
`5`	`3`	`workitem/clc_get_global_size.cl`
Original file line number	Diff line number	Diff line change
`@@ -1 +1,3 @@`
	`1`	`+math/clc_fmax.cl`
	`2`	`+math/clc_fmin.cl`
`1`	`3`	`math/clc_runtime_has_hw_fma32.cl`