Refactored ef64 sqrt algorithm

Przemog1 · Przemog1 · commit 4075ab6a2776 · 2025-07-25T17:40:33.000+02:00
diff --git a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl
@@ -395,44 +395,57 @@ namespace hlsl
             return bit_cast<this_t>(data ^ ieee754::traits<float64_t>::signMask);
         }
 
+        /**
+        * @brief Computes square root estimation.
+        * 
+        * Can be less precise when FastMath is enabled (the final refinement iterations are skipped).
+        * sqrt(inf) = inf
+        * sqrt(-0) = -0
+        * sqrt(NaN) = NaN
+        */
         static this_t sqrt(this_t number)
         {
-            // so it doesn't return NaN for -0.0
             bool isZero = !(number.data & 0x7FFFFFFFFFFFFFFFull);
             if (isZero)
                 return number;
 
-            bool isNegative = (number.data >> 63) > 0;
-            if (isNegative)
-                return bit_cast<this_t>(ieee754::traits<this_t>::quietNaN);
-
-            if(!FastMath)
+            static const uint64_t MaxFloat64AsUint64 = 0x7FEFFFFFFFFFFFFFull;
+            if (number.data > MaxFloat64AsUint64)
             {
                 bool isInf = cpp_compat_intrinsics_impl::isinf_uint_impl(number.data);
                 if (isInf)
                     return number;
-            }
 
-            // find square root initial guess using the fast inverse square root algorithm
-            nbl::hlsl::emulated_float64_t<true, true> invSquareRoot = number;
-            {
-                int64_t i = 0x5fe6eb50c7b537a9ull - (number.data >> 1);
-                invSquareRoot.data = i;
-
-                nbl::hlsl::emulated_float64_t<true, true> threeHalfs = emulated_float64_t<true, true>::create(1.5);
-                nbl::hlsl::emulated_float64_t<true, true> x2 = number * emulated_float64_t<true, true>::create(0.5);
-                invSquareRoot = invSquareRoot * (threeHalfs - (x2 * invSquareRoot * invSquareRoot));
+                // when (number.data > MaxFloat64AsUint64) and is not infinity, we can be sure that number is either NaN or negative
+                return bit_cast<this_t>(ieee754::traits<this_t>::quietNaN);
             }
 
+            const float f32InverseSquareRoot = nbl::hlsl::rsqrt(_static_cast<float>(number));
+
             // find sqrt approximation using the Newton-Raphson method
-            nbl::hlsl::emulated_float64_t<true, true> squareRoot = nbl::hlsl::emulated_float64_t<true, true>::create(1.0) / invSquareRoot;
+            this_t inverseSquareRoot = _static_cast<this_t>(f32InverseSquareRoot);
             const int Iterations = 5;
+            static const this_t Half = this_t::create(0.5f);
+            static const this_t ThreeHalfs = this_t::create(1.5f);
+            const this_t x2 = number * Half;
+            [[unroll]]
             for (int i = 0; i < Iterations; ++i)
             {
-                squareRoot = nbl::hlsl::emulated_float64_t<true, true>::create(0.5) * (squareRoot + number / squareRoot);
+                inverseSquareRoot = inverseSquareRoot * (ThreeHalfs - (x2 * inverseSquareRoot * inverseSquareRoot));
             }
 
-            return squareRoot;
+            if (FastMath)
+            {
+                return this_t::create(1.0f) / inverseSquareRoot;
+            }
+            else
+            {
+                // 2 Newton-Raphson iterations to increase precision
+                this_t squareRoot = this_t::create(1.0f) / inverseSquareRoot;
+                squareRoot = Half * (squareRoot + number / squareRoot);
+                squareRoot = Half * (squareRoot + number / squareRoot);
+                return squareRoot;
+            }
         }
 
         NBL_CONSTEXPR_STATIC bool isFastMathSupported = FastMath;