@@ -395,44 +395,57 @@ namespace hlsl
395
395
return bit_cast<this_t>(data ^ ieee754::traits<float64_t>::signMask);
396
396
}
397
397
398
+ /**
399
+ * @brief Computes sqare root estimation.
400
+ *
401
+ * Can be less precise when FastMath is disabled.
402
+ * sqrt(inf) = inf
403
+ * sqrt(-0) = -0
404
+ * sqrt(NaN) = NaN
405
+ */
398
406
static this_t sqrt (this_t number)
399
407
{
400
- // so it doesn't return NaN for -0.0
401
408
bool isZero = !(number.data & 0x7FFFFFFFFFFFFFFFull);
402
409
if (isZero)
403
410
return number;
404
411
405
- bool isNegative = (number.data >> 63 ) > 0 ;
406
- if (isNegative)
407
- return bit_cast<this_t>(ieee754::traits<this_t>::quietNaN);
408
-
409
- if (!FastMath)
412
+ static const uint64_t MaxFloat64AsUint64 = 0x7FEFFFFFFFFFFFFFull;
413
+ if (number.data > MaxFloat64AsUint64)
410
414
{
411
415
bool isInf = cpp_compat_intrinsics_impl::isinf_uint_impl (number.data);
412
416
if (isInf)
413
417
return number;
414
- }
415
418
416
- // find square root initial guess using the fast inverse square root algorithm
417
- nbl::hlsl::emulated_float64_t<true , true > invSquareRoot = number;
418
- {
419
- int64_t i = 0x5fe6eb50c7b537a9ull - (number.data >> 1 );
420
- invSquareRoot.data = i;
421
-
422
- nbl::hlsl::emulated_float64_t<true , true > threeHalfs = emulated_float64_t<true , true >::create (1.5 );
423
- nbl::hlsl::emulated_float64_t<true , true > x2 = number * emulated_float64_t<true , true >::create (0.5 );
424
- invSquareRoot = invSquareRoot * (threeHalfs - (x2 * invSquareRoot * invSquareRoot));
419
+ // when (number.data > MaxFloat64AsUint64) and is not infinity, we can be sure that number is either NaN or negative
420
+ return bit_cast<this_t>(ieee754::traits<this_t>::quietNaN);
425
421
}
426
422
423
+ const float f32InverseSquareRoot = nbl::hlsl::rsqrt (_static_cast<float >(number));
424
+
427
425
// find sqrt approximation using the Newton-Raphson method
428
- nbl::hlsl::emulated_float64_t< true , true > squareRoot = nbl::hlsl::emulated_float64_t< true , true >:: create ( 1.0 ) / invSquareRoot ;
426
+ this_t inverseSquareRoot = _static_cast<this_t>(f32InverseSquareRoot) ;
429
427
const int Iterations = 5 ;
428
+ static const this_t Half = this_t::create (0.5f );
429
+ static const this_t ThreeHalfs = this_t::create (1.5f );
430
+ const this_t x2 = number * Half;
431
+ [[unroll]]
430
432
for (int i = 0 ; i < Iterations; ++i)
431
433
{
432
- squareRoot = nbl::hlsl::emulated_float64_t< true , true >:: create ( 0.5 ) * (squareRoot + number / squareRoot );
434
+ inverseSquareRoot = inverseSquareRoot * (ThreeHalfs - (x2 * inverseSquareRoot * inverseSquareRoot) );
433
435
}
434
436
435
- return squareRoot;
437
+ if (FastMath)
438
+ {
439
+ return this_t::create (1.0f ) / inverseSquareRoot;
440
+ }
441
+ else
442
+ {
443
+ // 2 Newton-Raphson iterations to increase precision
444
+ this_t squareRoot = this_t::create (1.0f ) / inverseSquareRoot;
445
+ squareRoot = Half * (squareRoot + number / squareRoot);
446
+ squareRoot = Half * (squareRoot + number / squareRoot);
447
+ return squareRoot;
448
+ }
436
449
}
437
450
438
451
NBL_CONSTEXPR_STATIC bool isFastMathSupported = FastMath;
0 commit comments