@@ -43,25 +43,26 @@ class CDirQuantCacheBase
43
43
44
44
Vector8u3 () : x (0u ),y (0u ),z (0u ) {}
45
45
Vector8u3 (const Vector8u3&) = default ;
46
- explicit Vector8u3 (const core::vectorSIMDu32 & val)
46
+ explicit Vector8u3 (const hlsl::uint32_t4 & val)
47
47
{
48
48
operator =(val);
49
49
}
50
50
51
51
Vector8u3& operator=(const Vector8u3&) = default;
// Copies the x, y and z lanes of `val`, narrowing each to the width of the
// member it is stored in. The w lane of `val` is not read.
Vector8u3& operator=(const hlsl::uint32_t4& val)
{
	x = static_cast<decltype(x)>(val.x);
	y = static_cast<decltype(y)>(val.y);
	z = static_cast<decltype(z)>(val.z);
	return *this;
}
59
59
60
- inline core::vectorSIMDu32 getValue () const
60
+ hlsl::uint32_t4 getValue () const
61
61
{
62
- return core::vectorSIMDu32 (x,y,z) ;
62
+ return { x, y, z, 0 } ;
63
63
}
64
64
65
+
65
66
private:
66
67
uint8_t x;
67
68
uint8_t y;
@@ -74,13 +75,13 @@ class CDirQuantCacheBase
74
75
75
76
Vector8u4 () : x (0u ),y (0u ),z (0u ),w (0u ) {}
76
77
Vector8u4 (const Vector8u4&) = default ;
77
- explicit Vector8u4 (const core::vectorSIMDu32 & val)
78
+ explicit Vector8u4 (const hlsl::uint32_t4 & val)
78
79
{
79
80
operator =(val);
80
81
}
81
82
82
83
Vector8u4& operator =(const Vector8u4&) = default ;
83
- Vector8u4& operator =(const core::vectorSIMDu32 & val)
84
+ Vector8u4& operator =(const hlsl::uint32_t4 & val)
84
85
{
85
86
x = val.x ;
86
87
y = val.y ;
@@ -89,9 +90,9 @@ class CDirQuantCacheBase
89
90
return *this ;
90
91
}
91
92
92
- inline core::vectorSIMDu32 getValue () const
93
+ hlsl::uint32_t4 getValue () const
93
94
{
94
- return core::vectorSIMDu32 (x,y,z,w) ;
95
+ return { x, y, z, w } ;
95
96
}
96
97
97
98
private:
@@ -108,16 +109,16 @@ class CDirQuantCacheBase
108
109
109
110
Vector1010102 () : storage (0u ) {}
110
111
Vector1010102 (const Vector1010102&) = default ;
111
- explicit Vector1010102 (const core::vectorSIMDu32 & val)
112
+ explicit Vector1010102 (const hlsl::uint32_t4 & val)
112
113
{
113
114
operator =(val);
114
115
}
115
116
116
117
Vector1010102& operator=(const Vector1010102&) = default;
// Packs the x, y and z lanes of `val` into `storage`, one field of
// (quantizationBits + 1) bits per lane, x in the lowest bits.
// NOTE: lanes are not masked here, so a lane wider than its field overlaps
// the next one; callers are expected to pass values that already fit.
Vector1010102& operator=(const hlsl::uint32_t4& val)
{
	constexpr auto storageBits = quantizationBits + 1u;
	constexpr auto yShift = storageBits;
	constexpr auto zShift = storageBits * 2u;

	storage = val.x;
	storage |= val.y << yShift;
	storage |= val.z << zShift;
	return *this;
}
123
124
@@ -130,13 +131,13 @@ class CDirQuantCacheBase
130
131
return storage==other.storage ;
131
132
}
132
133
133
- inline core::vectorSIMDu32 getValue () const
134
+ hlsl::uint32_t4 getValue () const
134
135
{
135
- constexpr auto storageBits = quantizationBits+ 1u ;
136
- const core::vectorSIMDu32 mask (( 0x1u << storageBits)- 1u ) ;
137
- return core::vectorSIMDu32 ( storage, storage>> storageBits, storage>> (storageBits* 2u ))& mask;
136
+ constexpr auto storageBits = quantizationBits + 1u ;
137
+ const auto mask = ( 0x1u << storageBits) - 1u ;
138
+ return { storage & mask, ( storage >> storageBits) & mask, ( storage >> (storageBits * 2 )) & mask, 0 } ;
138
139
}
139
-
140
+
140
141
private:
141
142
uint32_t storage;
142
143
};
@@ -149,25 +150,25 @@ class CDirQuantCacheBase
149
150
150
151
Vector16u3 () : x (0u ),y (0u ),z (0u ) {}
151
152
Vector16u3 (const Vector16u3&) = default ;
152
- explicit Vector16u3 (const core::vectorSIMDu32 & val)
153
+ explicit Vector16u3 (const hlsl::uint32_t4 & val)
153
154
{
154
155
operator =(val);
155
156
}
156
157
157
158
Vector16u3& operator=(const Vector16u3&) = default;
// Copies the x, y and z lanes of `val`, narrowing each to the width of the
// member it is stored in. The w lane of `val` is not read.
Vector16u3& operator=(const hlsl::uint32_t4& val)
{
	x = static_cast<decltype(x)>(val.x);
	y = static_cast<decltype(y)>(val.y);
	z = static_cast<decltype(z)>(val.z);
	return *this;
}
165
166
166
- inline core::vectorSIMDu32 getValue () const
167
+ hlsl::uint32_t4 getValue () const
167
168
{
168
- return core::vectorSIMDu32 (x,y,z) ;
169
+ return { x, y, z, 0 } ;
169
170
}
170
-
171
+
171
172
private:
172
173
uint16_t x;
173
174
uint16_t y;
@@ -180,13 +181,13 @@ class CDirQuantCacheBase
180
181
181
182
Vector16u4 () : x (0u ),y (0u ),z (0u ),w (0u ) {}
182
183
Vector16u4 (const Vector16u4&) = default ;
183
- explicit Vector16u4 (const core::vectorSIMDu32 & val)
184
+ explicit Vector16u4 (const hlsl::uint32_t4 & val)
184
185
{
185
186
operator =(val);
186
187
}
187
188
188
189
Vector16u4& operator =(const Vector16u4&) = default ;
189
- Vector16u4& operator =(const core::vectorSIMDu32 & val)
190
+ Vector16u4& operator =(const hlsl::uint32_t4 & val)
190
191
{
191
192
x = val.x ;
192
193
y = val.y ;
@@ -195,11 +196,11 @@ class CDirQuantCacheBase
195
196
return *this ;
196
197
}
197
198
198
- inline core::vectorSIMDu32 getValue () const
199
+ hlsl::float32_t4 getValue () const
199
200
{
200
- return core::vectorSIMDu32 (x,y,z,w) ;
201
+ return { x, y, z, w } ;
201
202
}
202
-
203
+
203
204
private:
204
205
uint16_t x;
205
206
uint16_t y;
@@ -377,11 +378,30 @@ class CDirQuantCacheBase : public virtual core::IReferenceCounted, public impl::
377
378
std::tuple<cache_type_t <Formats>...> cache;
378
379
379
380
template <uint32_t dimensions, E_FORMAT CacheFormat>
380
- value_type_t <CacheFormat> quantize (const core::vectorSIMDf & value)
381
+ value_type_t <CacheFormat> quantize (const hlsl::vector<hlsl:: float32_t , dimensions> & value)
381
382
{
382
- const auto negativeMask = value < core::vectorSIMDf (0 .0f );
383
+ using float32_tN = hlsl::vector<hlsl::float32_t , dimensions>;
384
+
385
+ auto to_vec_t4 = []<typename T>(hlsl::vector<T, dimensions> src, T padValue) -> hlsl::vector<T, 4 >
386
+ {
387
+ if constexpr (dimensions == 1 )
388
+ {
389
+ return {src.x , padValue, padValue, padValue};
390
+ } else if constexpr (dimensions == 2 )
391
+ {
392
+ return {src.x , src.y , padValue, padValue};
393
+ } else if constexpr (dimensions == 3 )
394
+ {
395
+ return {src.x , src.y , src.z , padValue};
396
+ } else if constexpr (dimensions == 4 )
397
+ {
398
+ return {src.x , src.y , src.z , src.w };
399
+ }
400
+ };
401
+
402
+ const auto negativeMask = to_vec_t4 (lessThan (value, float32_tN (0 .0f )), false );
383
403
384
- const core::vectorSIMDf absValue = abs (value);
404
+ const float32_tN absValue = abs (value);
385
405
const auto key = Key (absValue);
386
406
387
407
constexpr auto quantizationBits = quantization_bits_v<CacheFormat>;
@@ -393,32 +413,50 @@ class CDirQuantCacheBase : public virtual core::IReferenceCounted, public impl::
393
413
quantized = found->second ;
394
414
else
395
415
{
396
- const core::vectorSIMDf fit = findBestFit<dimensions,quantizationBits>(absValue);
416
+ const auto fit = findBestFit<dimensions,quantizationBits>(absValue);
417
+
418
+ const auto abs_fit = to_vec_t4 (abs (fit), 0 .f );
419
+ quantized = hlsl::uint32_t4 (abs_fit.x , abs_fit.y , abs_fit.z , abs_fit.w );
397
420
398
- quantized = core::vectorSIMDu32 (core::abs (fit));
399
421
insertIntoCache<CacheFormat>(key,quantized);
400
422
}
401
423
}
402
424
403
- const core::vectorSIMDu32 xorflag ((0x1u <<(quantizationBits+1u ))-1u );
404
- auto restoredAsVec = quantized.getValue ()^core::mix (core::vectorSIMDu32 (0u ),xorflag,negativeMask);
405
- restoredAsVec += core::mix (core::vectorSIMDu32 (0u ),core::vectorSIMDu32 (1u ),negativeMask);
406
- return value_type_t <CacheFormat>(restoredAsVec&xorflag);
425
+ auto select = [](hlsl::uint32_t4 val1, hlsl::uint32_t4 val2, hlsl::bool4 mask)
426
+ {
427
+ hlsl::uint32_t4 retval;
428
+ retval.x = mask.x ? val2.x : val1.x ;
429
+ retval.y = mask.y ? val2.y : val1.y ;
430
+ retval.z = mask.z ? val2.z : val1.z ;
431
+ retval.w = mask.w ? val2.w : val1.w ;
432
+ return retval;
433
+ };
434
+ ;
435
+ // create all one bits
436
+ const hlsl::uint32_t4 xorflag ((0x1u << (quantizationBits + 1u )) - 1u );
437
+
438
+ // for positive number xoring with 0 keep its value
439
+ // for negative number we xor with all one which will flip the bits, then we add one later. Flipping the bits then adding one will turn positive number into negative number
440
+ auto restoredAsVec = quantized.getValue () ^ select (hlsl::uint32_t4 (0u ), hlsl::uint32_t4 (xorflag), negativeMask);
441
+ restoredAsVec += hlsl::uint32_t4 (negativeMask);
442
+
443
+ return value_type_t <CacheFormat>(restoredAsVec);
407
444
}
408
445
409
446
template <uint32_t dimensions, uint32_t quantizationBits>
410
- static inline core::vectorSIMDf findBestFit (const core::vectorSIMDf & value)
447
+ static inline hlsl::vector<hlsl:: float32_t , dimensions> findBestFit (const hlsl::vector<hlsl:: float32_t , dimensions> & value)
411
448
{
449
+ using float32_tN = hlsl::vector<hlsl::float32_t , dimensions>;
412
450
static_assert (dimensions>1u ," No point" );
413
451
static_assert (dimensions<=4u ," High Dimensions are Hard!" );
414
- // precise normalize
415
- const auto vectorForDots = value. preciseDivision ( length ( value) );
452
+
453
+ const auto vectorForDots = hlsl::normalize ( value);
416
454
417
455
//
418
- core::vectorSIMDf fittingVector;
419
- core::vectorSIMDf floorOffset;
456
+ float32_tN fittingVector;
457
+ float32_tN floorOffset = {} ;
420
458
constexpr uint32_t cornerCount = (0x1u <<(dimensions-1u ))-1u ;
421
- core::vectorSIMDf corners[cornerCount] = {};
459
+ float32_tN corners[cornerCount] = {};
422
460
{
423
461
uint32_t maxDirCompIndex = 0u ;
424
462
for (auto i=1u ; i<dimensions; i++)
@@ -430,9 +468,9 @@ class CDirQuantCacheBase : public virtual core::IReferenceCounted, public impl::
430
468
if (maxDirectionComp < std::sqrtf (0 .9998f / float (dimensions)))
431
469
{
432
470
_NBL_DEBUG_BREAK_IF (true );
433
- return core::vectorSIMDf (0 .f );
471
+ return float32_tN (0 .f );
434
472
}
435
- fittingVector = value. preciseDivision ( core::vectorSIMDf ( maxDirectionComp)) ;
473
+ fittingVector = value / maxDirectionComp;
436
474
floorOffset[maxDirCompIndex] = 0 .499f ;
437
475
const uint32_t localCorner[7 ][3 ] = {
438
476
{1 ,0 ,0 },
@@ -452,12 +490,12 @@ class CDirQuantCacheBase : public virtual core::IReferenceCounted, public impl::
452
490
}
453
491
}
454
492
455
- core::vectorSIMDf bestFit;
493
+ float32_tN bestFit;
456
494
float closestTo1 = -1 .f ;
457
- auto evaluateFit = [&](const core::vectorSIMDf & newFit) -> void
495
+ auto evaluateFit = [&](const float32_tN & newFit) -> void
458
496
{
459
- auto newFitLen = core:: length (newFit);
460
- const float dp = core ::dot<core::vectorSIMDf> (newFit,vectorForDots). preciseDivision (newFitLen)[ 0 ] ;
497
+ auto newFitLen = length (newFit);
498
+ const float dp = hlsl ::dot (newFit,vectorForDots) / (newFitLen);
461
499
if (dp > closestTo1)
462
500
{
463
501
closestTo1 = dp;
@@ -466,18 +504,18 @@ class CDirQuantCacheBase : public virtual core::IReferenceCounted, public impl::
466
504
};
467
505
468
506
constexpr uint32_t cubeHalfSize = (0x1u << quantizationBits) - 1u ;
469
- const core::vectorSIMDf cubeHalfSizeND = core::vectorSIMDf (cubeHalfSize);
507
+ const float32_tN cubeHalfSizeND = hlsl::promote<float32_tN> (cubeHalfSize);
470
508
for (uint32_t n=cubeHalfSize; n>0u ; n--)
471
509
{
472
510
// we'd use float addition in the interest of speed, to increment the loop
473
511
// but adding a small number to a large one loses precision, so multiplication preferrable
474
- core::vectorSIMDf bottomFit = core ::floor (fittingVector* float (n)+ floorOffset);
475
- if (( bottomFit<= cubeHalfSizeND). all ( ))
512
+ const auto bottomFit = glm ::floor (fittingVector * float (n) + floorOffset);
513
+ if (hlsl::all ( glm::lessThanEqual ( bottomFit, cubeHalfSizeND)))
476
514
evaluateFit (bottomFit);
477
- for (auto i= 0u ; i< cornerCount; i++)
515
+ for (auto i = 0u ; i < cornerCount; i++)
478
516
{
479
517
auto bottomFitTmp = bottomFit+corners[i];
480
- if (( bottomFitTmp<= cubeHalfSizeND). all ( ))
518
+ if (hlsl::all ( glm::lessThanEqual ( bottomFitTmp, cubeHalfSizeND)))
481
519
evaluateFit (bottomFitTmp);
482
520
}
483
521
}
0 commit comments