@@ -33,6 +33,28 @@ namespace xsimd
33
33
{
34
34
using namespace types ;
35
35
36
namespace detail
{
    // Packs four 2-bit lane selectors into one immediate:
    // w occupies bits [1:0], x bits [3:2], y bits [5:4], z bits [7:6].
    // The arguments are not range-checked here; callers that may pass
    // values >= 4 should go through mod_shuffle instead.
    constexpr uint32_t shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z)
    {
        return w | (x << 2) | (y << 4) | (z << 6);
    }

    // Packs two 1-bit lane selectors into one immediate:
    // x occupies bit 0, y bit 1. No range check is performed.
    constexpr uint32_t shuffle(uint32_t x, uint32_t y)
    {
        return x | (y << 1);
    }

    // Same as the four-argument shuffle, but every selector is first
    // reduced modulo 4 (for unsigned values `& 3` is identical to `% 4`).
    constexpr uint32_t mod_shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z)
    {
        return shuffle(w & 0x3u, x & 0x3u, y & 0x3u, z & 0x3u);
    }

    // Same as the two-argument shuffle, but every selector is first
    // reduced modulo 2 (for unsigned values `& 1` is identical to `% 2`).
    constexpr uint32_t mod_shuffle(uint32_t w, uint32_t x)
    {
        return shuffle(w & 0x1u, x & 0x1u);
    }
}
57
+
36
58
// fwd
37
59
template <class A , class T , size_t I>
38
60
XSIMD_INLINE batch<T, A> insert (batch<T, A> const & self, T val, index<I>, requires_arch<common>) noexcept ;
@@ -1282,13 +1304,16 @@ namespace xsimd
1282
1304
template <class A , class T , class _ = typename std::enable_if<(sizeof (T) <= 2 ), void >::type>
1283
1305
XSIMD_INLINE T reduce_max (batch<T, A> const & self, requires_arch<sse2>) noexcept
1284
1306
{
1285
- batch<T, A> step0 = _mm_shuffle_epi32 (self, detail::shuffle<2 , 3 , 0 , 0 >());
1307
+ constexpr auto mask0 = detail::shuffle (2 , 3 , 0 , 0 );
1308
+ batch<T, A> step0 = _mm_shuffle_epi32 (self, mask0);
1286
1309
batch<T, A> acc0 = max (self, step0);
1287
1310
1288
- batch<T, A> step1 = _mm_shuffle_epi32 (acc0, detail::shuffle<1 , 0 , 0 , 0 >());
1311
+ constexpr auto mask1 = detail::shuffle (1 , 0 , 0 , 0 );
1312
+ batch<T, A> step1 = _mm_shuffle_epi32 (acc0, mask1);
1289
1313
batch<T, A> acc1 = max (acc0, step1);
1290
1314
1291
- batch<T, A> step2 = _mm_shufflelo_epi16 (acc1, detail::shuffle<1 , 0 , 0 , 0 >());
1315
+ constexpr auto mask2 = detail::shuffle (1 , 0 , 0 , 0 );
1316
+ batch<T, A> step2 = _mm_shufflelo_epi16 (acc1, mask2);
1292
1317
batch<T, A> acc2 = max (acc1, step2);
1293
1318
if (sizeof (T) == 2 )
1294
1319
return first (acc2, A {});
@@ -1301,13 +1326,16 @@ namespace xsimd
1301
1326
template <class A , class T , class _ = typename std::enable_if<(sizeof (T) <= 2 ), void >::type>
1302
1327
XSIMD_INLINE T reduce_min (batch<T, A> const & self, requires_arch<sse2>) noexcept
1303
1328
{
1304
- batch<T, A> step0 = _mm_shuffle_epi32 (self, detail::shuffle<2 , 3 , 0 , 0 >());
1329
+ constexpr auto mask0 = detail::shuffle (2 , 3 , 0 , 0 );
1330
+ batch<T, A> step0 = _mm_shuffle_epi32 (self, mask0);
1305
1331
batch<T, A> acc0 = min (self, step0);
1306
1332
1307
- batch<T, A> step1 = _mm_shuffle_epi32 (acc0, detail::shuffle<1 , 0 , 0 , 0 >());
1333
+ constexpr auto mask1 = detail::shuffle (1 , 0 , 0 , 0 );
1334
+ batch<T, A> step1 = _mm_shuffle_epi32 (acc0, mask1);
1308
1335
batch<T, A> acc1 = min (acc0, step1);
1309
1336
1310
- batch<T, A> step2 = _mm_shufflelo_epi16 (acc1, detail::shuffle<1 , 0 , 0 , 0 >());
1337
+ constexpr auto mask2 = detail::shuffle (1 , 0 , 0 , 0 );
1338
+ batch<T, A> step2 = _mm_shufflelo_epi16 (acc1, mask2);
1311
1339
batch<T, A> acc2 = min (acc1, step2);
1312
1340
if (sizeof (T) == 2 )
1313
1341
return first (acc2, A {});
@@ -1355,7 +1383,7 @@ namespace xsimd
1355
1383
template <class A , class ITy , ITy I0, ITy I1, ITy I2, ITy I3>
1356
1384
XSIMD_INLINE batch<float , A> shuffle (batch<float , A> const & x, batch<float , A> const & y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<sse2>) noexcept
1357
1385
{
1358
- constexpr uint32_t smask = detail::mod_shuffle< I0, I1, I2, I3>( );
1386
+ constexpr uint32_t smask = detail::mod_shuffle ( I0, I1, I2, I3);
1359
1387
// shuffle within lane
1360
1388
if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4 )
1361
1389
return _mm_shuffle_ps (x, y, smask);
@@ -1369,7 +1397,7 @@ namespace xsimd
1369
1397
template <class A , class ITy , ITy I0, ITy I1>
1370
1398
XSIMD_INLINE batch<double , A> shuffle (batch<double , A> const & x, batch<double , A> const & y, batch_constant<ITy, A, I0, I1> mask, requires_arch<sse2>) noexcept
1371
1399
{
1372
- constexpr uint32_t smask = detail::mod_shuffle< I0, I1>( );
1400
+ constexpr uint32_t smask = detail::mod_shuffle ( I0, I1);
1373
1401
// shuffle within lane
1374
1402
if (I0 < 2 && I1 >= 2 )
1375
1403
return _mm_shuffle_pd (x, y, smask);
@@ -1617,26 +1645,24 @@ namespace xsimd
1617
1645
return _mm_sub_pd (self, other);
1618
1646
}
1619
1647
1620
- // swizzle
1621
-
1622
1648
template <class A , uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
1623
1649
XSIMD_INLINE batch<float , A> swizzle (batch<float , A> const & self, batch_constant<uint32_t , A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
1624
1650
{
1625
- constexpr uint32_t index = detail::shuffle< V0, V1, V2, V3>( );
1651
+ constexpr uint32_t index = detail::shuffle ( V0, V1, V2, V3);
1626
1652
return _mm_shuffle_ps (self, self, index);
1627
1653
}
1628
1654
1629
1655
template <class A , uint64_t V0, uint64_t V1>
1630
1656
XSIMD_INLINE batch<double , A> swizzle (batch<double , A> const & self, batch_constant<uint64_t , A, V0, V1>, requires_arch<sse2>) noexcept
1631
1657
{
1632
- constexpr uint32_t index = detail::shuffle< V0, V1>( );
1658
+ constexpr uint32_t index = detail::shuffle ( V0, V1);
1633
1659
return _mm_shuffle_pd (self, self, index);
1634
1660
}
1635
1661
1636
1662
template <class A , uint64_t V0, uint64_t V1>
1637
1663
XSIMD_INLINE batch<uint64_t , A> swizzle (batch<uint64_t , A> const & self, batch_constant<uint64_t , A, V0, V1>, requires_arch<sse2>) noexcept
1638
1664
{
1639
- constexpr uint32_t index = detail::shuffle< 2 * V0, 2 * V0 + 1 , 2 * V1, 2 * V1 + 1 >( );
1665
+ constexpr uint32_t index = detail::shuffle ( 2 * V0, 2 * V0 + 1 , 2 * V1, 2 * V1 + 1 );
1640
1666
return _mm_shuffle_epi32 (self, index);
1641
1667
}
1642
1668
@@ -1649,7 +1675,7 @@ namespace xsimd
1649
1675
template <class A , uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
1650
1676
XSIMD_INLINE batch<uint32_t , A> swizzle (batch<uint32_t , A> const & self, batch_constant<uint32_t , A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
1651
1677
{
1652
- constexpr uint32_t index = detail::shuffle< V0, V1, V2, V3>( );
1678
+ constexpr uint32_t index = detail::shuffle ( V0, V1, V2, V3);
1653
1679
return _mm_shuffle_epi32 (self, index);
1654
1680
}
1655
1681
@@ -1663,8 +1689,8 @@ namespace xsimd
1663
1689
XSIMD_INLINE batch<int16_t , A>
1664
1690
swizzle (batch<int16_t , A> const & self, batch_constant<uint16_t , A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<sse2>) noexcept
1665
1691
{
1666
- constexpr int imm_lo = detail::mod_shuffle< V0, V1, V2, V3>( );
1667
- constexpr int imm_hi = detail::mod_shuffle< V4, V5, V6, V7>( );
1692
+ constexpr int imm_lo = detail::mod_shuffle ( V0, V1, V2, V3);
1693
+ constexpr int imm_hi = detail::mod_shuffle ( V4, V5, V6, V7);
1668
1694
// 0) identity?
1669
1695
constexpr bool identity = detail::is_identity (mask);
1670
1696
XSIMD_IF_CONSTEXPR (identity)
@@ -1735,6 +1761,7 @@ namespace xsimd
1735
1761
{
1736
1762
return bitwise_cast<uint16_t >(swizzle (bitwise_cast<int16_t >(self), mask, sse2 {}));
1737
1763
}
1764
+
1738
1765
// transpose
1739
1766
template <class A >
1740
1767
XSIMD_INLINE void transpose (batch<float , A>* matrix_begin, batch<float , A>* matrix_end, requires_arch<sse2>) noexcept
@@ -1852,7 +1879,8 @@ namespace xsimd
1852
1879
{
1853
1880
return _mm_unpacklo_pd (self, other);
1854
1881
}
1882
+
1855
1883
}
1856
1884
}
1857
1885
1858
- #endif
1886
+ #endif
0 commit comments