Skip to content

Commit 7fa61b3

Browse files
committed
removed templated shuffle
1 parent beb92a6 commit 7fa61b3

File tree

7 files changed

+58
-69
lines changed

7 files changed

+58
-69
lines changed

.github/workflows/emulated.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ jobs:
1717
- { compiler: 'clang', version: '16'}
1818
steps:
1919
- name: Setup compiler
20-
if: ${{ matrix.sys.compiler == 'gcc' }}
20+
if: ${{ matrix.sys.compiler == 'gcc' }}
2121
run: |
2222
GCC_VERSION=${{ matrix.sys.version }}
2323
sudo apt-get update
@@ -31,7 +31,7 @@ jobs:
3131
- name: Setup compiler
3232
if: ${{ matrix.sys.compiler == 'clang' }}
3333
run: |
34-
LLVM_VERSION=${{ matrix.sys.version }}
34+
LLVM_VERSION=${{ matrix.sys.version }}
3535
sudo apt-get update || exit 1
3636
sudo apt-get --no-install-suggests --no-install-recommends install clang-$LLVM_VERSION || exit 1
3737
sudo apt-get --no-install-suggests --no-install-recommends install g++ g++-multilib || exit 1
@@ -49,7 +49,7 @@ jobs:
4949
- name: Configure build
5050
env:
5151
CC: ${{ env.CC }}
52-
CXX: ${{ env.CXX }}
52+
CXX: ${{ env.CXX }}
5353
run: |
5454
5555
mkdir _build

.github/workflows/linux.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ jobs:
2929
- { compiler: 'clang', version: '18', flags: 'avx512' }
3030
steps:
3131
- name: Setup compiler
32-
if: ${{ matrix.sys.compiler == 'gcc' }}
32+
if: ${{ matrix.sys.compiler == 'gcc' }}
3333
run: |
3434
GCC_VERSION=${{ matrix.sys.version }}
3535
sudo apt-get update
@@ -45,7 +45,7 @@ jobs:
4545
- name: Setup compiler
4646
if: ${{ matrix.sys.compiler == 'clang' }}
4747
run: |
48-
LLVM_VERSION=${{ matrix.sys.version }}
48+
LLVM_VERSION=${{ matrix.sys.version }}
4949
sudo apt-get update || exit 1
5050
sudo apt-get --no-install-suggests --no-install-recommends install clang-$LLVM_VERSION || exit 1
5151
sudo apt-get --no-install-suggests --no-install-recommends install g++ g++-multilib || exit 1
@@ -66,7 +66,7 @@ jobs:
6666
- name: Configure build
6767
env:
6868
CC: ${{ env.CC }}
69-
CXX: ${{ env.CXX }}
69+
CXX: ${{ env.CXX }}
7070
run: |
7171
if [[ '${{ matrix.sys.flags }}' == 'enable_xtl_complex' ]]; then
7272
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DENABLE_XTL_COMPLEX=ON"

include/xsimd/arch/common/xsimd_common_swizzle.hpp

Lines changed: 0 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -204,41 +204,6 @@ namespace xsimd
204204
static_assert(no_duplicates_v<0, 1, 2, 3, 4, 5, 6, 7>(), "N=8: [0..7] → distinct");
205205
static_assert(!no_duplicates_v<0, 1, 2, 3, 4, 5, 6, 0>(), "N=8: last repeats 0");
206206

207-
// ────────────────────────────────────────────────────────────────────────
208-
// ────── log2 for powers of 2 ──────
209-
template <std::size_t N>
210-
struct log2_c
211-
{
212-
static_assert(N > 0 && (N & (N - 1)) == 0, "N must be power of 2");
213-
static constexpr std::size_t value = 1 + log2_c<N / 2>::value;
214-
};
215-
template <>
216-
struct log2_c<1>
217-
{
218-
static constexpr std::size_t value = 0;
219-
};
220-
221-
// ────── Recursive encoder ──────
222-
template <std::size_t I, std::size_t N, std::size_t SHIFT, uint32_t... Values>
223-
struct shuffle_impl
224-
{
225-
static constexpr uint32_t value = (get_nth_value<I, Values...>::value << (I * SHIFT)) | shuffle_impl<I + 1, N, SHIFT, Values...>::value;
226-
};
227-
template <std::size_t N, std::size_t SHIFT, uint32_t... Values>
228-
struct shuffle_impl<N, N, SHIFT, Values...>
229-
{
230-
static constexpr uint32_t value = 0;
231-
};
232-
template <uint32_t... Values>
233-
XSIMD_INLINE constexpr std::uint32_t shuffle() noexcept
234-
{
235-
return shuffle_impl<0, sizeof...(Values), log2_c<sizeof...(Values)>::value, Values...>::value;
236-
}
237-
template <uint32_t... Values>
238-
XSIMD_INLINE constexpr std::uint32_t mod_shuffle() noexcept
239-
{
240-
return shuffle<(Values % sizeof...(Values))...>();
241-
}
242207
} // namespace detail
243208
} // namespace kernel
244209
} // namespace xsimd

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1058,7 +1058,7 @@ namespace xsimd
10581058
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
10591059
XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<avx>) noexcept
10601060
{
1061-
constexpr auto mask = detail::shuffle<1, 0>();
1061+
constexpr auto mask = detail::shuffle(1, 0);
10621062
batch<T, A> step = _mm256_permute2f128_si256(self, self, mask);
10631063
batch<T, A> acc = max(self, step);
10641064
__m128i low = _mm256_castsi256_si128(acc);
@@ -1069,7 +1069,7 @@ namespace xsimd
10691069
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
10701070
XSIMD_INLINE T reduce_min(batch<T, A> const& self, requires_arch<avx>) noexcept
10711071
{
1072-
constexpr auto mask = detail::shuffle<1, 0>();
1072+
constexpr auto mask = detail::shuffle(1, 0);
10731073
batch<T, A> step = _mm256_permute2f128_si256(self, self, mask);
10741074
batch<T, A> acc = min(self, step);
10751075
__m128i low = _mm256_castsi256_si128(acc);
@@ -1214,7 +1214,7 @@ namespace xsimd
12141214
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7>
12151215
XSIMD_INLINE batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx>) noexcept
12161216
{
1217-
constexpr uint32_t smask = detail::mod_shuffle<I0, I1, I2, I3>();
1217+
constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
12181218
// shuffle within lane
12191219
if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I0 < 4 && I1 < 4 && I2 >= 8 && I2 < 12 && I3 >= 8 && I3 < 12)
12201220
return _mm256_shuffle_ps(x, y, smask);
@@ -1488,7 +1488,7 @@ namespace xsimd
14881488
auto split = _mm256_permute2f128_ps(self, self, control);
14891489
if (!is_dup_identity)
14901490
{
1491-
constexpr auto shuffle_mask = is_dup_low ? detail::mod_shuffle<V0, V1, V2, V3>() : detail::mod_shuffle<V4 - 4, V5 - 4, V6 - 4, V7 - 4>();
1491+
constexpr auto shuffle_mask = is_dup_low ? detail::mod_shuffle(V0, V1, V2, V3) : detail::mod_shuffle(V4 - 4, V5 - 4, V6 - 4, V7 - 4);
14921492
split = _mm256_permute_ps(split, shuffle_mask);
14931493
}
14941494
return split;
@@ -1910,4 +1910,4 @@ namespace xsimd
19101910
}
19111911
}
19121912

1913-
#endif
1913+
#endif

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -964,15 +964,15 @@ namespace xsimd
964964
constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3);
965965
return _mm256_permute_pd(self, imm);
966966
}
967-
constexpr auto imm = detail::mod_shuffle<V0, V1, V2, V3>();
967+
constexpr auto imm = detail::mod_shuffle(V0, V1, V2, V3);
968968
// fallback to full 4-element permute
969969
return _mm256_permute4x64_pd(self, imm);
970970
}
971971

972972
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
973973
XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
974974
{
975-
constexpr auto mask = detail::mod_shuffle<V0, V1, V2, V3>();
975+
constexpr auto mask = detail::mod_shuffle(V0, V1, V2, V3);
976976
return _mm256_permute4x64_epi64(self, mask);
977977
}
978978
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>

include/xsimd/arch/xsimd_common_fwd.hpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,6 @@ namespace xsimd
5050
XSIMD_INLINE constexpr bool is_dup_lo(batch_constant<T, A, Vs...>) noexcept;
5151
template <typename T, class A, T... Vs>
5252
XSIMD_INLINE constexpr bool is_dup_hi(batch_constant<T, A, Vs...>) noexcept;
53-
template <uint32_t... Values>
54-
XSIMD_INLINE constexpr std::uint32_t shuffle() noexcept;
55-
template <uint32_t... Values>
56-
XSIMD_INLINE constexpr std::uint32_t mod_shuffle() noexcept;
5753
template <typename T, class A, T... Vs>
5854
XSIMD_INLINE constexpr bool is_cross_lane(batch_constant<T, A, Vs...>) noexcept;
5955
template <typename T, class A, T... Vs>

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,28 @@ namespace xsimd
3333
{
3434
using namespace types;
3535

36+
namespace detail
{
    // Compile-time builders for the immediate operand of the shuffle /
    // permute intrinsics used in this file (e.g. _mm_shuffle_epi32,
    // _mm_shuffle_ps, _mm_shuffle_pd). They take plain function arguments
    // so they can be called with any integral constant expression.
    //
    // None of the `shuffle` overloads masks its inputs: an out-of-range
    // index spills into the neighbouring bit-field. Use `mod_shuffle` when
    // the indices may exceed the field width.

    // Packs four indices into consecutive 2-bit fields:
    // w -> bits [1:0], x -> bits [3:2], y -> bits [5:4], z -> bits [7:6].
    // Each argument is expected to be in [0, 3].
    constexpr uint32_t shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z) noexcept
    {
        return (z << 6) | (y << 4) | (x << 2) | w;
    }

    // Packs two indices into consecutive 1-bit fields:
    // x -> bit 0, y -> bit 1. Each argument is expected to be 0 or 1.
    constexpr uint32_t shuffle(uint32_t x, uint32_t y) noexcept
    {
        return (y << 1) | x;
    }

    // Same encoding as the four-index `shuffle`, but every index is first
    // reduced modulo 4 so that any value fits its 2-bit field.
    constexpr uint32_t mod_shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z) noexcept
    {
        return shuffle(w % 4, x % 4, y % 4, z % 4);
    }

    // Same encoding as the two-index `shuffle`, but every index is first
    // reduced modulo 2 so that any value fits its 1-bit field.
    constexpr uint32_t mod_shuffle(uint32_t w, uint32_t x) noexcept
    {
        return shuffle(w % 2, x % 2);
    }
}
57+
3658
// fwd
3759
template <class A, class T, size_t I>
3860
XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<common>) noexcept;
@@ -1282,13 +1304,16 @@ namespace xsimd
12821304
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
12831305
XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<sse2>) noexcept
12841306
{
1285-
batch<T, A> step0 = _mm_shuffle_epi32(self, detail::shuffle<2, 3, 0, 0>());
1307+
constexpr auto mask0 = detail::shuffle(2, 3, 0, 0);
1308+
batch<T, A> step0 = _mm_shuffle_epi32(self, mask0);
12861309
batch<T, A> acc0 = max(self, step0);
12871310

1288-
batch<T, A> step1 = _mm_shuffle_epi32(acc0, detail::shuffle<1, 0, 0, 0>());
1311+
constexpr auto mask1 = detail::shuffle(1, 0, 0, 0);
1312+
batch<T, A> step1 = _mm_shuffle_epi32(acc0, mask1);
12891313
batch<T, A> acc1 = max(acc0, step1);
12901314

1291-
batch<T, A> step2 = _mm_shufflelo_epi16(acc1, detail::shuffle<1, 0, 0, 0>());
1315+
constexpr auto mask2 = detail::shuffle(1, 0, 0, 0);
1316+
batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
12921317
batch<T, A> acc2 = max(acc1, step2);
12931318
if (sizeof(T) == 2)
12941319
return first(acc2, A {});
@@ -1301,13 +1326,16 @@ namespace xsimd
13011326
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
13021327
XSIMD_INLINE T reduce_min(batch<T, A> const& self, requires_arch<sse2>) noexcept
13031328
{
1304-
batch<T, A> step0 = _mm_shuffle_epi32(self, detail::shuffle<2, 3, 0, 0>());
1329+
constexpr auto mask0 = detail::shuffle(2, 3, 0, 0);
1330+
batch<T, A> step0 = _mm_shuffle_epi32(self, mask0);
13051331
batch<T, A> acc0 = min(self, step0);
13061332

1307-
batch<T, A> step1 = _mm_shuffle_epi32(acc0, detail::shuffle<1, 0, 0, 0>());
1333+
constexpr auto mask1 = detail::shuffle(1, 0, 0, 0);
1334+
batch<T, A> step1 = _mm_shuffle_epi32(acc0, mask1);
13081335
batch<T, A> acc1 = min(acc0, step1);
13091336

1310-
batch<T, A> step2 = _mm_shufflelo_epi16(acc1, detail::shuffle<1, 0, 0, 0>());
1337+
constexpr auto mask2 = detail::shuffle(1, 0, 0, 0);
1338+
batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
13111339
batch<T, A> acc2 = min(acc1, step2);
13121340
if (sizeof(T) == 2)
13131341
return first(acc2, A {});
@@ -1355,7 +1383,7 @@ namespace xsimd
13551383
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
13561384
XSIMD_INLINE batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<sse2>) noexcept
13571385
{
1358-
constexpr uint32_t smask = detail::mod_shuffle<I0, I1, I2, I3>();
1386+
constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
13591387
// shuffle within lane
13601388
if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4)
13611389
return _mm_shuffle_ps(x, y, smask);
@@ -1369,7 +1397,7 @@ namespace xsimd
13691397
template <class A, class ITy, ITy I0, ITy I1>
13701398
XSIMD_INLINE batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1> mask, requires_arch<sse2>) noexcept
13711399
{
1372-
constexpr uint32_t smask = detail::mod_shuffle<I0, I1>();
1400+
constexpr uint32_t smask = detail::mod_shuffle(I0, I1);
13731401
// shuffle within lane
13741402
if (I0 < 2 && I1 >= 2)
13751403
return _mm_shuffle_pd(x, y, smask);
@@ -1617,26 +1645,24 @@ namespace xsimd
16171645
return _mm_sub_pd(self, other);
16181646
}
16191647

1620-
// swizzle
1621-
16221648
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
16231649
XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
16241650
{
1625-
constexpr uint32_t index = detail::shuffle<V0, V1, V2, V3>();
1651+
constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
16261652
return _mm_shuffle_ps(self, self, index);
16271653
}
16281654

16291655
template <class A, uint64_t V0, uint64_t V1>
16301656
XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<sse2>) noexcept
16311657
{
1632-
constexpr uint32_t index = detail::shuffle<V0, V1>();
1658+
constexpr uint32_t index = detail::shuffle(V0, V1);
16331659
return _mm_shuffle_pd(self, self, index);
16341660
}
16351661

16361662
template <class A, uint64_t V0, uint64_t V1>
16371663
XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<sse2>) noexcept
16381664
{
1639-
constexpr uint32_t index = detail::shuffle<2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1>();
1665+
constexpr uint32_t index = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1);
16401666
return _mm_shuffle_epi32(self, index);
16411667
}
16421668

@@ -1649,7 +1675,7 @@ namespace xsimd
16491675
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
16501676
XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
16511677
{
1652-
constexpr uint32_t index = detail::shuffle<V0, V1, V2, V3>();
1678+
constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
16531679
return _mm_shuffle_epi32(self, index);
16541680
}
16551681

@@ -1663,8 +1689,8 @@ namespace xsimd
16631689
XSIMD_INLINE batch<int16_t, A>
16641690
swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<sse2>) noexcept
16651691
{
1666-
constexpr int imm_lo = detail::mod_shuffle<V0, V1, V2, V3>();
1667-
constexpr int imm_hi = detail::mod_shuffle<V4, V5, V6, V7>();
1692+
constexpr int imm_lo = detail::mod_shuffle(V0, V1, V2, V3);
1693+
constexpr int imm_hi = detail::mod_shuffle(V4, V5, V6, V7);
16681694
// 0) identity?
16691695
constexpr bool identity = detail::is_identity(mask);
16701696
XSIMD_IF_CONSTEXPR(identity)
@@ -1735,6 +1761,7 @@ namespace xsimd
17351761
{
17361762
return bitwise_cast<uint16_t>(swizzle(bitwise_cast<int16_t>(self), mask, sse2 {}));
17371763
}
1764+
17381765
// transpose
17391766
template <class A>
17401767
XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<sse2>) noexcept
@@ -1852,7 +1879,8 @@ namespace xsimd
18521879
{
18531880
return _mm_unpacklo_pd(self, other);
18541881
}
1882+
18551883
}
18561884
}
18571885

1858-
#endif
1886+
#endif

0 commit comments

Comments
 (0)