Skip to content

Commit beb92a6

Browse files
committed
fixing shuffle
1 parent 06ae37c commit beb92a6

File tree

3 files changed

+10
-19
lines changed

3 files changed

+10
-19
lines changed

include/xsimd/arch/common/xsimd_common_swizzle.hpp

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -232,12 +232,8 @@ namespace xsimd
232232
template <uint32_t... Values>
233233
XSIMD_INLINE constexpr std::uint32_t shuffle() noexcept
234234
{
235-
return shuffle_impl<0,
236-
sizeof...(Values),
237-
log2_c<sizeof...(Values)>::value,
238-
Values...>::value;
235+
return shuffle_impl<0, sizeof...(Values), log2_c<sizeof...(Values)>::value, Values...>::value;
239236
}
240-
241237
template <uint32_t... Values>
242238
XSIMD_INLINE constexpr std::uint32_t mod_shuffle() noexcept
243239
{

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -964,14 +964,15 @@ namespace xsimd
964964
constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3);
965965
return _mm256_permute_pd(self, imm);
966966
}
967+
constexpr auto imm = detail::mod_shuffle<V0, V1, V2, V3>();
967968
// fallback to full 4-element permute
968-
return _mm256_permute4x64_pd(self, detail::shuffle<V0, V1, V2, V3>());
969+
return _mm256_permute4x64_pd(self, imm);
969970
}
970971

971972
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
972973
XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
973974
{
974-
constexpr auto mask = detail::shuffle<V0, V1, V2, V3>();
975+
constexpr auto mask = detail::mod_shuffle<V0, V1, V2, V3>();
975976
return _mm256_permute4x64_epi64(self, mask);
976977
}
977978
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 6 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1282,16 +1282,13 @@ namespace xsimd
12821282
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
12831283
XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<sse2>) noexcept
12841284
{
1285-
constexpr auto mask0 = detail::shuffle<2, 3, 0, 0>();
1286-
batch<T, A> step0 = _mm_shuffle_epi32(self, mask0);
1285+
batch<T, A> step0 = _mm_shuffle_epi32(self, detail::shuffle<2, 3, 0, 0>());
12871286
batch<T, A> acc0 = max(self, step0);
12881287

1289-
constexpr auto mask1 = detail::shuffle<1, 0, 0, 0>();
1290-
batch<T, A> step1 = _mm_shuffle_epi32(acc0, mask1);
1288+
batch<T, A> step1 = _mm_shuffle_epi32(acc0, detail::shuffle<1, 0, 0, 0>());
12911289
batch<T, A> acc1 = max(acc0, step1);
12921290

1293-
constexpr auto mask2 = detail::shuffle<1, 0, 0, 0>();
1294-
batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
1291+
batch<T, A> step2 = _mm_shufflelo_epi16(acc1, detail::shuffle<1, 0, 0, 0>());
12951292
batch<T, A> acc2 = max(acc1, step2);
12961293
if (sizeof(T) == 2)
12971294
return first(acc2, A {});
@@ -1304,16 +1301,13 @@ namespace xsimd
13041301
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
13051302
XSIMD_INLINE T reduce_min(batch<T, A> const& self, requires_arch<sse2>) noexcept
13061303
{
1307-
constexpr auto mask0 = detail::shuffle<2, 3, 0, 0>();
1308-
batch<T, A> step0 = _mm_shuffle_epi32(self, mask0);
1304+
batch<T, A> step0 = _mm_shuffle_epi32(self, detail::shuffle<2, 3, 0, 0>());
13091305
batch<T, A> acc0 = min(self, step0);
13101306

1311-
constexpr auto mask1 = detail::shuffle<1, 0, 0, 0>();
1312-
batch<T, A> step1 = _mm_shuffle_epi32(acc0, mask1);
1307+
batch<T, A> step1 = _mm_shuffle_epi32(acc0, detail::shuffle<1, 0, 0, 0>());
13131308
batch<T, A> acc1 = min(acc0, step1);
13141309

1315-
constexpr auto mask2 = detail::shuffle<1, 0, 0, 0>();
1316-
batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
1310+
batch<T, A> step2 = _mm_shufflelo_epi16(acc1, detail::shuffle<1, 0, 0, 0>());
13171311
batch<T, A> acc2 = min(acc1, step2);
13181312
if (sizeof(T) == 2)
13191313
return first(acc2, A {});

0 commit comments

Comments
 (0)