-
Notifications
You must be signed in to change notification settings - Fork 279
Closed
Description
Hi team,
I benchmarked the swizzles and they seem a bit slow, so I made some effort to optimize them, in particular the compile-time case where the permutation is known. I added some very common patterns in scientific computing that I think are worth hardcoding. I have not looked at AVX512 or SSE yet (I see that there is #1086), but if this is merged I will have a look.
Results:
Code: https://github.com/DiamonDinoia/cpp-learning/blob/master/xsimd/swizzles.cpp
Proposal:
// Compile-time swizzle of eight packed floats: result[i] = self[Vi].
//
// Template parameters V0..V7 are the source element indices, each in [0, 8).
// The cheapest instruction sequence is selected at compile time:
//   * identity                          -> no instruction
//   * low / high 128-bit half broadcast -> one half extract/insert
//   * in-lane, same pattern in both 128-bit lanes (e.g. the per-lane reverse
//     3,2,1,0,7,6,5,4)                  -> one _mm256_permute_ps
//   * in-lane, different lane patterns  -> one shuffle per 128-bit half
//   * anything that crosses lanes (e.g. 0,0,1,1,2,2,3,3)
//                                       -> _mm256_permutevar8x32_ps (AVX2)
//
// The per-half shuffles can only pick elements from their own 128-bit lane,
// so they are used exclusively when every index stays inside its lane;
// reducing a cross-lane index modulo 4 would silently select the wrong element.
template <uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
static inline __m256 swizzle_const_opt(__m256 self) noexcept {
    static_assert(V0 < 8 && V1 < 8 && V2 < 8 && V3 < 8 && V4 < 8 && V5 < 8 && V6 < 8 && V7 < 8,
                  "swizzle indices must be in [0, 8)");

    constexpr bool is_identity = (V0 == 0 && V1 == 1 && V2 == 2 && V3 == 3 && V4 == 4 && V5 == 5 && V6 == 6 && V7 == 7);
    constexpr bool is_dup_lo = (V0 == 0 && V1 == 1 && V2 == 2 && V3 == 3 && V4 == 0 && V5 == 1 && V6 == 2 && V7 == 3);
    constexpr bool is_dup_hi = (V0 == 4 && V1 == 5 && V2 == 6 && V3 == 7 && V4 == 4 && V5 == 5 && V6 == 6 && V7 == 7);
    // Every output element reads from its own 128-bit lane.
    constexpr bool is_in_lane = (V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4 && V4 >= 4 && V5 >= 4 && V6 >= 4 && V7 >= 4);
    // In-lane, and the upper lane applies the same pattern as the lower one.
    constexpr bool is_same_lane_pattern = is_in_lane && (V4 == V0 + 4 && V5 == V1 + 4 && V6 == V2 + 4 && V7 == V3 + 4);

    XSIMD_IF_CONSTEXPR (is_identity) {
        return self;
    } else XSIMD_IF_CONSTEXPR (is_dup_lo) {
        const __m128 lo = _mm256_castps256_ps128(self);
        return _mm256_set_m128(lo, lo);
    } else XSIMD_IF_CONSTEXPR (is_dup_hi) {
        const __m128 hi = _mm256_extractf128_ps(self, 1);
        return _mm256_set_m128(hi, hi);
    } else XSIMD_IF_CONSTEXPR (is_same_lane_pattern) {
        // One immediate is applied independently to both 128-bit lanes.
        constexpr int lane_im = _MM_SHUFFLE(int(V3 % 4), int(V2 % 4), int(V1 % 4), int(V0 % 4));
        return _mm256_permute_ps(self, lane_im);
    } else XSIMD_IF_CONSTEXPR (is_in_lane) {
        const __m128 lo = _mm256_castps256_ps128(self);
        const __m128 hi = _mm256_extractf128_ps(self, 1);
        constexpr int lo_im = _MM_SHUFFLE(int(V3 % 4), int(V2 % 4), int(V1 % 4), int(V0 % 4));
        constexpr int hi_im = _MM_SHUFFLE(int(V7 % 4), int(V6 % 4), int(V5 % 4), int(V4 % 4));
        const __m128 lo_s = _mm_shuffle_ps(lo, lo, lo_im);
        const __m128 hi_s = _mm_shuffle_ps(hi, hi, hi_im);
        // _mm256_set_m128 takes the UPPER half first, then the lower half.
        return _mm256_set_m128(hi_s, lo_s);
    } else {
        // General cross-lane permutation.
        const __m256i idx = _mm256_setr_epi32(int(V0), int(V1), int(V2), int(V3), int(V4), int(V5), int(V6), int(V7));
        return _mm256_permutevar8x32_ps(self, idx);
    }
}
// Compile-time swizzle of four packed doubles: result[i] = self[Vi].
//
// Template parameters V0..V3 are the source element indices, each in [0, 4).
// The instruction is selected at compile time:
//   * identity                               -> no instruction
//   * in-lane (V0,V1 in {0,1}; V2,V3 in {2,3}), which covers the common
//     complex-number patterns 0,0,2,2 (duplicate real), 1,1,3,3 (duplicate
//     imaginary) and 1,0,3,2 (swap)          -> one _mm256_permute_pd
//   * anything that crosses 128-bit lanes    -> one _mm256_permute4x64_pd (AVX2)
//
// The in-lane test checks the lane of each index, not only its parity:
// parity alone would also match cross-lane patterns such as 0,2,0,2, for
// which _mm256_permute_pd cannot produce the requested result.
template <uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
static inline __m256d swizzle_const_opt(__m256d self) noexcept {
    static_assert(V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4, "swizzle indices must be in [0, 4)");

    constexpr bool is_identity = (V0 == 0 && V1 == 1 && V2 == 2 && V3 == 3);
    // Every output element reads from its own 128-bit lane.
    constexpr bool is_in_lane = (V0 < 2 && V1 < 2 && V2 >= 2 && V3 >= 2);

    XSIMD_IF_CONSTEXPR (is_identity) {
        return self;
    } else XSIMD_IF_CONSTEXPR (is_in_lane) {
        // One selector bit per output element: bit i picks the low (0) or
        // high (1) double of the lane that output element i belongs to.
        constexpr int lane_mask = int(V0 & 1) | (int(V1 & 1) << 1) | (int(V2 & 1) << 2) | (int(V3 & 1) << 3);
        return _mm256_permute_pd(self, lane_mask);
    } else {
        // Two selector bits per output element, lowest element in the lowest bits.
        constexpr int permute_mask = int(V0 & 3) | (int(V1 & 3) << 2) | (int(V2 & 3) << 4) | (int(V3 & 3) << 6);
        return _mm256_permute4x64_pd(self, permute_mask);
    }
}
serge-sans-paille
Metadata
Metadata
Assignees
Labels
No labels