Skip to content

AVX swizzle seems a bit slow #1138

@DiamonDinoia

Description

@DiamonDinoia

Hi team,

I benchmarked the swizzles and they seem a bit slow, so I made some effort to optimize them, in particular the compile-time case where the permutation is known. I also added some very common patterns from scientific computing that I think are worth hardcoding. I have not looked at AVX512 or SSE yet (for SSE, I see that there is #1086), but if this is merged I will have a look.

Results:

Image

Code: https://github.com/DiamonDinoia/cpp-learning/blob/master/xsimd/swizzles.cpp

Proposal:

/// Compile-time swizzle of eight packed floats: result[i] = self[Vi].
///
/// Special-cases a few common permutations and otherwise picks between a
/// per-lane shuffle (when no element crosses a 128-bit lane boundary) and a
/// full cross-lane permute.
///
/// @tparam V0..V7  source index for each output element, each in [0, 8).
/// @param  self    input vector.
/// @return the permuted vector.
///
/// NOTE(review): the cross-lane path uses _mm256_permutevar8x32_ps, which is
/// an AVX2 instruction; confirm this kernel is only instantiated for AVX2+.
template <uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
static inline __m256 swizzle_const_opt(__m256 self) noexcept {
    static_assert(V0 < 8 && V1 < 8 && V2 < 8 && V3 < 8 && V4 < 8 && V5 < 8 && V6 < 8 && V7 < 8,
                  "swizzle indices must be in [0, 8)");

    constexpr bool is_identity = (V0 == 0 && V1 == 1 && V2 == 2 && V3 == 3 && V4 == 4 && V5 == 5 && V6 == 6 && V7 == 7);
    // Reverse *within* each 128-bit lane (not a full 8-element reverse).
    constexpr bool is_lane_reverse = (V0 == 3 && V1 == 2 && V2 == 1 && V3 == 0 && V4 == 7 && V5 == 6 && V6 == 5 && V7 == 4);
    constexpr bool is_dup_lo = (V0 == 0 && V1 == 1 && V2 == 2 && V3 == 3 && V4 == 0 && V5 == 1 && V6 == 2 && V7 == 3);
    constexpr bool is_dup_hi = (V0 == 4 && V1 == 5 && V2 == 6 && V3 == 7 && V4 == 4 && V5 == 5 && V6 == 6 && V7 == 7);
    // True when every output element is taken from its own 128-bit lane, so
    // the two halves can be shuffled independently.
    constexpr bool is_lane_local = (V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4 && V4 >= 4 && V5 >= 4 && V6 >= 4 && V7 >= 4);

    XSIMD_IF_CONSTEXPR (is_identity) {
        return self;
    } else XSIMD_IF_CONSTEXPR (is_lane_reverse) {
        __m128 lo = _mm256_castps256_ps128(self);
        __m128 hi = _mm256_extractf128_ps(self, 1);
        __m128 lo_rev = _mm_shuffle_ps(lo, lo, _MM_SHUFFLE(0, 1, 2, 3));
        __m128 hi_rev = _mm_shuffle_ps(hi, hi, _MM_SHUFFLE(0, 1, 2, 3));
        // _mm256_set_m128 takes (high, low): each reversed half must stay in
        // its own lane to match the (3,2,1,0,7,6,5,4) pattern.
        return _mm256_set_m128(hi_rev, lo_rev);
    } else XSIMD_IF_CONSTEXPR (is_dup_lo) {
        __m128 lo = _mm256_castps256_ps128(self);
        return _mm256_set_m128(lo, lo);
    } else XSIMD_IF_CONSTEXPR (is_dup_hi) {
        __m128 hi = _mm256_extractf128_ps(self, 1);
        return _mm256_set_m128(hi, hi);
    } else XSIMD_IF_CONSTEXPR (is_lane_local) {
        __m128 lo = _mm256_castps256_ps128(self);
        __m128 hi = _mm256_extractf128_ps(self, 1);

        // Indices are lane-local here, so `% 4` yields the in-lane position.
        constexpr int lo_im = _MM_SHUFFLE(int(V3 % 4), int(V2 % 4), int(V1 % 4), int(V0 % 4));
        constexpr int hi_im = _MM_SHUFFLE(int(V7 % 4), int(V6 % 4), int(V5 % 4), int(V4 % 4));

        __m128 lo_s = _mm_shuffle_ps(lo, lo, lo_im);
        __m128 hi_s = _mm_shuffle_ps(hi, hi, hi_im);

        return _mm256_set_m128(hi_s, lo_s);
    } else {
        // At least one element crosses the 128-bit lane boundary (this also
        // covers the pair-duplication pattern 0,0,1,1,2,2,3,3): a per-lane
        // shuffle cannot express it, so use the full cross-lane permute.
        __m256i idx = _mm256_setr_epi32(int(V0), int(V1), int(V2), int(V3), int(V4), int(V5), int(V6), int(V7));
        return _mm256_permutevar8x32_ps(self, idx);
    }
}

/// Compile-time swizzle of four packed doubles: result[i] = self[Vi].
///
/// Lane-local permutations (outputs 0-1 read from inputs 0-1, outputs 2-3
/// read from inputs 2-3) are done with a single in-lane permute; this covers
/// the common complex-number patterns: duplicate real parts (0,0,2,2),
/// duplicate imaginary parts (1,1,3,3) and real/imag swap (1,0,3,2).
/// Everything else goes through a cross-lane permute.
///
/// @tparam V0..V3  source index for each output element, each in [0, 4).
/// @param  self    input vector.
/// @return the permuted vector.
///
/// NOTE(review): the cross-lane path uses _mm256_permute4x64_pd, which is an
/// AVX2 instruction; confirm this kernel is only instantiated for AVX2+.
template <uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
static inline __m256d swizzle_const_opt(__m256d self) noexcept {
    static_assert(V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4, "swizzle indices must be in [0, 4)");

    constexpr bool is_identity = (V0 == 0 && V1 == 1 && V2 == 2 && V3 == 3);
    // Checking parity alone is not enough: the source lane must match too,
    // otherwise e.g. (2,0,0,2) would be mistaken for (0,0,2,2).
    constexpr bool is_lane_local = (V0 < 2 && V1 < 2 && V2 >= 2 && V3 >= 2);

    XSIMD_IF_CONSTEXPR (is_identity) {
        return self;
    } else XSIMD_IF_CONSTEXPR (is_lane_local) {
        // _mm256_permute_pd: bit i of the immediate picks the low (0) or high
        // (1) double of output i's own 128-bit lane.
        // (0,0,2,2) -> 0x0, (1,1,3,3) -> 0xF, (1,0,3,2) -> 0x5.
        constexpr int lane_mask = int(((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3));
        return _mm256_permute_pd(self, lane_mask);
    } else {
        // _mm256_permute4x64_pd: two immediate bits per output element, with
        // element 0 in the lowest bits.
        constexpr int permute_mask = int((V0 << 0) | (V1 << 2) | (V2 << 4) | (V3 << 6));
        return _mm256_permute4x64_pd(self, permute_mask);
    }
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions