diff --git a/.github/workflows/emulated.yml b/.github/workflows/emulated.yml
index df64cd2cd..acd315de2 100644
--- a/.github/workflows/emulated.yml
+++ b/.github/workflows/emulated.yml
@@ -17,7 +17,7 @@ jobs:
           - { compiler: 'clang', version: '16'}
     steps:
     - name: Setup compiler
-      if: ${{ matrix.sys.compiler == 'gcc' }}
+      if: ${{ matrix.sys.compiler == 'gcc' }}
       run: |
         GCC_VERSION=${{ matrix.sys.version }}
         sudo apt-get update
@@ -31,7 +31,7 @@ jobs:
     - name: Setup compiler
       if: ${{ matrix.sys.compiler == 'clang' }}
       run: |
-        LLVM_VERSION=${{ matrix.sys.version }}
+        LLVM_VERSION=${{ matrix.sys.version }}
         sudo apt-get update || exit 1
         sudo apt-get --no-install-suggests --no-install-recommends install clang-$LLVM_VERSION || exit 1
         sudo apt-get --no-install-suggests --no-install-recommends install g++ g++-multilib || exit 1
@@ -49,7 +49,7 @@ jobs:
     - name: Configure build
       env:
         CC: ${{ env.CC }}
-        CXX: ${{ env.CXX }}
+        CXX: ${{ env.CXX }}
       run: |
         mkdir _build
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 94b4ddc45..5ebcab4c2 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -29,7 +29,7 @@ jobs:
           - { compiler: 'clang', version: '18', flags: 'avx512' }
     steps:
     - name: Setup compiler
-      if: ${{ matrix.sys.compiler == 'gcc' }}
+      if: ${{ matrix.sys.compiler == 'gcc' }}
       run: |
         GCC_VERSION=${{ matrix.sys.version }}
         sudo apt-get update
@@ -45,7 +45,7 @@ jobs:
     - name: Setup compiler
       if: ${{ matrix.sys.compiler == 'clang' }}
      run: |
-        LLVM_VERSION=${{ matrix.sys.version }}
+        LLVM_VERSION=${{ matrix.sys.version }}
         sudo apt-get update || exit 1
         sudo apt-get --no-install-suggests --no-install-recommends install clang-$LLVM_VERSION || exit 1
         sudo apt-get --no-install-suggests --no-install-recommends install g++ g++-multilib || exit 1
@@ -66,7 +66,7 @@ jobs:
     - name: Configure build
       env:
         CC: ${{ env.CC }}
-        CXX: ${{ env.CXX }}
+        CXX: ${{ env.CXX }}
       run: |
         if [[ '${{ matrix.sys.flags }}' == 'enable_xtl_complex' ]]; then
           CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DENABLE_XTL_COMPLEX=ON"
diff --git a/include/xsimd/arch/common/xsimd_common_swizzle.hpp b/include/xsimd/arch/common/xsimd_common_swizzle.hpp
new file mode 100644
index 000000000..356f587f8
--- /dev/null
+++ b/include/xsimd/arch/common/xsimd_common_swizzle.hpp
@@ -0,0 +1,211 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
+ * Martin Renou                                                            *
+ * Copyright (c) QuantStack                                                *
+ * Copyright (c) Serge Guelton                                             *
+ * Copyright (c) Marco Barbone                                             *
+ *                                                                         *
+ * Distributed under the terms of the BSD 3-Clause License.                *
+ *                                                                         *
+ * The full license is in the file LICENSE, distributed with this software.*
+ ****************************************************************************/
+#ifndef XSIMD_COMMON_SWIZZLE_HPP
+#define XSIMD_COMMON_SWIZZLE_HPP
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+namespace xsimd
+{
+    template <typename T, class A, T... Values>
+    struct batch_constant;
+
+    namespace kernel
+    {
+        namespace detail
+        {
+            // ────────────────────────────────────────────────────────────────────────
+            // get_at → the I-th element of the pack
+            template <std::size_t I, typename T, T V0, T... Vs>
+            struct get_at
+            {
+                static constexpr T value = get_at<I - 1, T, Vs...>::value;
+            };
+            template <typename T, T V0, T... Vs>
+            struct get_at<0, T, V0, Vs...>
+            {
+                static constexpr T value = V0;
+            };
+
+            // ────────────────────────────────────────────────────────────────────────
+            // 1) identity_impl
+            template <std::size_t I, typename T>
+            XSIMD_INLINE constexpr bool identity_impl() noexcept { return true; }
+            template <std::size_t I, typename T, T V0, T... Vs>
+            XSIMD_INLINE constexpr bool identity_impl() noexcept
+            {
+                return V0 == static_cast<T>(I)
+                    && identity_impl<I + 1, T, Vs...>();
+            }
+
+            // ────────────────────────────────────────────────────────────────────────
+            // 2) bitmask_impl
+            template <std::size_t I, std::size_t N, typename T>
+            XSIMD_INLINE constexpr std::uint32_t bitmask_impl() noexcept { return 0u; }
+            template <std::size_t I, std::size_t N, typename T, T V0, T... Vs>
+            XSIMD_INLINE constexpr std::uint32_t bitmask_impl() noexcept
+            {
+                return (1u << (static_cast<std::uint32_t>(V0) & (N - 1)))
+                    | bitmask_impl<I + 1, N, T, Vs...>();
+            }
+
+            // ────────────────────────────────────────────────────────────────────────
+            // 3) dup_lo_impl
+            template <std::size_t I, std::size_t N, typename T, T... Vs,
+                      typename std::enable_if<I == N / 2, int>::type = 0>
+            XSIMD_INLINE constexpr bool dup_lo_impl() noexcept { return true; }
+
+            template <std::size_t I, std::size_t N, typename T, T... Vs,
+                      typename std::enable_if<(I < N / 2), int>::type = 0>
+            XSIMD_INLINE constexpr bool dup_lo_impl() noexcept
+            {
+                return get_at<I, T, Vs...>::value < static_cast<T>(N / 2)
+                    && get_at<I, T, Vs...>::value == get_at<I + N / 2, T, Vs...>::value
+                    && dup_lo_impl<I + 1, N, T, Vs...>();
+            }
+
+            // ────────────────────────────────────────────────────────────────────────
+            // 4) dup_hi_impl
+            template <std::size_t I, std::size_t N, typename T, T... Vs,
+                      typename std::enable_if<I == N / 2, int>::type = 0>
+            XSIMD_INLINE constexpr bool dup_hi_impl() noexcept { return true; }
+
+            template <std::size_t I, std::size_t N, typename T, T... Vs,
+                      typename std::enable_if<(I < N / 2), int>::type = 0>
+            XSIMD_INLINE constexpr bool dup_hi_impl() noexcept
+            {
+                return get_at<I, T, Vs...>::value >= static_cast<T>(N / 2)
+                    && get_at<I, T, Vs...>::value < static_cast<T>(N)
+                    && get_at<I, T, Vs...>::value == get_at<I + N / 2, T, Vs...>::value
+                    && dup_hi_impl<I + 1, N, T, Vs...>();
+            }
+
+            // ────────────────────────────────────────────────────────────────────────
+            // 1) helper to get the I-th value from the Vs pack
+            template <std::size_t I, uint32_t Head, uint32_t... Tail>
+            struct get_nth_value
+            {
+                static constexpr uint32_t value = get_nth_value<I - 1, Tail...>::value;
+            };
+            template <uint32_t Head, uint32_t... Tail>
+            struct get_nth_value<0, Head, Tail...>
+            {
+                static constexpr uint32_t value = Head;
+            };
+
+            // ────────────────────────────────────────────────────────────────────────
+            // 2) recursive cross-lane test: true if any output lane i pulls from the opposite half
+            template <std::size_t I, std::size_t N, std::size_t H, uint32_t... Vs>
+            struct cross_impl
+            {
+                // does element I cross? (i.e. i < H but V >= H) or (i >= H but V < H)
+                static constexpr uint32_t Vi = get_nth_value<I, Vs...>::value;
+                static constexpr bool curr = (I < H ? (Vi >= H) : (Vi < H));
+                static constexpr bool next = cross_impl<I + 1, N, H, Vs...>::value;
+                static constexpr bool value = curr || next;
+            };
+            template <std::size_t N, std::size_t H, uint32_t... Vs>
+            struct cross_impl<N, N, H, Vs...>
+            {
+                static constexpr bool value = false;
+            };
+            template <std::size_t I, std::size_t N, typename T, T... Vs>
+            XSIMD_INLINE constexpr bool no_duplicates_impl() noexcept
+            {
+                // build the bitmask of (Vs & (N-1)) across all lanes
+                return detail::bitmask_impl<0, N, T, Vs...>() == ((1u << N) - 1u);
+            }
+            template <uint32_t... Vs>
+            XSIMD_INLINE constexpr bool no_duplicates_v() noexcept
+            {
+                // forward to the existing no_duplicates_impl
+                return no_duplicates_impl<0, sizeof...(Vs), uint32_t, Vs...>();
+            }
+            template <uint32_t... Vs>
+            XSIMD_INLINE constexpr bool is_cross_lane() noexcept
+            {
+                static_assert(sizeof...(Vs) >= 1, "Need at least one lane");
+                return cross_impl<0, sizeof...(Vs), sizeof...(Vs) / 2, Vs...>::value;
+            }
+            template <typename T, T... Vs>
+            XSIMD_INLINE constexpr bool is_identity() noexcept { return detail::identity_impl<0, T, Vs...>(); }
+            template <typename T, T... Vs>
+            XSIMD_INLINE constexpr bool is_all_different() noexcept
+            {
+                return detail::bitmask_impl<0, sizeof...(Vs), T, Vs...>() == ((1u << sizeof...(Vs)) - 1);
+            }
+
+            template <typename T, T... Vs>
+            XSIMD_INLINE constexpr bool is_dup_lo() noexcept { return detail::dup_lo_impl<0, sizeof...(Vs), T, Vs...>(); }
+            template <typename T, T... Vs>
+            XSIMD_INLINE constexpr bool is_dup_hi() noexcept { return detail::dup_hi_impl<0, sizeof...(Vs), T, Vs...>(); }
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_identity(batch_constant<T, A, Vs...>) noexcept { return is_identity<T, Vs...>(); }
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_all_different(batch_constant<T, A, Vs...>) noexcept { return is_all_different<T, Vs...>(); }
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_dup_lo(batch_constant<T, A, Vs...>) noexcept { return is_dup_lo<T, Vs...>(); }
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_dup_hi(batch_constant<T, A, Vs...>) noexcept { return is_dup_hi<T, Vs...>(); }
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_cross_lane(batch_constant<T, A, Vs...>) noexcept { return detail::is_cross_lane<Vs...>(); }
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool no_duplicates(batch_constant<T, A, Vs...>) noexcept { return no_duplicates_impl<0, sizeof...(Vs), T, Vs...>(); }
+            // ────────────────────────────────────────────────────────────────────────
+            // compile-time tests (identity, all-different, dup-lo, dup-hi)
+            // 8-lane identity
+            static_assert(is_identity<uint32_t, 0, 1, 2, 3, 4, 5, 6, 7>(), "identity failed");
+            // 8-lane reverse is all-different but not identity
+            static_assert(is_all_different<uint32_t, 7, 6, 5, 4, 3, 2, 1, 0>(), "all-diff failed");
+            static_assert(!is_identity<uint32_t, 7, 6, 5, 4, 3, 2, 1, 0>(), "identity on reverse");
+            // 8-lane dup-lo (repeat 0..3 twice)
+            static_assert(is_dup_lo<uint32_t, 0, 1, 2, 3, 0, 1, 2, 3>(), "dup_lo failed");
+            static_assert(!is_dup_hi<uint32_t, 0, 1, 2, 3, 0, 1, 2, 3>(), "dup_hi on dup_lo");
+            // 8-lane dup-hi (repeat 4..7 twice)
+            static_assert(is_dup_hi<uint32_t, 4, 5, 6, 7, 4, 5, 6, 7>(), "dup_hi failed");
+            static_assert(!is_dup_lo<uint32_t, 4, 5, 6, 7, 4, 5, 6, 7>(), "dup_lo on dup_hi");
+            // ────────────────────────────────────────────────────────────────────────
+            // 4-lane identity
+            static_assert(is_identity<uint32_t, 0, 1, 2, 3>(), "4-lane identity failed");
+            // 4-lane reverse all-different but not identity
+            static_assert(is_all_different<uint32_t, 3, 2, 1, 0>(), "4-lane all-diff failed");
+            static_assert(!is_identity<uint32_t, 3, 2, 1, 0>(), "4-lane identity on reverse");
+            // 4-lane dup-lo (repeat 0..1 twice)
+            static_assert(is_dup_lo<uint32_t, 0, 1, 0, 1>(), "4-lane dup_lo failed");
+            static_assert(!is_dup_hi<uint32_t, 0, 1, 0, 1>(), "4-lane dup_hi on dup_lo");
+            // 4-lane dup-hi (repeat 2..3 twice)
+            static_assert(is_dup_hi<uint32_t, 2, 3, 2, 3>(), "4-lane dup_hi failed");
+            static_assert(!is_dup_lo<uint32_t, 2, 3, 2, 3>(), "4-lane dup_lo on dup_hi");
+
+            static_assert(is_cross_lane<0, 1, 0, 1>(), "dup-lo only → crossing");
+            static_assert(is_cross_lane<2, 3, 2, 3>(), "dup-hi only → crossing");
+            static_assert(is_cross_lane<0, 3, 3, 3>(), "one low + rest high → crossing");
+            static_assert(!is_cross_lane<1, 0, 2, 3>(), "mixed low/high → no crossing");
+            static_assert(!is_cross_lane<0, 1, 2, 3>(), "mixed low/high → no crossing");
+
+            static_assert(no_duplicates_v<0, 1, 2, 3>(), "N=4: [0,1,2,3] → distinct");
+            static_assert(!no_duplicates_v<0, 1, 2, 2>(), "N=4: [0,1,2,2] → dup");
+
+            static_assert(no_duplicates_v<0, 1, 2, 3, 4, 5, 6, 7>(), "N=8: [0..7] → distinct");
+            static_assert(!no_duplicates_v<0, 1, 2, 3, 4, 5, 6, 0>(), "N=8: last repeats 0");
+
+        } // namespace detail
+    } // namespace kernel
+} // namespace xsimd
+
+#endif // XSIMD_COMMON_SWIZZLE_HPP
diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index 76eed49c2..ed3319d4e 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -3,6 +3,7 @@
  * Martin Renou                                                            *
  * Copyright (c) QuantStack                                                *
  * Copyright (c) Serge Guelton                                             *
+ * Copyright (c) Marco Barbone                                             *
  *                                                                         *
  * Distributed under the terms of the BSD 3-Clause License.                *
  *                                                                         *
@@ -1417,23 +1418,19 @@ namespace xsimd
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx>) noexcept
         {
             // duplicate low and high part of input
-            __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
-            __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0);
-
-            __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self));
-            __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);
+            // Duplicate lanes separately
+            // 1) duplicate low and high lanes
+            __m256 lo = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
+            __m256 hi = _mm256_permute2f128_ps(self, self, 0x11); // [high| high]

             // normalize mask
             batch<uint32_t, A> half_mask = mask % 4;

             // permute within each lane
-            __m256 r0 = _mm256_permutevar_ps(low_low, half_mask);
-            __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask);
+            __m256 r0 = _mm256_permutevar_ps(lo, half_mask);
+            __m256 r1 = _mm256_permutevar_ps(hi, half_mask);

-            // mask to choose the right lane
             batch_bool<uint32_t, A> blend_mask = mask >= 4;
-
-            // blend the two permutes
             return _mm256_blendv_ps(r0, r1, batch_bool_cast<float>(blend_mask));
         }

@@ -1441,18 +1438,15 @@ namespace xsimd
         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx>) noexcept
         {
             // duplicate low and high part of input
-            __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
-            __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0);
-
-            __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self));
-            __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
+            __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
+            __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);

             // normalize mask
             batch<uint64_t, A> half_mask = -(mask & 1);

             // permute within each lane
-            __m256d r0 = _mm256_permutevar_pd(low_low, half_mask);
-            __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask);
+            __m256d r0 = _mm256_permutevar_pd(lo, half_mask);
+            __m256d r1 = _mm256_permutevar_pd(hi, half_mask);

             // mask to choose the right lane
             batch_bool<uint64_t, A> blend_mask = mask >= 2;
@@ -1478,53 +1472,67 @@ namespace xsimd
         // swizzle (constant mask)
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
+        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx>) noexcept
         {
-            // duplicate low and high part of input
-            __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
-            __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0);
-
-            __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self));
-            __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);
+            constexpr bool is_identity = detail::is_identity(mask);
+            constexpr bool is_dup_low = detail::is_dup_lo(mask);
+            constexpr bool is_dup_hi = detail::is_dup_hi(mask);
+            constexpr bool is_dup = is_dup_low || is_dup_hi;
+            XSIMD_IF_CONSTEXPR(is_identity)
+            {
+                return self;
+            }
+            XSIMD_IF_CONSTEXPR(is_dup)
+            {
+                constexpr auto control = is_dup_low ? 0x00 : 0x11;
+                constexpr auto is_dup_identity = is_dup_low ? detail::is_identity<uint32_t, V0, V1, V2, V3>() : detail::is_identity<uint32_t, V4 - 4, V5 - 4, V6 - 4, V7 - 4>();
+                auto split = _mm256_permute2f128_ps(self, self, control);
+                XSIMD_IF_CONSTEXPR(!is_dup_identity)
+                {
+                    constexpr auto shuffle_mask = is_dup_low ? detail::mod_shuffle(V0, V1, V2, V3) : detail::mod_shuffle(V4 - 4, V5 - 4, V6 - 4, V7 - 4);
+                    split = _mm256_permute_ps(split, shuffle_mask);
+                }
+                return split;
+            }
+            // Duplicate lanes separately
+            // 1) duplicate low and high lanes
+            __m256 low_dup = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
+            __m256 hi_dup = _mm256_permute2f128_ps(self, self, 0x11); // [high| high]

-            // normalize mask
-            batch_constant<uint32_t, A, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
+            // 2) build lane-local index vector (each element = source_index & 3)
+            constexpr batch_constant<uint32_t, A, (V0 & 3), (V1 & 3), (V2 & 3), (V3 & 3), (V4 & 3), (V5 & 3), (V6 & 3), (V7 & 3)> half_mask;

-            // permute within each lane
-            __m256 r0 = _mm256_permutevar_ps(low_low, half_mask.as_batch());
-            __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask.as_batch());
+            __m256 r0 = _mm256_permutevar_ps(low_dup, half_mask.as_batch()); // pick from low lane
+            __m256 r1 = _mm256_permutevar_ps(hi_dup, half_mask.as_batch()); // pick from high lane

-            // mask to choose the right lane
-            batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;
+            constexpr batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> lane_mask {};

-            // blend the two permutes
-            constexpr auto mask = blend_mask.mask();
-            return _mm256_blend_ps(r0, r1, mask);
+            return _mm256_blend_ps(r0, r1, lane_mask.mask());
         }

         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx>) noexcept
+        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx>) noexcept
         {
+            // cannot use detail::mod_shuffle as the mod and shift are different in this case
+            constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3);
+            XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; }
+            XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
+            {
+                return _mm256_permute_pd(self, imm);
+            }
             // duplicate low and high part of input
-            __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
-            __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0);
-
-            __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self));
-            __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
-
-            // normalize mask
-            batch_constant<uint64_t, A, ((V0 & 1) << 1), ((V1 & 1) << 1), ((V2 & 1) << 1), ((V3 & 1) << 1)> half_mask;
+            __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
+            __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);

             // permute within each lane
-            __m256d r0 = _mm256_permutevar_pd(low_low, half_mask.as_batch());
-            __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask.as_batch());
+            __m256d r0 = _mm256_permute_pd(lo, imm);
+            __m256d r1 = _mm256_permute_pd(hi, imm);

             // mask to choose the right lane
-            batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
+            constexpr batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
             // blend the two permutes
-            constexpr auto mask = blend_mask.mask();
-            return _mm256_blend_pd(r0, r1, mask);
+            return _mm256_blend_pd(r0, r1, blend_mask.mask());
         }
diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
--- a/include/xsimd/arch/xsimd_avx2.hpp
+++ b/include/xsimd/arch/xsimd_avx2.hpp
         template <class A>
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
         {
-            return _mm256_permutevar8x32_ps(self, mask);
+            return swizzle(self, mask, avx {});
         }
-
         template <class A>
         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
         {
@@ -930,7 +928,7 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
         {
-            return _mm256_permutevar8x32_epi32(self, mask);
+            return swizzle(self, mask, avx {});
         }
         template <class A>
         XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
         {
@@ -942,20 +940,33 @@ namespace xsimd
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
         {
-            return _mm256_permutevar8x32_ps(self, mask.as_batch());
+            XSIMD_IF_CONSTEXPR(detail::is_all_different(mask) && !detail::is_identity(mask))
+            {
+                // The intrinsic does NOT allow to copy the same element of the source vector to more than one element of the destination vector.
+                // one-shot 8-lane permute
+                return _mm256_permutevar8x32_ps(self, mask.as_batch());
+            }
+            return swizzle(self, mask, avx {});
         }
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
         {
-            constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
-            return _mm256_permute4x64_pd(self, mask);
+            XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; }
+            XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
+            {
+                constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3);
+                return _mm256_permute_pd(self, imm);
+            }
+            constexpr auto imm = detail::mod_shuffle(V0, V1, V2, V3);
+            // fallback to full 4-element permute
+            return _mm256_permute4x64_pd(self, imm);
         }
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
         XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
         {
-            constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
+            constexpr auto mask = detail::mod_shuffle(V0, V1, V2, V3);
             return _mm256_permute4x64_epi64(self, mask);
         }
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
diff --git a/include/xsimd/arch/xsimd_common.hpp b/include/xsimd/arch/xsimd_common.hpp
index 4f8c09f37..ccd4b47c6 100644
--- a/include/xsimd/arch/xsimd_common.hpp
+++ b/include/xsimd/arch/xsimd_common.hpp
@@ -18,6 +18,7 @@
 #include "./common/xsimd_common_math.hpp"
 #include "./common/xsimd_common_memory.hpp"
 #include "./common/xsimd_common_rounding.hpp"
+#include "./common/xsimd_common_swizzle.hpp"
 #include "./common/xsimd_common_trigo.hpp"

 #endif
diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp
index 26a9bdbdb..aa9a0c163 100644
--- a/include/xsimd/arch/xsimd_common_fwd.hpp
+++ b/include/xsimd/arch/xsimd_common_fwd.hpp
@@ -3,6 +3,7 @@
  * Martin Renou                                                            *
  * Copyright (c) QuantStack                                                *
  * Copyright (c) Serge Guelton                                             *
+ * Copyright (c) Marco Barbone                                             *
  *                                                                         *
  * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                          *
@@ -37,7 +38,25 @@ namespace xsimd
         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE T hadd(batch<T, A> const& self, requires_arch<common>) noexcept;
+        // Forward declarations for pack-level helpers
+        namespace detail
+        {
+            template <typename T, T... Vs>
+            XSIMD_INLINE constexpr bool is_identity() noexcept;
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_identity(batch_constant<T, A, Vs...>) noexcept;
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_all_different(batch_constant<T, A, Vs...>) noexcept;
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_dup_lo(batch_constant<T, A, Vs...>) noexcept;
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_dup_hi(batch_constant<T, A, Vs...>) noexcept;
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_cross_lane(batch_constant<T, A, Vs...>) noexcept;
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool no_duplicates(batch_constant<T, A, Vs...>) noexcept;
+        }
     }
 }
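Note (not part of the patch): the core of this change is classifying a constant swizzle mask at compile time so the AVX/AVX2 swizzle kernels can pick a cheaper instruction sequence. Below is a minimal, self-contained sketch of that classification, assuming C++14; it uses a plain std::array instead of xsimd's batch_constant, and the names mask_kind and classify are illustrative only, they do not exist in xsimd.

#include <array>
#include <cstddef>
#include <cstdint>

// Mirrors the intent of xsimd::kernel::detail::is_identity / is_dup_lo /
// is_dup_hi / is_cross_lane from xsimd_common_swizzle.hpp.
enum class mask_kind { identity, dup_lo, dup_hi, in_lane, cross_lane };

template <std::size_t N>
constexpr mask_kind classify(const std::array<std::uint32_t, N> m)
{
    constexpr std::size_t h = N / 2;
    bool identity = true, dup_lo = true, dup_hi = true, cross = false;
    for (std::size_t i = 0; i < N; ++i)
    {
        identity = identity && (m[i] == i);
        // lane crossing: a low output pulls from the high half or vice versa
        cross = cross || (i < h ? m[i] >= h : m[i] < h);
    }
    for (std::size_t i = 0; i < h; ++i)
    {
        dup_lo = dup_lo && (m[i] < h) && (m[i] == m[i + h]);
        dup_hi = dup_hi && (m[i] >= h) && (m[i] < N) && (m[i] == m[i + h]);
    }
    if (identity) return mask_kind::identity; // swizzle can return self unchanged
    if (dup_lo) return mask_kind::dup_lo;     // one _mm256_permute2f128_ps(self, self, 0x00)
    if (dup_hi) return mask_kind::dup_hi;     // one _mm256_permute2f128_ps(self, self, 0x11)
    if (!cross) return mask_kind::in_lane;    // single in-lane _mm256_permute_ps / _mm256_permute_pd
    return mask_kind::cross_lane;             // duplicate both lanes, permute, then blend
}

// Same patterns as the static_asserts in xsimd_common_swizzle.hpp:
static_assert(classify<8>({ 0, 1, 2, 3, 4, 5, 6, 7 }) == mask_kind::identity, "identity");
static_assert(classify<8>({ 0, 1, 2, 3, 0, 1, 2, 3 }) == mask_kind::dup_lo, "dup-lo");
static_assert(classify<8>({ 4, 5, 6, 7, 4, 5, 6, 7 }) == mask_kind::dup_hi, "dup-hi");
static_assert(classify<4>({ 1, 0, 2, 3 }) == mask_kind::in_lane, "in-lane");
static_assert(classify<4>({ 2, 3, 0, 1 }) == mask_kind::cross_lane, "cross-lane");

int main() { return 0; }

This mirrors the order in which the patched swizzle overloads test their XSIMD_IF_CONSTEXPR branches: identity first, then the duplicated-half and in-lane shortcuts, with the pre-existing duplicate-permute-blend sequence (or the AVX2 vpermps path, when all indices differ) kept as the cross-lane fallback.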