diff --git a/.github/workflows/emulated.yml b/.github/workflows/emulated.yml
index df64cd2cd..acd315de2 100644
--- a/.github/workflows/emulated.yml
+++ b/.github/workflows/emulated.yml
@@ -17,7 +17,7 @@ jobs:
           - { compiler: 'clang', version: '16'}
     steps:
     - name: Setup compiler
-      if: ${{ matrix.sys.compiler == 'gcc' }}
+      if: ${{ matrix.sys.compiler == 'gcc' }}
       run: |
         GCC_VERSION=${{ matrix.sys.version }}
         sudo apt-get update
@@ -31,7 +31,7 @@ jobs:
     - name: Setup compiler
       if: ${{ matrix.sys.compiler == 'clang' }}
       run: |
-        LLVM_VERSION=${{ matrix.sys.version }}
+        LLVM_VERSION=${{ matrix.sys.version }}
         sudo apt-get update || exit 1
         sudo apt-get --no-install-suggests --no-install-recommends install clang-$LLVM_VERSION || exit 1
         sudo apt-get --no-install-suggests --no-install-recommends install g++ g++-multilib || exit 1
@@ -49,7 +49,7 @@ jobs:
     - name: Configure build
       env:
         CC: ${{ env.CC }}
-        CXX: ${{ env.CXX }}
+        CXX: ${{ env.CXX }}
       run: |
         mkdir _build
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 94b4ddc45..5ebcab4c2 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -29,7 +29,7 @@ jobs:
           - { compiler: 'clang', version: '18', flags: 'avx512' }
     steps:
     - name: Setup compiler
-      if: ${{ matrix.sys.compiler == 'gcc' }}
+      if: ${{ matrix.sys.compiler == 'gcc' }}
       run: |
         GCC_VERSION=${{ matrix.sys.version }}
         sudo apt-get update
@@ -45,7 +45,7 @@ jobs:
     - name: Setup compiler
       if: ${{ matrix.sys.compiler == 'clang' }}
      run: |
-        LLVM_VERSION=${{ matrix.sys.version }}
+        LLVM_VERSION=${{ matrix.sys.version }}
         sudo apt-get update || exit 1
         sudo apt-get --no-install-suggests --no-install-recommends install clang-$LLVM_VERSION || exit 1
         sudo apt-get --no-install-suggests --no-install-recommends install g++ g++-multilib || exit 1
@@ -66,7 +66,7 @@ jobs:
     - name: Configure build
       env:
         CC: ${{ env.CC }}
-        CXX: ${{ env.CXX }}
+        CXX: ${{ env.CXX }}
       run: |
         if [[ '${{ matrix.sys.flags }}' == 'enable_xtl_complex' ]]; then
           CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DENABLE_XTL_COMPLEX=ON"
diff --git a/include/xsimd/arch/common/xsimd_common_swizzle.hpp b/include/xsimd/arch/common/xsimd_common_swizzle.hpp
new file mode 100644
index 000000000..356f587f8
--- /dev/null
+++ b/include/xsimd/arch/common/xsimd_common_swizzle.hpp
@@ -0,0 +1,211 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
+ * Martin Renou                                                            *
+ * Copyright (c) QuantStack                                                *
+ * Copyright (c) Serge Guelton                                             *
+ * Copyright (c) Marco Barbone                                             *
+ *                                                                         *
+ * Distributed under the terms of the BSD 3-Clause License.                *
+ *                                                                         *
+ * The full license is in the file LICENSE, distributed with this software.*
+ ****************************************************************************/
+#ifndef XSIMD_COMMON_SWIZZLE_HPP
+#define XSIMD_COMMON_SWIZZLE_HPP
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+namespace xsimd
+{
+    template <typename T, class A, T... Values>
+    struct batch_constant;
+
+    namespace kernel
+    {
+        namespace detail
+        {
+            // ────────────────────────────────────────────────────────────────────────
+            // get_at → the I-th element of the pack
+            template <std::size_t I, typename T, T V0, T... Vs>
+            struct get_at
+            {
+                static constexpr T value = get_at<I - 1, T, Vs...>::value;
+            };
+            template <typename T, T V0, T... Vs>
+            struct get_at<0, T, V0, Vs...>
+            {
+                static constexpr T value = V0;
+            };
+
+            // ────────────────────────────────────────────────────────────────────────
+            // 1) identity_impl
+            template <std::size_t I, typename T>
+            XSIMD_INLINE constexpr bool identity_impl() noexcept { return true; }
+            template <std::size_t I, typename T, T V0, T... Vs>
+            XSIMD_INLINE constexpr bool identity_impl() noexcept
+            {
+                return V0 == static_cast<T>(I)
+                    && identity_impl<I + 1, T, Vs...>();
+            }
+
+            // ────────────────────────────────────────────────────────────────────────
+            // 2) bitmask_impl
+            template <std::size_t I, std::size_t N, typename T>
+            XSIMD_INLINE constexpr std::uint32_t bitmask_impl() noexcept { return 0u; }
+            template <std::size_t I, std::size_t N, typename T, T V0, T... Vs>
+            XSIMD_INLINE constexpr std::uint32_t bitmask_impl() noexcept
+            {
+                return (1u << (static_cast<std::uint32_t>(V0) & (N - 1)))
+                    | bitmask_impl<I + 1, N, T, Vs...>();
+            }
+
+            // ────────────────────────────────────────────────────────────────────────
+            // 3) dup_lo_impl
+            template <std::size_t I, std::size_t N, typename T, T... Vs,
+                      typename std::enable_if<I == N / 2, int>::type = 0>
+            XSIMD_INLINE constexpr bool dup_lo_impl() noexcept { return true; }
+
+            template <std::size_t I, std::size_t N, typename T, T... Vs,
+                      typename std::enable_if<(I < N / 2), int>::type = 0>
+            XSIMD_INLINE constexpr bool dup_lo_impl() noexcept
+            {
+                return get_at<I, T, Vs...>::value < static_cast<T>(N / 2)
+                    && get_at<I, T, Vs...>::value == get_at<I + N / 2, T, Vs...>::value
+                    && dup_lo_impl<I + 1, N, T, Vs...>();
+            }
+
+            // ────────────────────────────────────────────────────────────────────────
+            // 4) dup_hi_impl
+            template <std::size_t I, std::size_t N, typename T, T... Vs,
+                      typename std::enable_if<I == N / 2, int>::type = 0>
+            XSIMD_INLINE constexpr bool dup_hi_impl() noexcept { return true; }
+
+            template <std::size_t I, std::size_t N, typename T, T... Vs,
+                      typename std::enable_if<(I < N / 2), int>::type = 0>
+            XSIMD_INLINE constexpr bool dup_hi_impl() noexcept
+            {
+                return get_at<I, T, Vs...>::value >= static_cast<T>(N / 2)
+                    && get_at<I, T, Vs...>::value < static_cast<T>(N)
+                    && get_at<I, T, Vs...>::value == get_at<I + N / 2, T, Vs...>::value
+                    && dup_hi_impl<I + 1, N, T, Vs...>();
+            }
+
+            // ────────────────────────────────────────────────────────────────────────
+            // 1) helper to get the I-th value from the Vs pack
+            template <std::size_t I, uint32_t Head, uint32_t... Tail>
+            struct get_nth_value
+            {
+                static constexpr uint32_t value = get_nth_value<I - 1, Tail...>::value;
+            };
+            template <uint32_t Head, uint32_t... Tail>
+            struct get_nth_value<0, Head, Tail...>
+            {
+                static constexpr uint32_t value = Head;
+            };
+
+            // ────────────────────────────────────────────────────────────────────────
+            // 2) recursive cross-lane test: true if any output lane i pulls from the opposite half
+            template <std::size_t I, std::size_t N, std::size_t H, uint32_t... Vs>
+            struct cross_impl
+            {
+                // does element I cross? (i.e. i < H but V >= H) or (i >= H but V < H)
+                static constexpr uint32_t Vi = get_nth_value<I, Vs...>::value;
+                static constexpr bool curr = (I < H ? (Vi >= H) : (Vi < H));
+                static constexpr bool next = cross_impl<I + 1, N, H, Vs...>::value;
+                static constexpr bool value = curr || next;
+            };
+            template <std::size_t N, std::size_t H, uint32_t... Vs>
+            struct cross_impl<N, N, H, Vs...>
+            {
+                static constexpr bool value = false;
+            };
+            template <std::size_t I, std::size_t N, typename T, T... Vs>
+            XSIMD_INLINE constexpr bool no_duplicates_impl() noexcept
+            {
+                // build the bitmask of (Vs & (N-1)) across all lanes
+                return detail::bitmask_impl<0, N, T, Vs...>() == ((1u << N) - 1u);
+            }
+            template <uint32_t... Vs>
+            XSIMD_INLINE constexpr bool no_duplicates_v() noexcept
+            {
+                // forward to the existing no_duplicates_impl
+                return no_duplicates_impl<0, sizeof...(Vs), uint32_t, Vs...>();
+            }
+            template <uint32_t... Vs>
+            XSIMD_INLINE constexpr bool is_cross_lane() noexcept
+            {
+                static_assert(sizeof...(Vs) >= 1, "Need at least one lane");
+                return cross_impl<0, sizeof...(Vs), sizeof...(Vs) / 2, Vs...>::value;
+            }
+            template <typename T, T... Vs>
+            XSIMD_INLINE constexpr bool is_identity() noexcept { return detail::identity_impl<0, T, Vs...>(); }
+            template <typename T, T... Vs>
+            XSIMD_INLINE constexpr bool is_all_different() noexcept
+            {
+                return detail::bitmask_impl<0, sizeof...(Vs), T, Vs...>() == ((1u << sizeof...(Vs)) - 1);
+            }
+
+            template <typename T, T... Vs>
+            XSIMD_INLINE constexpr bool is_dup_lo() noexcept { return detail::dup_lo_impl<0, sizeof...(Vs), T, Vs...>(); }
+            template <typename T, T... Vs>
+            XSIMD_INLINE constexpr bool is_dup_hi() noexcept { return detail::dup_hi_impl<0, sizeof...(Vs), T, Vs...>(); }
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_identity(batch_constant<T, A, Vs...>) noexcept { return is_identity<T, Vs...>(); }
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_all_different(batch_constant<T, A, Vs...>) noexcept { return is_all_different<T, Vs...>(); }
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_dup_lo(batch_constant<T, A, Vs...>) noexcept { return is_dup_lo<T, Vs...>(); }
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_dup_hi(batch_constant<T, A, Vs...>) noexcept { return is_dup_hi<T, Vs...>(); }
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_cross_lane(batch_constant<T, A, Vs...>) noexcept { return detail::is_cross_lane<Vs...>(); }
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool no_duplicates(batch_constant<T, A, Vs...>) noexcept { return no_duplicates_impl<0, sizeof...(Vs), T, Vs...>(); }
+            // ────────────────────────────────────────────────────────────────────────
+            // compile-time tests (identity, all-different, dup-lo, dup-hi)
+            // 8-lane identity
+            static_assert(is_identity<uint32_t, 0, 1, 2, 3, 4, 5, 6, 7>(), "identity failed");
+            // 8-lane reverse is all-different but not identity
+            static_assert(is_all_different<uint32_t, 7, 6, 5, 4, 3, 2, 1, 0>(), "all-diff failed");
+            static_assert(!is_identity<uint32_t, 7, 6, 5, 4, 3, 2, 1, 0>(), "identity on reverse");
+            // 8-lane dup-lo (repeat 0..3 twice)
+            static_assert(is_dup_lo<uint32_t, 0, 1, 2, 3, 0, 1, 2, 3>(), "dup_lo failed");
+            static_assert(!is_dup_hi<uint32_t, 0, 1, 2, 3, 0, 1, 2, 3>(), "dup_hi on dup_lo");
+            // 8-lane dup-hi (repeat 4..7 twice)
+            static_assert(is_dup_hi<uint32_t, 4, 5, 6, 7, 4, 5, 6, 7>(), "dup_hi failed");
+            static_assert(!is_dup_lo<uint32_t, 4, 5, 6, 7, 4, 5, 6, 7>(), "dup_lo on dup_hi");
+            // ────────────────────────────────────────────────────────────────────────
+            // 4-lane identity
+            static_assert(is_identity<uint32_t, 0, 1, 2, 3>(), "4-lane identity failed");
+            // 4-lane reverse all-different but not identity
+            static_assert(is_all_different<uint32_t, 3, 2, 1, 0>(), "4-lane all-diff failed");
+            static_assert(!is_identity<uint32_t, 3, 2, 1, 0>(), "4-lane identity on reverse");
+            // 4-lane dup-lo (repeat 0..1 twice)
+            static_assert(is_dup_lo<uint32_t, 0, 1, 0, 1>(), "4-lane dup_lo failed");
+            static_assert(!is_dup_hi<uint32_t, 0, 1, 0, 1>(), "4-lane dup_hi on dup_lo");
+            // 4-lane dup-hi (repeat 2..3 twice)
+            static_assert(is_dup_hi<uint32_t, 2, 3, 2, 3>(), "4-lane dup_hi failed");
+            static_assert(!is_dup_lo<uint32_t, 2, 3, 2, 3>(), "4-lane dup_lo on dup_hi");
+
+            static_assert(is_cross_lane<0, 1, 0, 1>(), "dup-lo only → crossing");
+            static_assert(is_cross_lane<2, 3, 2, 3>(), "dup-hi only → crossing");
+            static_assert(is_cross_lane<0, 3, 3, 3>(), "one low + rest high → crossing");
+            static_assert(!is_cross_lane<1, 0, 2, 3>(), "mixed low/high → no crossing");
+            static_assert(!is_cross_lane<0, 1, 2, 3>(), "mixed low/high → no crossing");
+
+            static_assert(no_duplicates_v<0, 1, 2, 3>(), "N=4: [0,1,2,3] → distinct");
+            static_assert(!no_duplicates_v<0, 1, 2, 2>(), "N=4: [0,1,2,2] → dup");
+
+            static_assert(no_duplicates_v<0, 1, 2, 3, 4, 5, 6, 7>(), "N=8: [0..7] → distinct");
+            static_assert(!no_duplicates_v<0, 1, 2, 3, 4, 5, 6, 0>(), "N=8: last repeats 0");
+
+        } // namespace detail
+    } // namespace kernel
+} // namespace xsimd
+
+#endif // XSIMD_COMMON_SWIZZLE_HPP
diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index 76eed49c2..ed3319d4e 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -3,6 +3,7 @@
  * Martin Renou                                                            *
  * Copyright (c) QuantStack                                                *
  * Copyright (c) Serge Guelton                                             *
+ * Copyright (c) Marco Barbone                                             *
  *                                                                         *
  * Distributed under the terms of the BSD 3-Clause License.                *
  *                                                                         *
@@ -1417,23 +1418,19 @@ namespace xsimd
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx>) noexcept
         {
             // duplicate low and high part of input
-            __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
-            __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0);
-
-            __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self));
-            __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);
+            // Duplicate lanes separately
+            // 1) duplicate low and high lanes
+            __m256 lo = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
+            __m256 hi = _mm256_permute2f128_ps(self, self, 0x11); // [high| high]

             // normalize mask
             batch<uint32_t, A> half_mask = mask % 4;

             // permute within each lane
-            __m256 r0 = _mm256_permutevar_ps(low_low, half_mask);
-            __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask);
+            __m256 r0 = _mm256_permutevar_ps(lo, half_mask);
+            __m256 r1 = _mm256_permutevar_ps(hi, half_mask);

-            // mask to choose the right lane
             batch_bool<uint32_t, A> blend_mask = mask >= 4;
-
-            // blend the two permutes
             return _mm256_blendv_ps(r0, r1, batch_bool_cast<float>(blend_mask));
         }

@@ -1441,18 +1438,15 @@ namespace xsimd
         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx>) noexcept
         {
             // duplicate low and high part of input
-            __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
-            __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0);
-
-            __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self));
-            __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
+            __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
+            __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);

             // normalize mask
             batch<uint64_t, A> half_mask = -(mask & 1);

             // permute within each lane
-            __m256d r0 = _mm256_permutevar_pd(low_low, half_mask);
-            __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask);
+            __m256d r0 = _mm256_permutevar_pd(lo, half_mask);
+            __m256d r1 = _mm256_permutevar_pd(hi, half_mask);

             // mask to choose the right lane
             batch_bool<uint64_t, A> blend_mask = mask >= 2;
@@ -1478,53 +1472,67 @@ namespace xsimd
         // swizzle (constant mask)
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
+        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx>) noexcept
         {
-            // duplicate low and high part of input
-            __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
-            __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0);
-
-            __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self));
-            __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);
+            constexpr bool is_identity = detail::is_identity(mask);
+            constexpr bool is_dup_low = detail::is_dup_lo(mask);
+            constexpr bool is_dup_hi = detail::is_dup_hi(mask);
+            constexpr bool is_dup = is_dup_low || is_dup_hi;
+            XSIMD_IF_CONSTEXPR(is_identity)
+            {
+                return self;
+            }
+            XSIMD_IF_CONSTEXPR(is_dup)
+            {
+                constexpr auto control = is_dup_low ? 0x00 : 0x11;
+                constexpr auto is_dup_identity = is_dup_low ? detail::is_identity<uint32_t, V0, V1, V2, V3>() : detail::is_identity<uint32_t, V4 - 4, V5 - 4, V6 - 4, V7 - 4>();
+                auto split = _mm256_permute2f128_ps(self, self, control);
+                XSIMD_IF_CONSTEXPR(!is_dup_identity)
+                {
+                    constexpr auto shuffle_mask = is_dup_low ? detail::mod_shuffle(V0, V1, V2, V3) : detail::mod_shuffle(V4 - 4, V5 - 4, V6 - 4, V7 - 4);
+                    split = _mm256_permute_ps(split, shuffle_mask);
+                }
+                return split;
+            }
+            // Duplicate lanes separately
+            // 1) duplicate low and high lanes
+            __m256 low_dup = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
+            __m256 hi_dup = _mm256_permute2f128_ps(self, self, 0x11); // [high| high]

-            // normalize mask
-            batch_constant<uint32_t, A, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
+            // 2) build lane-local index vector (each element = source_index & 3)
+            constexpr batch_constant<uint32_t, A, (V0 & 3), (V1 & 3), (V2 & 3), (V3 & 3), (V4 & 3), (V5 & 3), (V6 & 3), (V7 & 3)> half_mask;

-            // permute within each lane
-            __m256 r0 = _mm256_permutevar_ps(low_low, half_mask.as_batch());
-            __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask.as_batch());
+            __m256 r0 = _mm256_permutevar_ps(low_dup, half_mask.as_batch()); // pick from low lane
+            __m256 r1 = _mm256_permutevar_ps(hi_dup, half_mask.as_batch()); // pick from high lane

-            // mask to choose the right lane
-            batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;
+            constexpr batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> lane_mask {};

-            // blend the two permutes
-            constexpr auto mask = blend_mask.mask();
-            return _mm256_blend_ps(r0, r1, mask);
+            return _mm256_blend_ps(r0, r1, lane_mask.mask());
         }

         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx>) noexcept
+        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx>) noexcept
         {
+            // cannot use detail::mod_shuffle as the mod and shift are different in this case
+            constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3);
+            XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; }
+            XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
+            {
+                return _mm256_permute_pd(self, imm);
+            }
             // duplicate low and high part of input
-            __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
-            __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0);
-
-            __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self));
-            __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
-
-            // normalize mask
-            batch_constant<uint64_t, A, ((V0 & 1) << 1), ((V1 & 1) << 1), ((V2 & 1) << 1), ((V3 & 1) << 1)> half_mask;
+            __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
+            __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);

             // permute within each lane
-            __m256d r0 = _mm256_permutevar_pd(low_low, half_mask.as_batch());
-            __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask.as_batch());
+            __m256d r0 = _mm256_permute_pd(lo, imm);
+            __m256d r1 = _mm256_permute_pd(hi, imm);

             // mask to choose the right lane
-            batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
+            constexpr batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
             // blend the two permutes
-            constexpr auto mask = blend_mask.mask();
-            return _mm256_blend_pd(r0, r1, mask);
+            return _mm256_blend_pd(r0, r1, blend_mask.mask());
         }
diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
--- a/include/xsimd/arch/xsimd_avx2.hpp
+++ b/include/xsimd/arch/xsimd_avx2.hpp
         template <class A>
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
         {
-            return _mm256_permutevar8x32_ps(self, mask);
+            return swizzle(self, mask, avx {});
         }
-
         template <class A>
         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
         {
@@ -930,7 +928,7 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
         {
-            return _mm256_permutevar8x32_epi32(self, mask);
+            return swizzle(self, mask, avx {});
         }
         template <class A>
         XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
         {
@@ -942,20 +940,33 @@ namespace xsimd
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
         {
-            return _mm256_permutevar8x32_ps(self, mask.as_batch());
+            XSIMD_IF_CONSTEXPR(detail::is_all_different(mask) && !detail::is_identity(mask))
+            {
+                // The intrinsic does NOT allow to copy the same element of the source vector to more than one element of the destination vector.
+                // one-shot 8-lane permute
+                return _mm256_permutevar8x32_ps(self, mask.as_batch());
+            }
+            return swizzle(self, mask, avx {});
         }
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
         {
-            constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
-            return _mm256_permute4x64_pd(self, mask);
+            XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; }
+            XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
+            {
+                constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3);
+                return _mm256_permute_pd(self, imm);
+            }
+            constexpr auto imm = detail::mod_shuffle(V0, V1, V2, V3);
+            // fallback to full 4-element permute
+            return _mm256_permute4x64_pd(self, imm);
         }
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
         XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
         {
-            constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
+            constexpr auto mask = detail::mod_shuffle(V0, V1, V2, V3);
             return _mm256_permute4x64_epi64(self, mask);
         }
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
diff --git a/include/xsimd/arch/xsimd_common.hpp b/include/xsimd/arch/xsimd_common.hpp
index 4f8c09f37..ccd4b47c6 100644
--- a/include/xsimd/arch/xsimd_common.hpp
+++ b/include/xsimd/arch/xsimd_common.hpp
@@ -18,6 +18,7 @@
 #include "./common/xsimd_common_math.hpp"
 #include "./common/xsimd_common_memory.hpp"
 #include "./common/xsimd_common_rounding.hpp"
+#include "./common/xsimd_common_swizzle.hpp"
 #include "./common/xsimd_common_trigo.hpp"

 #endif
diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp
index 26a9bdbdb..aa9a0c163 100644
--- a/include/xsimd/arch/xsimd_common_fwd.hpp
+++ b/include/xsimd/arch/xsimd_common_fwd.hpp
@@ -3,6 +3,7 @@
  * Martin Renou                                                            *
  * Copyright (c) QuantStack                                                *
  * Copyright (c) Serge Guelton                                             *
+ * Copyright (c) Marco Barbone                                             *
  *                                                                         *
  * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                          *
@@ -37,7 +38,25 @@ namespace xsimd
         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE T hadd(batch<T, A> const& self, requires_arch<common>) noexcept;
+        // Forward declarations for pack-level helpers
+        namespace detail
+        {
+            template <typename T, T... Vs>
+            XSIMD_INLINE constexpr bool is_identity() noexcept;
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_identity(batch_constant<T, A, Vs...>) noexcept;
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_all_different(batch_constant<T, A, Vs...>) noexcept;
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_dup_lo(batch_constant<T, A, Vs...>) noexcept;
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_dup_hi(batch_constant<T, A, Vs...>) noexcept;
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool is_cross_lane(batch_constant<T, A, Vs...>) noexcept;
+            template <typename T, class A, T... Vs>
+            XSIMD_INLINE constexpr bool no_duplicates(batch_constant<T, A, Vs...>) noexcept;
+        }
     }
 }
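Note (not part of the patch): the core of this change is classifying a constant swizzle mask at compile time so the AVX/AVX2 swizzle kernels can pick a cheaper instruction sequence. Below is a minimal, self-contained sketch of that classification, assuming C++14; it uses a plain std::array instead of xsimd's batch_constant, and the names mask_kind and classify are illustrative only, they do not exist in xsimd.

#include <array>
#include <cstddef>
#include <cstdint>

// Mirrors the intent of xsimd::kernel::detail::is_identity / is_dup_lo /
// is_dup_hi / is_cross_lane from xsimd_common_swizzle.hpp.
enum class mask_kind { identity, dup_lo, dup_hi, in_lane, cross_lane };

template <std::size_t N>
constexpr mask_kind classify(const std::array<std::uint32_t, N> m)
{
    constexpr std::size_t h = N / 2;
    bool identity = true, dup_lo = true, dup_hi = true, cross = false;
    for (std::size_t i = 0; i < N; ++i)
    {
        identity = identity && (m[i] == i);
        // lane crossing: a low output pulls from the high half or vice versa
        cross = cross || (i < h ? m[i] >= h : m[i] < h);
    }
    for (std::size_t i = 0; i < h; ++i)
    {
        dup_lo = dup_lo && (m[i] < h) && (m[i] == m[i + h]);
        dup_hi = dup_hi && (m[i] >= h) && (m[i] < N) && (m[i] == m[i + h]);
    }
    if (identity) return mask_kind::identity; // swizzle can return self unchanged
    if (dup_lo) return mask_kind::dup_lo;     // one _mm256_permute2f128_ps(self, self, 0x00)
    if (dup_hi) return mask_kind::dup_hi;     // one _mm256_permute2f128_ps(self, self, 0x11)
    if (!cross) return mask_kind::in_lane;    // single in-lane _mm256_permute_ps / _mm256_permute_pd
    return mask_kind::cross_lane;             // duplicate both lanes, permute, then blend
}

// Same patterns as the static_asserts in xsimd_common_swizzle.hpp:
static_assert(classify<8>({ 0, 1, 2, 3, 4, 5, 6, 7 }) == mask_kind::identity, "identity");
static_assert(classify<8>({ 0, 1, 2, 3, 0, 1, 2, 3 }) == mask_kind::dup_lo, "dup-lo");
static_assert(classify<8>({ 4, 5, 6, 7, 4, 5, 6, 7 }) == mask_kind::dup_hi, "dup-hi");
static_assert(classify<4>({ 1, 0, 2, 3 }) == mask_kind::in_lane, "in-lane");
static_assert(classify<4>({ 2, 3, 0, 1 }) == mask_kind::cross_lane, "cross-lane");

int main() { return 0; }

This mirrors the order in which the patched swizzle overloads test their XSIMD_IF_CONSTEXPR branches: identity first, then the duplicated-half and in-lane shortcuts, with the pre-existing duplicate-permute-blend sequence (or the AVX2 vpermps path, when all indices differ) kept as the cross-lane fallback.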