From 9a017e96d5e3ba3e66b3c4c03ec8d703426b5a47 Mon Sep 17 00:00:00 2001
From: eternal-flame-AD <yume@yumechi.jp>
Date: Thu, 31 Jul 2025 22:11:12 -0500
Subject: [PATCH 1/8] scrypt: sse2 RoMix optimization

Signed-off-by: eternal-flame-AD <yume@yumechi.jp>
---
 scrypt/src/romix.rs | 116 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)

diff --git a/scrypt/src/romix.rs b/scrypt/src/romix.rs
index bd3c56dd..38b8ae51 100644
--- a/scrypt/src/romix.rs
+++ b/scrypt/src/romix.rs
@@ -1,3 +1,26 @@
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+/// Permute Salsa20 block to column major order
+const PIVOT_ABCD: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11];
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+/// Inverse of PIVOT_ABCD
+const INVERSE_PIVOT_ABCD: [usize; 16] = const {
+    let mut index = [0; 16];
+    let mut i = 0;
+    while i < 16 {
+        let mut inverse = 0;
+        while inverse < 16 {
+            if PIVOT_ABCD[inverse] == i {
+                index[i] = inverse;
+                break;
+            }
+            inverse += 1;
+        }
+        i += 1;
+    }
+    index
+};
+
 /// Execute the ROMix operation in-place.
 /// b - the data to operate on
 /// v - a temporary variable to store the vector V
@@ -18,6 +41,17 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize)
 
     let len = b.len();
 
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    for chunk in b.chunks_exact_mut(64) {
+        let mut t = [0u32; 16];
+        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
+            *b = u32::from_ne_bytes(c.try_into().unwrap());
+        }
+        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
+            b.copy_from_slice(&t[PIVOT_ABCD[i]].to_ne_bytes());
+        });
+    }
+
     for chunk in v.chunks_mut(len) {
         chunk.copy_from_slice(b);
         scrypt_block_mix(chunk, b);
@@ -28,11 +62,23 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize)
         xor(b, &v[j * len..(j + 1) * len], t);
         scrypt_block_mix(t, b);
     }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    for chunk in b.chunks_exact_mut(64) {
+        let mut t = [0u32; 16];
+        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
+            *b = u32::from_ne_bytes(c.try_into().unwrap());
+        }
+        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
+            b.copy_from_slice(&t[INVERSE_PIVOT_ABCD[i]].to_ne_bytes());
+        });
+    }
 }
 
 /// Execute the BlockMix operation
 /// input - the input vector. The length must be a multiple of 128.
 /// output - the output vector. Must be the same length as input.
+#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
 fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
     use salsa20::{
         SalsaCore,
@@ -67,6 +113,76 @@ fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
     }
 }
 
+/// Execute the BlockMix operation
+/// input - the input vector. The length must be a multiple of 128.
+/// output - the output vector. Must be the same length as input.
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
+    #[cfg(target_arch = "x86")]
+    use core::arch::x86::*;
+
+    #[cfg(target_arch = "x86_64")]
+    use core::arch::x86_64::*;
+
+    macro_rules! mm_rol_epi32x {
+        ($w:expr, $amt:literal) => {{
+            let w = $w;
+            _mm_or_si128(_mm_slli_epi32(w, $amt), _mm_srli_epi32(w, 32 - $amt))
+        }};
+    }
+
+    let mut x = [0u8; 64];
+    x.copy_from_slice(&input[input.len() - 64..]);
+
+    let mut a = unsafe { _mm_loadu_si128(x.as_ptr().cast()) };
+    let mut b = unsafe { _mm_loadu_si128(x.as_ptr().add(16).cast()) };
+    let mut c = unsafe { _mm_loadu_si128(x.as_ptr().add(32).cast()) };
+    let mut d = unsafe { _mm_loadu_si128(x.as_ptr().add(48).cast()) };
+
+    for (i, chunk) in input.chunks(64).enumerate() {
+        let pos = if i % 2 == 0 {
+            (i / 2) * 64
+        } else {
+            (i / 2) * 64 + input.len() / 2
+        };
+
+        unsafe {
+            a = _mm_xor_si128(a, _mm_loadu_si128(chunk.as_ptr().cast()));
+            b = _mm_xor_si128(b, _mm_loadu_si128(chunk.as_ptr().add(16).cast()));
+            c = _mm_xor_si128(c, _mm_loadu_si128(chunk.as_ptr().add(32).cast()));
+            d = _mm_xor_si128(d, _mm_loadu_si128(chunk.as_ptr().add(48).cast()));
+
+            let saves = [a, b, c, d];
+
+            for _ in 0..8 {
+                b = _mm_xor_si128(b, mm_rol_epi32x!(_mm_add_epi32(a, d), 7));
+                c = _mm_xor_si128(c, mm_rol_epi32x!(_mm_add_epi32(b, a), 9));
+                d = _mm_xor_si128(d, mm_rol_epi32x!(_mm_add_epi32(c, b), 13));
+                a = _mm_xor_si128(a, mm_rol_epi32x!(_mm_add_epi32(d, c), 18));
+
+                // a stays in place
+                // b = left shuffle d by 1 element
+                d = _mm_shuffle_epi32(d, 0b00_11_10_01);
+                // c = left shuffle c by 2 elements
+                c = _mm_shuffle_epi32(c, 0b01_00_11_10);
+                // d = left shuffle b by 3 elements
+                b = _mm_shuffle_epi32(b, 0b10_01_00_11);
+                (b, d) = (d, b);
+            }
+
+            a = _mm_add_epi32(a, saves[0]);
+            b = _mm_add_epi32(b, saves[1]);
+            c = _mm_add_epi32(c, saves[2]);
+            d = _mm_add_epi32(d, saves[3]);
+
+            _mm_storeu_si128(output.as_mut_ptr().add(pos).cast(), a);
+            _mm_storeu_si128(output.as_mut_ptr().add(pos + 16).cast(), b);
+            _mm_storeu_si128(output.as_mut_ptr().add(pos + 32).cast(), c);
+            _mm_storeu_si128(output.as_mut_ptr().add(pos + 48).cast(), d);
+        }
+    }
+}
+
 fn xor(x: &[u8], y: &[u8], output: &mut [u8]) {
     for ((out, &x_i), &y_i) in output.iter_mut().zip(x.iter()).zip(y.iter()) {
         *out = x_i ^ y_i;

From e991eb9876b9ddb65e895d2397fd698372a82237 Mon Sep 17 00:00:00 2001
From: eternal-flame-AD <yume@yumechi.jp>
Date: Thu, 31 Jul 2025 23:04:33 -0500
Subject: [PATCH 2/8] use different function names for the preshuffled version

Signed-off-by: eternal-flame-AD <yume@yumechi.jp>
---
 scrypt/src/romix.rs | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/scrypt/src/romix.rs b/scrypt/src/romix.rs
index 38b8ae51..45d8ce0c 100644
--- a/scrypt/src/romix.rs
+++ b/scrypt/src/romix.rs
@@ -54,12 +54,22 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize)
 
     for chunk in v.chunks_mut(len) {
         chunk.copy_from_slice(b);
+
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        scrypt_block_mix_abcd(chunk, b);
+
+        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
         scrypt_block_mix(chunk, b);
     }
 
     for _ in 0..n {
         let j = integerify(b, n);
         xor(b, &v[j * len..(j + 1) * len], t);
+
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        scrypt_block_mix_abcd(t, b);
+
+        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
         scrypt_block_mix(t, b);
     }
 
@@ -113,11 +123,11 @@ fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
     }
 }
 
-/// Execute the BlockMix operation
+/// Execute the BlockMix operation with pre-shuffled input.
 /// input - the input vector. The length must be a multiple of 128.
 /// output - the output vector. Must be the same length as input.
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
+fn scrypt_block_mix_abcd(input: &[u8], output: &mut [u8]) {
     #[cfg(target_arch = "x86")]
     use core::arch::x86::*;
 
@@ -131,13 +141,12 @@ fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
         }};
     }
 
-    let mut x = [0u8; 64];
-    x.copy_from_slice(&input[input.len() - 64..]);
+    let last_block = &input[input.len() - 64..];
 
-    let mut a = unsafe { _mm_loadu_si128(x.as_ptr().cast()) };
-    let mut b = unsafe { _mm_loadu_si128(x.as_ptr().add(16).cast()) };
-    let mut c = unsafe { _mm_loadu_si128(x.as_ptr().add(32).cast()) };
-    let mut d = unsafe { _mm_loadu_si128(x.as_ptr().add(48).cast()) };
+    let mut a = unsafe { _mm_loadu_si128(last_block.as_ptr().cast()) };
+    let mut b = unsafe { _mm_loadu_si128(last_block.as_ptr().add(16).cast()) };
+    let mut c = unsafe { _mm_loadu_si128(last_block.as_ptr().add(32).cast()) };
+    let mut d = unsafe { _mm_loadu_si128(last_block.as_ptr().add(48).cast()) };
 
     for (i, chunk) in input.chunks(64).enumerate() {
         let pos = if i % 2 == 0 {

From c65ad4eb25dceab28cb6665f45179fdb1c7119af Mon Sep 17 00:00:00 2001
From: eternal-flame-AD <yume@yumechi.jp>
Date: Thu, 31 Jul 2025 23:57:31 -0500
Subject: [PATCH 3/8] wasm32 kernel

Signed-off-by: eternal-flame-AD <yume@yumechi.jp>
---
 scrypt/src/romix.rs | 112 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 103 insertions(+), 9 deletions(-)

diff --git a/scrypt/src/romix.rs b/scrypt/src/romix.rs
index 45d8ce0c..715d7c5b 100644
--- a/scrypt/src/romix.rs
+++ b/scrypt/src/romix.rs
@@ -1,8 +1,8 @@
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "wasm32"))]
 /// Permute Salsa20 block to column major order
 const PIVOT_ABCD: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11];
 
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "wasm32"))]
 /// Inverse of PIVOT_ABCD
 const INVERSE_PIVOT_ABCD: [usize; 16] = const {
     let mut index = [0; 16];
@@ -41,7 +41,11 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize)
 
     let len = b.len();
 
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[cfg(any(
+        target_arch = "x86",
+        target_arch = "x86_64",
+        all(target_arch = "wasm32", target_feature = "simd128")
+    ))]
     for chunk in b.chunks_exact_mut(64) {
         let mut t = [0u32; 16];
         for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
@@ -55,10 +59,18 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize)
     for chunk in v.chunks_mut(len) {
         chunk.copy_from_slice(b);
 
-        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        #[cfg(any(
+            target_arch = "x86",
+            target_arch = "x86_64",
+            all(target_arch = "wasm32", target_feature = "simd128")
+        ))]
         scrypt_block_mix_abcd(chunk, b);
 
-        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+        #[cfg(not(any(
+            target_arch = "x86",
+            target_arch = "x86_64",
+            all(target_arch = "wasm32", target_feature = "simd128")
+        )))]
         scrypt_block_mix(chunk, b);
     }
 
@@ -66,14 +78,26 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize)
         let j = integerify(b, n);
         xor(b, &v[j * len..(j + 1) * len], t);
 
-        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        #[cfg(any(
+            target_arch = "x86",
+            target_arch = "x86_64",
+            all(target_arch = "wasm32", target_feature = "simd128")
+        ))]
         scrypt_block_mix_abcd(t, b);
 
-        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+        #[cfg(not(any(
+            target_arch = "x86",
+            target_arch = "x86_64",
+            all(target_arch = "wasm32", target_feature = "simd128")
+        )))]
         scrypt_block_mix(t, b);
     }
 
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[cfg(any(
+        target_arch = "x86",
+        target_arch = "x86_64",
+        all(target_arch = "wasm32", target_feature = "simd128")
+    ))]
     for chunk in b.chunks_exact_mut(64) {
         let mut t = [0u32; 16];
         for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
@@ -88,7 +112,11 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize)
 /// Execute the BlockMix operation
 /// input - the input vector. The length must be a multiple of 128.
 /// output - the output vector. Must be the same length as input.
-#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+#[cfg(not(any(
+    target_arch = "x86",
+    target_arch = "x86_64",
+    all(target_arch = "wasm32", target_feature = "simd128")
+)))]
 fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
     use salsa20::{
         SalsaCore,
@@ -192,6 +220,72 @@ fn scrypt_block_mix_abcd(input: &[u8], output: &mut [u8]) {
     }
 }
 
+/// Execute the BlockMix operation with pre-shuffled input.
+/// input - the input vector. The length must be a multiple of 128.
+/// output - the output vector. Must be the same length as input.
+#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+fn scrypt_block_mix_abcd(input: &[u8], output: &mut [u8]) {
+    use core::arch::wasm32::*;
+
+    macro_rules! u32x4_rol {
+        ($x:expr, $amt:literal) => {
+            v128_or(u32x4_shl($x, $amt), u32x4_shr($x, 32 - $amt))
+        };
+    }
+
+    let last_block = &input[input.len() - 64..];
+
+    let mut a = unsafe { v128_load(last_block.as_ptr().cast()) };
+    let mut b = unsafe { v128_load(last_block.as_ptr().add(16).cast()) };
+    let mut c = unsafe { v128_load(last_block.as_ptr().add(32).cast()) };
+    let mut d = unsafe { v128_load(last_block.as_ptr().add(48).cast()) };
+
+    for (i, chunk) in input.chunks(64).enumerate() {
+        let pos = if i % 2 == 0 {
+            (i / 2) * 64
+        } else {
+            (i / 2) * 64 + input.len() / 2
+        };
+
+        unsafe {
+            let chunk_a = v128_load(chunk.as_ptr().cast());
+            let chunk_b = v128_load(chunk.as_ptr().add(16).cast());
+            let chunk_c = v128_load(chunk.as_ptr().add(32).cast());
+            let chunk_d = v128_load(chunk.as_ptr().add(48).cast());
+
+            a = v128_xor(a, chunk_a);
+            b = v128_xor(b, chunk_b);
+            c = v128_xor(c, chunk_c);
+            d = v128_xor(d, chunk_d);
+
+            let saves = [a, b, c, d];
+
+            for _ in 0..8 {
+                b = v128_xor(b, u32x4_rol!(u32x4_add(a, d), 7));
+                c = v128_xor(c, u32x4_rol!(u32x4_add(b, a), 9));
+                d = v128_xor(d, u32x4_rol!(u32x4_add(c, b), 13));
+                a = v128_xor(a, u32x4_rol!(u32x4_add(d, c), 18));
+
+                d = i32x4_shuffle::<1, 2, 3, 0>(d, d);
+                c = i32x4_shuffle::<2, 3, 0, 1>(c, c);
+                b = i32x4_shuffle::<3, 0, 1, 2>(b, b);
+
+                (b, d) = (d, b);
+            }
+
+            a = u32x4_add(a, saves[0]);
+            b = u32x4_add(b, saves[1]);
+            c = u32x4_add(c, saves[2]);
+            d = u32x4_add(d, saves[3]);
+
+            v128_store(output.as_mut_ptr().add(pos).cast(), a);
+            v128_store(output.as_mut_ptr().add(pos + 16).cast(), b);
+            v128_store(output.as_mut_ptr().add(pos + 32).cast(), c);
+            v128_store(output.as_mut_ptr().add(pos + 48).cast(), d);
+        }
+    }
+}
+
 fn xor(x: &[u8], y: &[u8], output: &mut [u8]) {
     for ((out, &x_i), &y_i) in output.iter_mut().zip(x.iter()).zip(y.iter()) {
         *out = x_i ^ y_i;

From 804c6224206e74dae74e892c07a1f44c449c2561 Mon Sep 17 00:00:00 2001
From: eternal-flame-AD <yume@yumechi.jp>
Date: Fri, 1 Aug 2025 00:03:01 -0500
Subject: [PATCH 4/8] fix unused warning

Signed-off-by: eternal-flame-AD <yume@yumechi.jp>
---
 scrypt/src/romix.rs | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/scrypt/src/romix.rs b/scrypt/src/romix.rs
index 715d7c5b..1187de58 100644
--- a/scrypt/src/romix.rs
+++ b/scrypt/src/romix.rs
@@ -1,8 +1,16 @@
-#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "wasm32"))]
+#[cfg(any(
+    target_arch = "x86",
+    target_arch = "x86_64",
+    all(target_arch = "wasm32", target_feature = "simd128")
+))]
 /// Permute Salsa20 block to column major order
 const PIVOT_ABCD: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11];
 
-#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "wasm32"))]
+#[cfg(any(
+    target_arch = "x86",
+    target_arch = "x86_64",
+    all(target_arch = "wasm32", target_feature = "simd128")
+))]
 /// Inverse of PIVOT_ABCD
 const INVERSE_PIVOT_ABCD: [usize; 16] = const {
     let mut index = [0; 16];

From bae8e9837e5fc4af7ea68073ef2f0a83757001c1 Mon Sep 17 00:00:00 2001
From: eternal-flame-AD <yume@yumechi.jp>
Date: Fri, 1 Aug 2025 07:37:38 -0500
Subject: [PATCH 5/8] move arch-dependent impls to block_mix module

Signed-off-by: eternal-flame-AD <yume@yumechi.jp>
---
 Cargo.lock                      |   1 +
 scrypt/Cargo.toml               |   1 +
 scrypt/src/block_mix/mod.rs     |  68 ++++++++
 scrypt/src/block_mix/neon.rs    |  88 +++++++++++
 scrypt/src/block_mix/pivot.rs   |  20 +++
 scrypt/src/block_mix/simd128.rs |  88 +++++++++++
 scrypt/src/block_mix/soft.rs    |  42 +++++
 scrypt/src/block_mix/sse2.rs    |  90 +++++++++++
 scrypt/src/lib.rs               |   1 +
 scrypt/src/romix.rs             | 267 +-------------------------------
 10 files changed, 403 insertions(+), 263 deletions(-)
 create mode 100644 scrypt/src/block_mix/mod.rs
 create mode 100644 scrypt/src/block_mix/neon.rs
 create mode 100644 scrypt/src/block_mix/pivot.rs
 create mode 100644 scrypt/src/block_mix/simd128.rs
 create mode 100644 scrypt/src/block_mix/soft.rs
 create mode 100644 scrypt/src/block_mix/sse2.rs

diff --git a/Cargo.lock b/Cargo.lock
index 11574624..aa3317d4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -436,6 +436,7 @@ dependencies = [
 name = "scrypt"
 version = "0.12.0-rc.1"
 dependencies = [
+ "cfg-if",
  "password-hash",
  "pbkdf2",
  "salsa20",
diff --git a/scrypt/Cargo.toml b/scrypt/Cargo.toml
index 5d05607f..ed86407c 100644
--- a/scrypt/Cargo.toml
+++ b/scrypt/Cargo.toml
@@ -14,6 +14,7 @@ edition = "2024"
 rust-version = "1.85"
 
 [dependencies]
+cfg-if = "1.0"
 pbkdf2 = { version = "0.13.0-rc.0", path = "../pbkdf2" }
 salsa20 = { version = "0.11.0-rc.0", default-features = false }
 sha2 = { version = "0.11.0-rc.0", default-features = false }
diff --git a/scrypt/src/block_mix/mod.rs b/scrypt/src/block_mix/mod.rs
new file mode 100644
index 00000000..abd55930
--- /dev/null
+++ b/scrypt/src/block_mix/mod.rs
@@ -0,0 +1,68 @@
+#[cfg(any(
+    test,
+    not(any(
+        all(
+            any(target_arch = "x86", target_arch = "x86_64"),
+            target_feature = "sse2"
+        ),
+        all(target_arch = "aarch64", target_feature = "neon"),
+        all(target_arch = "wasm32", target_feature = "simd128"),
+    ))
+))]
+mod soft;
+
+cfg_if::cfg_if! {
+    if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
+        mod pivot;
+        mod neon;
+        pub(crate) use neon::{scrypt_block_mix, shuffle_in, shuffle_out};
+    } else if #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] {
+        mod pivot;
+        mod simd128;
+        pub(crate) use simd128::{scrypt_block_mix, shuffle_in, shuffle_out};
+    } else if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))] {
+        mod pivot;
+        mod sse2;
+        pub(crate) use sse2::{scrypt_block_mix, shuffle_in, shuffle_out};
+    } else {
+        pub(crate) use soft::scrypt_block_mix;
+
+        pub(crate) fn shuffle_in(_input: &mut [u8]) {}
+        pub(crate) fn shuffle_out(_input: &mut [u8]) {}
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_scrypt_block_mix_abcd_against_soft() {
+        let mut input: [u8; 128] = core::array::from_fn(|i| i as u8);
+        for _round in 0..10 {
+            let mut output = [0u8; 128];
+
+            let mut expected0 = [0u8; 128];
+            let mut expected1 = [0u8; 128]; // check shuffle_out is a correct inverse of shuffle_in
+            soft::scrypt_block_mix(&input, &mut expected0);
+            shuffle_in(&mut input);
+            scrypt_block_mix(&input, &mut output);
+            shuffle_out(&mut input);
+            soft::scrypt_block_mix(&input, &mut expected1);
+            shuffle_out(&mut output);
+            assert_eq!(
+                expected0, expected1,
+                "expected0 != expected1, shuffle_out is not a correct inverse of shuffle_in?"
+            );
+            assert_eq!(
+                output, expected0,
+                "output != expected0, scrypt_block_mix is not correct?"
+            );
+
+            input
+                .iter_mut()
+                .zip(output.iter())
+                .for_each(|(a, b)| *a = a.wrapping_add(*b));
+        }
+    }
+}
diff --git a/scrypt/src/block_mix/neon.rs b/scrypt/src/block_mix/neon.rs
new file mode 100644
index 00000000..e1c0a319
--- /dev/null
+++ b/scrypt/src/block_mix/neon.rs
@@ -0,0 +1,88 @@
+use crate::block_mix::pivot::{INVERSE_PIVOT_ABCD, PIVOT_ABCD};
+
+pub(crate) fn shuffle_in(b: &mut [u8]) {
+    for chunk in b.chunks_exact_mut(64) {
+        let mut t = [0u32; 16];
+        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
+            *b = u32::from_ne_bytes(c.try_into().unwrap());
+        }
+        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
+            b.copy_from_slice(&t[PIVOT_ABCD[i]].to_ne_bytes());
+        });
+    }
+}
+
+pub(crate) fn shuffle_out(b: &mut [u8]) {
+    for chunk in b.chunks_exact_mut(64) {
+        let mut t = [0u32; 16];
+        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
+            *b = u32::from_ne_bytes(c.try_into().unwrap());
+        }
+        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
+            b.copy_from_slice(&t[INVERSE_PIVOT_ABCD[i]].to_ne_bytes());
+        });
+    }
+}
+
+pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
+    use core::arch::aarch64::*;
+
+    macro_rules! vrol_u32 {
+        ($w:expr, $amt:literal) => {{
+            let w = $w;
+            vsraq_n_u32(vshlq_n_u32(w, $amt), w, 32 - $amt)
+        }};
+    }
+
+    let last_block = &input[input.len() - 64..];
+
+    let mut a = unsafe { vld1q_u32(last_block.as_ptr().cast()) };
+    let mut b = unsafe { vld1q_u32(last_block.as_ptr().add(16).cast()) };
+    let mut c = unsafe { vld1q_u32(last_block.as_ptr().add(32).cast()) };
+    let mut d = unsafe { vld1q_u32(last_block.as_ptr().add(48).cast()) };
+
+    for (i, chunk) in input.chunks(64).enumerate() {
+        let pos = if i % 2 == 0 {
+            (i / 2) * 64
+        } else {
+            (i / 2) * 64 + input.len() / 2
+        };
+
+        unsafe {
+            let chunk_a = vld1q_u32(chunk.as_ptr().cast());
+            let chunk_b = vld1q_u32(chunk.as_ptr().add(16).cast());
+            let chunk_c = vld1q_u32(chunk.as_ptr().add(32).cast());
+            let chunk_d = vld1q_u32(chunk.as_ptr().add(48).cast());
+
+            a = veorq_u32(a, chunk_a);
+            b = veorq_u32(b, chunk_b);
+            c = veorq_u32(c, chunk_c);
+            d = veorq_u32(d, chunk_d);
+
+            let saves = [a, b, c, d];
+
+            for _ in 0..8 {
+                b = veorq_u32(b, vrol_u32!(vaddq_u32(a, d), 7));
+                c = veorq_u32(c, vrol_u32!(vaddq_u32(b, a), 9));
+                d = veorq_u32(d, vrol_u32!(vaddq_u32(c, b), 13));
+                a = veorq_u32(a, vrol_u32!(vaddq_u32(d, c), 18));
+
+                d = vextq_u32(d, d, 1);
+                c = vextq_u32(c, c, 2);
+                b = vextq_u32(b, b, 3);
+
+                (b, d) = (d, b);
+            }
+
+            a = vaddq_u32(a, saves[0]);
+            b = vaddq_u32(b, saves[1]);
+            c = vaddq_u32(c, saves[2]);
+            d = vaddq_u32(d, saves[3]);
+
+            vst1q_u32(output.as_mut_ptr().add(pos).cast(), a);
+            vst1q_u32(output.as_mut_ptr().add(pos + 16).cast(), b);
+            vst1q_u32(output.as_mut_ptr().add(pos + 32).cast(), c);
+            vst1q_u32(output.as_mut_ptr().add(pos + 48).cast(), d);
+        }
+    }
+}
diff --git a/scrypt/src/block_mix/pivot.rs b/scrypt/src/block_mix/pivot.rs
new file mode 100644
index 00000000..3839ad70
--- /dev/null
+++ b/scrypt/src/block_mix/pivot.rs
@@ -0,0 +1,20 @@
+/// Permute Salsa20 block to column major order
+pub(crate) const PIVOT_ABCD: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11];
+
+/// Inverse of PIVOT_ABCD
+pub(crate) const INVERSE_PIVOT_ABCD: [usize; 16] = const {
+    let mut index = [0; 16];
+    let mut i = 0;
+    while i < 16 {
+        let mut inverse = 0;
+        while inverse < 16 {
+            if PIVOT_ABCD[inverse] == i {
+                index[i] = inverse;
+                break;
+            }
+            inverse += 1;
+        }
+        i += 1;
+    }
+    index
+};
diff --git a/scrypt/src/block_mix/simd128.rs b/scrypt/src/block_mix/simd128.rs
new file mode 100644
index 00000000..0757ffc2
--- /dev/null
+++ b/scrypt/src/block_mix/simd128.rs
@@ -0,0 +1,88 @@
+use crate::block_mix::pivot::{INVERSE_PIVOT_ABCD, PIVOT_ABCD};
+
+pub(crate) fn shuffle_in(b: &mut [u8]) {
+    for chunk in b.chunks_exact_mut(64) {
+        let mut t = [0u32; 16];
+        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
+            *b = u32::from_ne_bytes(c.try_into().unwrap());
+        }
+        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
+            b.copy_from_slice(&t[PIVOT_ABCD[i]].to_ne_bytes());
+        });
+    }
+}
+
+pub(crate) fn shuffle_out(b: &mut [u8]) {
+    for chunk in b.chunks_exact_mut(64) {
+        let mut t = [0u32; 16];
+        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
+            *b = u32::from_ne_bytes(c.try_into().unwrap());
+        }
+        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
+            b.copy_from_slice(&t[INVERSE_PIVOT_ABCD[i]].to_ne_bytes());
+        });
+    }
+}
+
+pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
+    use core::arch::wasm32::*;
+
+    macro_rules! u32x4_rol {
+        ($w:expr, $amt:literal) => {{
+            let w = $w;
+            v128_or(u32x4_shl(w, $amt), u32x4_shr(w, 32 - $amt))
+        }};
+    }
+
+    let last_block = &input[input.len() - 64..];
+
+    let mut a = unsafe { v128_load(last_block.as_ptr().cast()) };
+    let mut b = unsafe { v128_load(last_block.as_ptr().add(16).cast()) };
+    let mut c = unsafe { v128_load(last_block.as_ptr().add(32).cast()) };
+    let mut d = unsafe { v128_load(last_block.as_ptr().add(48).cast()) };
+
+    for (i, chunk) in input.chunks(64).enumerate() {
+        let pos = if i % 2 == 0 {
+            (i / 2) * 64
+        } else {
+            (i / 2) * 64 + input.len() / 2
+        };
+
+        unsafe {
+            let chunk_a = v128_load(chunk.as_ptr().cast());
+            let chunk_b = v128_load(chunk.as_ptr().add(16).cast());
+            let chunk_c = v128_load(chunk.as_ptr().add(32).cast());
+            let chunk_d = v128_load(chunk.as_ptr().add(48).cast());
+
+            a = v128_xor(a, chunk_a);
+            b = v128_xor(b, chunk_b);
+            c = v128_xor(c, chunk_c);
+            d = v128_xor(d, chunk_d);
+
+            let saves = [a, b, c, d];
+
+            for _ in 0..8 {
+                b = v128_xor(b, u32x4_rol!(u32x4_add(a, d), 7));
+                c = v128_xor(c, u32x4_rol!(u32x4_add(b, a), 9));
+                d = v128_xor(d, u32x4_rol!(u32x4_add(c, b), 13));
+                a = v128_xor(a, u32x4_rol!(u32x4_add(d, c), 18));
+
+                d = i32x4_shuffle::<1, 2, 3, 0>(d, d);
+                c = i32x4_shuffle::<2, 3, 0, 1>(c, c);
+                b = i32x4_shuffle::<3, 0, 1, 2>(b, b);
+
+                (b, d) = (d, b);
+            }
+
+            a = u32x4_add(a, saves[0]);
+            b = u32x4_add(b, saves[1]);
+            c = u32x4_add(c, saves[2]);
+            d = u32x4_add(d, saves[3]);
+
+            v128_store(output.as_mut_ptr().add(pos).cast(), a);
+            v128_store(output.as_mut_ptr().add(pos + 16).cast(), b);
+            v128_store(output.as_mut_ptr().add(pos + 32).cast(), c);
+            v128_store(output.as_mut_ptr().add(pos + 48).cast(), d);
+        }
+    }
+}
diff --git a/scrypt/src/block_mix/soft.rs b/scrypt/src/block_mix/soft.rs
new file mode 100644
index 00000000..9dbdd984
--- /dev/null
+++ b/scrypt/src/block_mix/soft.rs
@@ -0,0 +1,42 @@
+/// Execute the BlockMix operation
+/// input - the input vector. The length must be a multiple of 128.
+/// output - the output vector. Must be the same length as input.
+pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
+    use salsa20::{
+        SalsaCore,
+        cipher::{StreamCipherCore, typenum::U4},
+    };
+
+    type Salsa20_8 = SalsaCore<U4>;
+
+    let mut x = [0u8; 64];
+    x.copy_from_slice(&input[input.len() - 64..]);
+
+    let mut t = [0u8; 64];
+
+    for (i, chunk) in input.chunks(64).enumerate() {
+        xor(&x, chunk, &mut t);
+
+        let mut t2 = [0u32; 16];
+
+        for (c, b) in t.chunks_exact(4).zip(t2.iter_mut()) {
+            *b = u32::from_le_bytes(c.try_into().unwrap());
+        }
+
+        Salsa20_8::from_raw_state(t2).write_keystream_block((&mut x).into());
+
+        let pos = if i % 2 == 0 {
+            (i / 2) * 64
+        } else {
+            (i / 2) * 64 + input.len() / 2
+        };
+
+        output[pos..pos + 64].copy_from_slice(&x);
+    }
+}
+
+fn xor(x: &[u8], y: &[u8], output: &mut [u8]) {
+    for ((out, &x_i), &y_i) in output.iter_mut().zip(x.iter()).zip(y.iter()) {
+        *out = x_i ^ y_i;
+    }
+}
diff --git a/scrypt/src/block_mix/sse2.rs b/scrypt/src/block_mix/sse2.rs
new file mode 100644
index 00000000..1daebe57
--- /dev/null
+++ b/scrypt/src/block_mix/sse2.rs
@@ -0,0 +1,90 @@
+use crate::block_mix::pivot::{INVERSE_PIVOT_ABCD, PIVOT_ABCD};
+
+pub(crate) fn shuffle_in(b: &mut [u8]) {
+    for chunk in b.chunks_exact_mut(64) {
+        let mut t = [0u32; 16];
+        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
+            *b = u32::from_ne_bytes(c.try_into().unwrap());
+        }
+        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
+            b.copy_from_slice(&t[PIVOT_ABCD[i]].to_ne_bytes());
+        });
+    }
+}
+
+pub(crate) fn shuffle_out(b: &mut [u8]) {
+    for chunk in b.chunks_exact_mut(64) {
+        let mut t = [0u32; 16];
+        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
+            *b = u32::from_ne_bytes(c.try_into().unwrap());
+        }
+        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
+            b.copy_from_slice(&t[INVERSE_PIVOT_ABCD[i]].to_ne_bytes());
+        });
+    }
+}
+
+pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
+    #[cfg(target_arch = "x86")]
+    use core::arch::x86::*;
+
+    #[cfg(target_arch = "x86_64")]
+    use core::arch::x86_64::*;
+
+    macro_rules! mm_rol_epi32x {
+        ($w:expr, $amt:literal) => {{
+            let w = $w;
+            _mm_or_si128(_mm_slli_epi32(w, $amt), _mm_srli_epi32(w, 32 - $amt))
+        }};
+    }
+
+    let last_block = &input[input.len() - 64..];
+
+    let mut a = unsafe { _mm_loadu_si128(last_block.as_ptr().cast()) };
+    let mut b = unsafe { _mm_loadu_si128(last_block.as_ptr().add(16).cast()) };
+    let mut c = unsafe { _mm_loadu_si128(last_block.as_ptr().add(32).cast()) };
+    let mut d = unsafe { _mm_loadu_si128(last_block.as_ptr().add(48).cast()) };
+
+    for (i, chunk) in input.chunks(64).enumerate() {
+        let pos = if i % 2 == 0 {
+            (i / 2) * 64
+        } else {
+            (i / 2) * 64 + input.len() / 2
+        };
+
+        unsafe {
+            a = _mm_xor_si128(a, _mm_loadu_si128(chunk.as_ptr().cast()));
+            b = _mm_xor_si128(b, _mm_loadu_si128(chunk.as_ptr().add(16).cast()));
+            c = _mm_xor_si128(c, _mm_loadu_si128(chunk.as_ptr().add(32).cast()));
+            d = _mm_xor_si128(d, _mm_loadu_si128(chunk.as_ptr().add(48).cast()));
+
+            let saves = [a, b, c, d];
+
+            for _ in 0..8 {
+                b = _mm_xor_si128(b, mm_rol_epi32x!(_mm_add_epi32(a, d), 7));
+                c = _mm_xor_si128(c, mm_rol_epi32x!(_mm_add_epi32(b, a), 9));
+                d = _mm_xor_si128(d, mm_rol_epi32x!(_mm_add_epi32(c, b), 13));
+                a = _mm_xor_si128(a, mm_rol_epi32x!(_mm_add_epi32(d, c), 18));
+
+                // a stays in place
+                // b = left shuffle d by 1 element
+                d = _mm_shuffle_epi32(d, 0b00_11_10_01);
+                // c = left shuffle c by 2 elements
+                c = _mm_shuffle_epi32(c, 0b01_00_11_10);
+                // d = left shuffle b by 3 elements
+                b = _mm_shuffle_epi32(b, 0b10_01_00_11);
+                (b, d) = (d, b);
+            }
+
+            a = _mm_add_epi32(a, saves[0]);
+            b = _mm_add_epi32(b, saves[1]);
+            c = _mm_add_epi32(c, saves[2]);
+            d = _mm_add_epi32(d, saves[3]);
+
+            _mm_storeu_si128(output.as_mut_ptr().add(pos).cast(), a);
+            _mm_storeu_si128(output.as_mut_ptr().add(pos + 16).cast(), b);
+            _mm_storeu_si128(output.as_mut_ptr().add(pos + 32).cast(), c);
+            _mm_storeu_si128(output.as_mut_ptr().add(pos + 48).cast(), d);
+        }
+    }
+}
diff --git a/scrypt/src/lib.rs b/scrypt/src/lib.rs
index 7c5fff80..dd426499 100644
--- a/scrypt/src/lib.rs
+++ b/scrypt/src/lib.rs
@@ -55,6 +55,7 @@ extern crate alloc;
 use pbkdf2::pbkdf2_hmac;
 use sha2::Sha256;
 
+mod block_mix;
 /// Errors for `scrypt` operations.
 pub mod errors;
 mod params;
diff --git a/scrypt/src/romix.rs b/scrypt/src/romix.rs
index 1187de58..1be655dc 100644
--- a/scrypt/src/romix.rs
+++ b/scrypt/src/romix.rs
@@ -1,34 +1,3 @@
-#[cfg(any(
-    target_arch = "x86",
-    target_arch = "x86_64",
-    all(target_arch = "wasm32", target_feature = "simd128")
-))]
-/// Permute Salsa20 block to column major order
-const PIVOT_ABCD: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11];
-
-#[cfg(any(
-    target_arch = "x86",
-    target_arch = "x86_64",
-    all(target_arch = "wasm32", target_feature = "simd128")
-))]
-/// Inverse of PIVOT_ABCD
-const INVERSE_PIVOT_ABCD: [usize; 16] = const {
-    let mut index = [0; 16];
-    let mut i = 0;
-    while i < 16 {
-        let mut inverse = 0;
-        while inverse < 16 {
-            if PIVOT_ABCD[inverse] == i {
-                index[i] = inverse;
-                break;
-            }
-            inverse += 1;
-        }
-        i += 1;
-    }
-    index
-};
-
 /// Execute the ROMix operation in-place.
 /// b - the data to operate on
 /// v - a temporary variable to store the vector V
@@ -49,249 +18,21 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize)
 
     let len = b.len();
 
-    #[cfg(any(
-        target_arch = "x86",
-        target_arch = "x86_64",
-        all(target_arch = "wasm32", target_feature = "simd128")
-    ))]
-    for chunk in b.chunks_exact_mut(64) {
-        let mut t = [0u32; 16];
-        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
-            *b = u32::from_ne_bytes(c.try_into().unwrap());
-        }
-        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
-            b.copy_from_slice(&t[PIVOT_ABCD[i]].to_ne_bytes());
-        });
-    }
+    crate::block_mix::shuffle_in(b);
 
     for chunk in v.chunks_mut(len) {
         chunk.copy_from_slice(b);
-
-        #[cfg(any(
-            target_arch = "x86",
-            target_arch = "x86_64",
-            all(target_arch = "wasm32", target_feature = "simd128")
-        ))]
-        scrypt_block_mix_abcd(chunk, b);
-
-        #[cfg(not(any(
-            target_arch = "x86",
-            target_arch = "x86_64",
-            all(target_arch = "wasm32", target_feature = "simd128")
-        )))]
-        scrypt_block_mix(chunk, b);
+        crate::block_mix::scrypt_block_mix(chunk, b);
     }
 
     for _ in 0..n {
         let j = integerify(b, n);
         xor(b, &v[j * len..(j + 1) * len], t);
 
-        #[cfg(any(
-            target_arch = "x86",
-            target_arch = "x86_64",
-            all(target_arch = "wasm32", target_feature = "simd128")
-        ))]
-        scrypt_block_mix_abcd(t, b);
-
-        #[cfg(not(any(
-            target_arch = "x86",
-            target_arch = "x86_64",
-            all(target_arch = "wasm32", target_feature = "simd128")
-        )))]
-        scrypt_block_mix(t, b);
-    }
-
-    #[cfg(any(
-        target_arch = "x86",
-        target_arch = "x86_64",
-        all(target_arch = "wasm32", target_feature = "simd128")
-    ))]
-    for chunk in b.chunks_exact_mut(64) {
-        let mut t = [0u32; 16];
-        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
-            *b = u32::from_ne_bytes(c.try_into().unwrap());
-        }
-        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
-            b.copy_from_slice(&t[INVERSE_PIVOT_ABCD[i]].to_ne_bytes());
-        });
-    }
-}
-
-/// Execute the BlockMix operation
-/// input - the input vector. The length must be a multiple of 128.
-/// output - the output vector. Must be the same length as input.
-#[cfg(not(any(
-    target_arch = "x86",
-    target_arch = "x86_64",
-    all(target_arch = "wasm32", target_feature = "simd128")
-)))]
-fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
-    use salsa20::{
-        SalsaCore,
-        cipher::{StreamCipherCore, typenum::U4},
-    };
-
-    type Salsa20_8 = SalsaCore<U4>;
-
-    let mut x = [0u8; 64];
-    x.copy_from_slice(&input[input.len() - 64..]);
-
-    let mut t = [0u8; 64];
-
-    for (i, chunk) in input.chunks(64).enumerate() {
-        xor(&x, chunk, &mut t);
-
-        let mut t2 = [0u32; 16];
-
-        for (c, b) in t.chunks_exact(4).zip(t2.iter_mut()) {
-            *b = u32::from_le_bytes(c.try_into().unwrap());
-        }
-
-        Salsa20_8::from_raw_state(t2).write_keystream_block((&mut x).into());
-
-        let pos = if i % 2 == 0 {
-            (i / 2) * 64
-        } else {
-            (i / 2) * 64 + input.len() / 2
-        };
-
-        output[pos..pos + 64].copy_from_slice(&x);
-    }
-}
-
-/// Execute the BlockMix operation with pre-shuffled input.
-/// input - the input vector. The length must be a multiple of 128.
-/// output - the output vector. Must be the same length as input.
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn scrypt_block_mix_abcd(input: &[u8], output: &mut [u8]) {
-    #[cfg(target_arch = "x86")]
-    use core::arch::x86::*;
-
-    #[cfg(target_arch = "x86_64")]
-    use core::arch::x86_64::*;
-
-    macro_rules! mm_rol_epi32x {
-        ($w:expr, $amt:literal) => {{
-            let w = $w;
-            _mm_or_si128(_mm_slli_epi32(w, $amt), _mm_srli_epi32(w, 32 - $amt))
-        }};
+        crate::block_mix::scrypt_block_mix(t, b);
     }
 
-    let last_block = &input[input.len() - 64..];
-
-    let mut a = unsafe { _mm_loadu_si128(last_block.as_ptr().cast()) };
-    let mut b = unsafe { _mm_loadu_si128(last_block.as_ptr().add(16).cast()) };
-    let mut c = unsafe { _mm_loadu_si128(last_block.as_ptr().add(32).cast()) };
-    let mut d = unsafe { _mm_loadu_si128(last_block.as_ptr().add(48).cast()) };
-
-    for (i, chunk) in input.chunks(64).enumerate() {
-        let pos = if i % 2 == 0 {
-            (i / 2) * 64
-        } else {
-            (i / 2) * 64 + input.len() / 2
-        };
-
-        unsafe {
-            a = _mm_xor_si128(a, _mm_loadu_si128(chunk.as_ptr().cast()));
-            b = _mm_xor_si128(b, _mm_loadu_si128(chunk.as_ptr().add(16).cast()));
-            c = _mm_xor_si128(c, _mm_loadu_si128(chunk.as_ptr().add(32).cast()));
-            d = _mm_xor_si128(d, _mm_loadu_si128(chunk.as_ptr().add(48).cast()));
-
-            let saves = [a, b, c, d];
-
-            for _ in 0..8 {
-                b = _mm_xor_si128(b, mm_rol_epi32x!(_mm_add_epi32(a, d), 7));
-                c = _mm_xor_si128(c, mm_rol_epi32x!(_mm_add_epi32(b, a), 9));
-                d = _mm_xor_si128(d, mm_rol_epi32x!(_mm_add_epi32(c, b), 13));
-                a = _mm_xor_si128(a, mm_rol_epi32x!(_mm_add_epi32(d, c), 18));
-
-                // a stays in place
-                // b = left shuffle d by 1 element
-                d = _mm_shuffle_epi32(d, 0b00_11_10_01);
-                // c = left shuffle c by 2 elements
-                c = _mm_shuffle_epi32(c, 0b01_00_11_10);
-                // d = left shuffle b by 3 elements
-                b = _mm_shuffle_epi32(b, 0b10_01_00_11);
-                (b, d) = (d, b);
-            }
-
-            a = _mm_add_epi32(a, saves[0]);
-            b = _mm_add_epi32(b, saves[1]);
-            c = _mm_add_epi32(c, saves[2]);
-            d = _mm_add_epi32(d, saves[3]);
-
-            _mm_storeu_si128(output.as_mut_ptr().add(pos).cast(), a);
-            _mm_storeu_si128(output.as_mut_ptr().add(pos + 16).cast(), b);
-            _mm_storeu_si128(output.as_mut_ptr().add(pos + 32).cast(), c);
-            _mm_storeu_si128(output.as_mut_ptr().add(pos + 48).cast(), d);
-        }
-    }
-}
-
-/// Execute the BlockMix operation with pre-shuffled input.
-/// input - the input vector. The length must be a multiple of 128.
-/// output - the output vector. Must be the same length as input.
-#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
-fn scrypt_block_mix_abcd(input: &[u8], output: &mut [u8]) {
-    use core::arch::wasm32::*;
-
-    macro_rules! u32x4_rol {
-        ($x:expr, $amt:literal) => {
-            v128_or(u32x4_shl($x, $amt), u32x4_shr($x, 32 - $amt))
-        };
-    }
-
-    let last_block = &input[input.len() - 64..];
-
-    let mut a = unsafe { v128_load(last_block.as_ptr().cast()) };
-    let mut b = unsafe { v128_load(last_block.as_ptr().add(16).cast()) };
-    let mut c = unsafe { v128_load(last_block.as_ptr().add(32).cast()) };
-    let mut d = unsafe { v128_load(last_block.as_ptr().add(48).cast()) };
-
-    for (i, chunk) in input.chunks(64).enumerate() {
-        let pos = if i % 2 == 0 {
-            (i / 2) * 64
-        } else {
-            (i / 2) * 64 + input.len() / 2
-        };
-
-        unsafe {
-            let chunk_a = v128_load(chunk.as_ptr().cast());
-            let chunk_b = v128_load(chunk.as_ptr().add(16).cast());
-            let chunk_c = v128_load(chunk.as_ptr().add(32).cast());
-            let chunk_d = v128_load(chunk.as_ptr().add(48).cast());
-
-            a = v128_xor(a, chunk_a);
-            b = v128_xor(b, chunk_b);
-            c = v128_xor(c, chunk_c);
-            d = v128_xor(d, chunk_d);
-
-            let saves = [a, b, c, d];
-
-            for _ in 0..8 {
-                b = v128_xor(b, u32x4_rol!(u32x4_add(a, d), 7));
-                c = v128_xor(c, u32x4_rol!(u32x4_add(b, a), 9));
-                d = v128_xor(d, u32x4_rol!(u32x4_add(c, b), 13));
-                a = v128_xor(a, u32x4_rol!(u32x4_add(d, c), 18));
-
-                d = i32x4_shuffle::<1, 2, 3, 0>(d, d);
-                c = i32x4_shuffle::<2, 3, 0, 1>(c, c);
-                b = i32x4_shuffle::<3, 0, 1, 2>(b, b);
-
-                (b, d) = (d, b);
-            }
-
-            a = u32x4_add(a, saves[0]);
-            b = u32x4_add(b, saves[1]);
-            c = u32x4_add(c, saves[2]);
-            d = u32x4_add(d, saves[3]);
-
-            v128_store(output.as_mut_ptr().add(pos).cast(), a);
-            v128_store(output.as_mut_ptr().add(pos + 16).cast(), b);
-            v128_store(output.as_mut_ptr().add(pos + 32).cast(), c);
-            v128_store(output.as_mut_ptr().add(pos + 48).cast(), d);
-        }
-    }
+    crate::block_mix::shuffle_out(b);
 }
 
 fn xor(x: &[u8], y: &[u8], output: &mut [u8]) {

From 94bef2fcf8e5945739ed9e445854be3202a4c236 Mon Sep 17 00:00:00 2001
From: eternal-flame-AD <yume@yumechi.jp>
Date: Fri, 1 Aug 2025 08:06:38 -0500
Subject: [PATCH 6/8] apply suggestions

Signed-off-by: eternal-flame-AD <yume@yumechi.jp>
---
 scrypt/src/{block_mix/mod.rs => block_mix.rs} | 22 ++++++-------------
 1 file changed, 7 insertions(+), 15 deletions(-)
 rename scrypt/src/{block_mix/mod.rs => block_mix.rs} (81%)

diff --git a/scrypt/src/block_mix/mod.rs b/scrypt/src/block_mix.rs
similarity index 81%
rename from scrypt/src/block_mix/mod.rs
rename to scrypt/src/block_mix.rs
index abd55930..968a15e6 100644
--- a/scrypt/src/block_mix/mod.rs
+++ b/scrypt/src/block_mix.rs
@@ -1,16 +1,3 @@
-#[cfg(any(
-    test,
-    not(any(
-        all(
-            any(target_arch = "x86", target_arch = "x86_64"),
-            target_feature = "sse2"
-        ),
-        all(target_arch = "aarch64", target_feature = "neon"),
-        all(target_arch = "wasm32", target_feature = "simd128"),
-    ))
-))]
-mod soft;
-
 cfg_if::cfg_if! {
     if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
         mod pivot;
@@ -25,6 +12,7 @@ cfg_if::cfg_if! {
         mod sse2;
         pub(crate) use sse2::{scrypt_block_mix, shuffle_in, shuffle_out};
     } else {
+        mod soft;
         pub(crate) use soft::scrypt_block_mix;
 
         pub(crate) fn shuffle_in(_input: &mut [u8]) {}
@@ -32,6 +20,10 @@ cfg_if::cfg_if! {
     }
 }
 
+#[cfg(test)]
+#[path = "block_mix/soft.rs"]
+mod soft_test;
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -44,11 +36,11 @@ mod tests {
 
             let mut expected0 = [0u8; 128];
             let mut expected1 = [0u8; 128]; // check shuffle_out is a correct inverse of shuffle_in
-            soft::scrypt_block_mix(&input, &mut expected0);
+            soft_test::scrypt_block_mix(&input, &mut expected0);
             shuffle_in(&mut input);
             scrypt_block_mix(&input, &mut output);
             shuffle_out(&mut input);
-            soft::scrypt_block_mix(&input, &mut expected1);
+            soft_test::scrypt_block_mix(&input, &mut expected1);
             shuffle_out(&mut output);
             assert_eq!(
                 expected0, expected1,

From c05a6d4885ff14b4b426f30397d8533d56fa61ec Mon Sep 17 00:00:00 2001
From: eternal-flame-AD <yume@yumechi.jp>
Date: Fri, 1 Aug 2025 12:25:38 -0500
Subject: [PATCH 7/8] neon: use multiple-register load/store

Signed-off-by: eternal-flame-AD <yume@yumechi.jp>
---
 scrypt/src/block_mix/neon.rs | 36 +++++++++++++++---------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/scrypt/src/block_mix/neon.rs b/scrypt/src/block_mix/neon.rs
index e1c0a319..b5531f54 100644
--- a/scrypt/src/block_mix/neon.rs
+++ b/scrypt/src/block_mix/neon.rs
@@ -36,10 +36,7 @@ pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
 
     let last_block = &input[input.len() - 64..];
 
-    let mut a = unsafe { vld1q_u32(last_block.as_ptr().cast()) };
-    let mut b = unsafe { vld1q_u32(last_block.as_ptr().add(16).cast()) };
-    let mut c = unsafe { vld1q_u32(last_block.as_ptr().add(32).cast()) };
-    let mut d = unsafe { vld1q_u32(last_block.as_ptr().add(48).cast()) };
+    let mut x = unsafe { vld1q_u32_x4(last_block.as_ptr().cast()) };
 
     for (i, chunk) in input.chunks(64).enumerate() {
         let pos = if i % 2 == 0 {
@@ -49,17 +46,17 @@ pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
         };
 
         unsafe {
-            let chunk_a = vld1q_u32(chunk.as_ptr().cast());
-            let chunk_b = vld1q_u32(chunk.as_ptr().add(16).cast());
-            let chunk_c = vld1q_u32(chunk.as_ptr().add(32).cast());
-            let chunk_d = vld1q_u32(chunk.as_ptr().add(48).cast());
+            let chunk = vld1q_u32_x4(chunk.as_ptr().cast());
 
-            a = veorq_u32(a, chunk_a);
-            b = veorq_u32(b, chunk_b);
-            c = veorq_u32(c, chunk_c);
-            d = veorq_u32(d, chunk_d);
+            x.0 = veorq_u32(x.0, chunk.0);
+            x.1 = veorq_u32(x.1, chunk.1);
+            x.2 = veorq_u32(x.2, chunk.2);
+            x.3 = veorq_u32(x.3, chunk.3);
 
-            let saves = [a, b, c, d];
+            let mut a = x.0;
+            let mut b = x.1;
+            let mut c = x.2;
+            let mut d = x.3;
 
             for _ in 0..8 {
                 b = veorq_u32(b, vrol_u32!(vaddq_u32(a, d), 7));
@@ -74,15 +71,12 @@ pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
                 (b, d) = (d, b);
             }
 
-            a = vaddq_u32(a, saves[0]);
-            b = vaddq_u32(b, saves[1]);
-            c = vaddq_u32(c, saves[2]);
-            d = vaddq_u32(d, saves[3]);
+            x.0 = vaddq_u32(x.0, a);
+            x.1 = vaddq_u32(x.1, b);
+            x.2 = vaddq_u32(x.2, c);
+            x.3 = vaddq_u32(x.3, d);
 
-            vst1q_u32(output.as_mut_ptr().add(pos).cast(), a);
-            vst1q_u32(output.as_mut_ptr().add(pos + 16).cast(), b);
-            vst1q_u32(output.as_mut_ptr().add(pos + 32).cast(), c);
-            vst1q_u32(output.as_mut_ptr().add(pos + 48).cast(), d);
+            vst1q_u32_x4(output.as_mut_ptr().add(pos).cast(), x);
         }
     }
 }

From aea4297697c5432d5ae575ddb6b97ed3a8e6ba13 Mon Sep 17 00:00:00 2001
From: eternal-flame-AD <yume@yumechi.jp>
Date: Fri, 1 Aug 2025 14:55:44 -0500
Subject: [PATCH 8/8] remove neon backend for performance regression

Signed-off-by: eternal-flame-AD <yume@yumechi.jp>
---
 scrypt/src/block_mix.rs      |  6 +--
 scrypt/src/block_mix/neon.rs | 82 ------------------------------------
 2 files changed, 1 insertion(+), 87 deletions(-)
 delete mode 100644 scrypt/src/block_mix/neon.rs

diff --git a/scrypt/src/block_mix.rs b/scrypt/src/block_mix.rs
index 968a15e6..66082888 100644
--- a/scrypt/src/block_mix.rs
+++ b/scrypt/src/block_mix.rs
@@ -1,9 +1,5 @@
 cfg_if::cfg_if! {
-    if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
-        mod pivot;
-        mod neon;
-        pub(crate) use neon::{scrypt_block_mix, shuffle_in, shuffle_out};
-    } else if #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] {
+    if #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] {
         mod pivot;
         mod simd128;
         pub(crate) use simd128::{scrypt_block_mix, shuffle_in, shuffle_out};
diff --git a/scrypt/src/block_mix/neon.rs b/scrypt/src/block_mix/neon.rs
deleted file mode 100644
index b5531f54..00000000
--- a/scrypt/src/block_mix/neon.rs
+++ /dev/null
@@ -1,82 +0,0 @@
-use crate::block_mix::pivot::{INVERSE_PIVOT_ABCD, PIVOT_ABCD};
-
-pub(crate) fn shuffle_in(b: &mut [u8]) {
-    for chunk in b.chunks_exact_mut(64) {
-        let mut t = [0u32; 16];
-        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
-            *b = u32::from_ne_bytes(c.try_into().unwrap());
-        }
-        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
-            b.copy_from_slice(&t[PIVOT_ABCD[i]].to_ne_bytes());
-        });
-    }
-}
-
-pub(crate) fn shuffle_out(b: &mut [u8]) {
-    for chunk in b.chunks_exact_mut(64) {
-        let mut t = [0u32; 16];
-        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
-            *b = u32::from_ne_bytes(c.try_into().unwrap());
-        }
-        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
-            b.copy_from_slice(&t[INVERSE_PIVOT_ABCD[i]].to_ne_bytes());
-        });
-    }
-}
-
-pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
-    use core::arch::aarch64::*;
-
-    macro_rules! vrol_u32 {
-        ($w:expr, $amt:literal) => {{
-            let w = $w;
-            vsraq_n_u32(vshlq_n_u32(w, $amt), w, 32 - $amt)
-        }};
-    }
-
-    let last_block = &input[input.len() - 64..];
-
-    let mut x = unsafe { vld1q_u32_x4(last_block.as_ptr().cast()) };
-
-    for (i, chunk) in input.chunks(64).enumerate() {
-        let pos = if i % 2 == 0 {
-            (i / 2) * 64
-        } else {
-            (i / 2) * 64 + input.len() / 2
-        };
-
-        unsafe {
-            let chunk = vld1q_u32_x4(chunk.as_ptr().cast());
-
-            x.0 = veorq_u32(x.0, chunk.0);
-            x.1 = veorq_u32(x.1, chunk.1);
-            x.2 = veorq_u32(x.2, chunk.2);
-            x.3 = veorq_u32(x.3, chunk.3);
-
-            let mut a = x.0;
-            let mut b = x.1;
-            let mut c = x.2;
-            let mut d = x.3;
-
-            for _ in 0..8 {
-                b = veorq_u32(b, vrol_u32!(vaddq_u32(a, d), 7));
-                c = veorq_u32(c, vrol_u32!(vaddq_u32(b, a), 9));
-                d = veorq_u32(d, vrol_u32!(vaddq_u32(c, b), 13));
-                a = veorq_u32(a, vrol_u32!(vaddq_u32(d, c), 18));
-
-                d = vextq_u32(d, d, 1);
-                c = vextq_u32(c, c, 2);
-                b = vextq_u32(b, b, 3);
-
-                (b, d) = (d, b);
-            }
-
-            x.0 = vaddq_u32(x.0, a);
-            x.1 = vaddq_u32(x.1, b);
-            x.2 = vaddq_u32(x.2, c);
-            x.3 = vaddq_u32(x.3, d);
-
-            vst1q_u32_x4(output.as_mut_ptr().add(pos).cast(), x);
-        }
-    }
-}