From 9a017e96d5e3ba3e66b3c4c03ec8d703426b5a47 Mon Sep 17 00:00:00 2001 From: eternal-flame-AD Date: Thu, 31 Jul 2025 22:11:12 -0500 Subject: [PATCH 1/8] scrypt: sse2 RoMix optimization Signed-off-by: eternal-flame-AD --- scrypt/src/romix.rs | 116 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/scrypt/src/romix.rs b/scrypt/src/romix.rs index bd3c56dd..38b8ae51 100644 --- a/scrypt/src/romix.rs +++ b/scrypt/src/romix.rs @@ -1,3 +1,26 @@ +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +/// Permute Salsa20 block to column major order +const PIVOT_ABCD: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11]; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +/// Inverse of PIVOT_ABCD +const INVERSE_PIVOT_ABCD: [usize; 16] = const { + let mut index = [0; 16]; + let mut i = 0; + while i < 16 { + let mut inverse = 0; + while inverse < 16 { + if PIVOT_ABCD[inverse] == i { + index[i] = inverse; + break; + } + inverse += 1; + } + i += 1; + } + index +}; + /// Execute the ROMix operation in-place. /// b - the data to operate on /// v - a temporary variable to store the vector V @@ -18,6 +41,17 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize) let len = b.len(); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + for chunk in b.chunks_exact_mut(64) { + let mut t = [0u32; 16]; + for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) { + *b = u32::from_ne_bytes(c.try_into().unwrap()); + } + chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| { + b.copy_from_slice(&t[PIVOT_ABCD[i]].to_ne_bytes()); + }); + } + for chunk in v.chunks_mut(len) { chunk.copy_from_slice(b); scrypt_block_mix(chunk, b); @@ -28,11 +62,23 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize) xor(b, &v[j * len..(j + 1) * len], t); scrypt_block_mix(t, b); } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + for chunk in b.chunks_exact_mut(64) { + let mut t = [0u32; 16]; + for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) { + *b = u32::from_ne_bytes(c.try_into().unwrap()); + } + chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| { + b.copy_from_slice(&t[INVERSE_PIVOT_ABCD[i]].to_ne_bytes()); + }); + } } /// Execute the BlockMix operation /// input - the input vector. The length must be a multiple of 128. /// output - the output vector. Must be the same length as input. +#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] fn scrypt_block_mix(input: &[u8], output: &mut [u8]) { use salsa20::{ SalsaCore, @@ -67,6 +113,76 @@ fn scrypt_block_mix(input: &[u8], output: &mut [u8]) { } } +/// Execute the BlockMix operation +/// input - the input vector. The length must be a multiple of 128. +/// output - the output vector. Must be the same length as input. +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn scrypt_block_mix(input: &[u8], output: &mut [u8]) { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + macro_rules! mm_rol_epi32x { + ($w:expr, $amt:literal) => {{ + let w = $w; + _mm_or_si128(_mm_slli_epi32(w, $amt), _mm_srli_epi32(w, 32 - $amt)) + }}; + } + + let mut x = [0u8; 64]; + x.copy_from_slice(&input[input.len() - 64..]); + + let mut a = unsafe { _mm_loadu_si128(x.as_ptr().cast()) }; + let mut b = unsafe { _mm_loadu_si128(x.as_ptr().add(16).cast()) }; + let mut c = unsafe { _mm_loadu_si128(x.as_ptr().add(32).cast()) }; + let mut d = unsafe { _mm_loadu_si128(x.as_ptr().add(48).cast()) }; + + for (i, chunk) in input.chunks(64).enumerate() { + let pos = if i % 2 == 0 { + (i / 2) * 64 + } else { + (i / 2) * 64 + input.len() / 2 + }; + + unsafe { + a = _mm_xor_si128(a, _mm_loadu_si128(chunk.as_ptr().cast())); + b = _mm_xor_si128(b, _mm_loadu_si128(chunk.as_ptr().add(16).cast())); + c = _mm_xor_si128(c, _mm_loadu_si128(chunk.as_ptr().add(32).cast())); + d = _mm_xor_si128(d, _mm_loadu_si128(chunk.as_ptr().add(48).cast())); + + let saves = [a, b, c, d]; + + for _ in 0..8 { + b = _mm_xor_si128(b, mm_rol_epi32x!(_mm_add_epi32(a, d), 7)); + c = _mm_xor_si128(c, mm_rol_epi32x!(_mm_add_epi32(b, a), 9)); + d = _mm_xor_si128(d, mm_rol_epi32x!(_mm_add_epi32(c, b), 13)); + a = _mm_xor_si128(a, mm_rol_epi32x!(_mm_add_epi32(d, c), 18)); + + // a stays in place + // b = left shuffle d by 1 element + d = _mm_shuffle_epi32(d, 0b00_11_10_01); + // c = left shuffle c by 2 elements + c = _mm_shuffle_epi32(c, 0b01_00_11_10); + // d = left shuffle b by 3 elements + b = _mm_shuffle_epi32(b, 0b10_01_00_11); + (b, d) = (d, b); + } + + a = _mm_add_epi32(a, saves[0]); + b = _mm_add_epi32(b, saves[1]); + c = _mm_add_epi32(c, saves[2]); + d = _mm_add_epi32(d, saves[3]); + + _mm_storeu_si128(output.as_mut_ptr().add(pos).cast(), a); + _mm_storeu_si128(output.as_mut_ptr().add(pos + 16).cast(), b); + _mm_storeu_si128(output.as_mut_ptr().add(pos + 32).cast(), c); + _mm_storeu_si128(output.as_mut_ptr().add(pos + 48).cast(), d); + } + } +} + fn xor(x: &[u8], y: &[u8], output: &mut [u8]) { for ((out, &x_i), &y_i) in output.iter_mut().zip(x.iter()).zip(y.iter()) { *out = x_i ^ y_i; From e991eb9876b9ddb65e895d2397fd698372a82237 Mon Sep 17 00:00:00 2001 From: eternal-flame-AD Date: Thu, 31 Jul 2025 23:04:33 -0500 Subject: [PATCH 2/8] use different function names for the preshuffled version Signed-off-by: eternal-flame-AD --- scrypt/src/romix.rs | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/scrypt/src/romix.rs b/scrypt/src/romix.rs index 38b8ae51..45d8ce0c 100644 --- a/scrypt/src/romix.rs +++ b/scrypt/src/romix.rs @@ -54,12 +54,22 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize) for chunk in v.chunks_mut(len) { chunk.copy_from_slice(b); + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + scrypt_block_mix_abcd(chunk, b); + + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] scrypt_block_mix(chunk, b); } for _ in 0..n { let j = integerify(b, n); xor(b, &v[j * len..(j + 1) * len], t); + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + scrypt_block_mix_abcd(t, b); + + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] scrypt_block_mix(t, b); } @@ -113,11 +123,11 @@ fn scrypt_block_mix(input: &[u8], output: &mut [u8]) { } } -/// Execute the BlockMix operation +/// Execute the BlockMix operation with pre-shuffled input. /// input - the input vector. The length must be a multiple of 128. /// output - the output vector. Must be the same length as input. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn scrypt_block_mix(input: &[u8], output: &mut [u8]) { +fn scrypt_block_mix_abcd(input: &[u8], output: &mut [u8]) { #[cfg(target_arch = "x86")] use core::arch::x86::*; @@ -131,13 +141,12 @@ fn scrypt_block_mix(input: &[u8], output: &mut [u8]) { }}; } - let mut x = [0u8; 64]; - x.copy_from_slice(&input[input.len() - 64..]); + let last_block = &input[input.len() - 64..]; - let mut a = unsafe { _mm_loadu_si128(x.as_ptr().cast()) }; - let mut b = unsafe { _mm_loadu_si128(x.as_ptr().add(16).cast()) }; - let mut c = unsafe { _mm_loadu_si128(x.as_ptr().add(32).cast()) }; - let mut d = unsafe { _mm_loadu_si128(x.as_ptr().add(48).cast()) }; + let mut a = unsafe { _mm_loadu_si128(last_block.as_ptr().cast()) }; + let mut b = unsafe { _mm_loadu_si128(last_block.as_ptr().add(16).cast()) }; + let mut c = unsafe { _mm_loadu_si128(last_block.as_ptr().add(32).cast()) }; + let mut d = unsafe { _mm_loadu_si128(last_block.as_ptr().add(48).cast()) }; for (i, chunk) in input.chunks(64).enumerate() { let pos = if i % 2 == 0 { From c65ad4eb25dceab28cb6665f45179fdb1c7119af Mon Sep 17 00:00:00 2001 From: eternal-flame-AD Date: Thu, 31 Jul 2025 23:57:31 -0500 Subject: [PATCH 3/8] wasm32 kernel Signed-off-by: eternal-flame-AD --- scrypt/src/romix.rs | 112 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 103 insertions(+), 9 deletions(-) diff --git a/scrypt/src/romix.rs b/scrypt/src/romix.rs index 45d8ce0c..715d7c5b 100644 --- a/scrypt/src/romix.rs +++ b/scrypt/src/romix.rs @@ -1,8 +1,8 @@ -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "wasm32"))] /// Permute Salsa20 block to column major order const PIVOT_ABCD: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11]; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "wasm32"))] /// Inverse of PIVOT_ABCD const INVERSE_PIVOT_ABCD: [usize; 16] = const { let mut index = [0; 16]; @@ -41,7 +41,11 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize) let len = b.len(); - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[cfg(any( + target_arch = "x86", + target_arch = "x86_64", + all(target_arch = "wasm32", target_feature = "simd128") + ))] for chunk in b.chunks_exact_mut(64) { let mut t = [0u32; 16]; for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) { @@ -55,10 +59,18 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize) for chunk in v.chunks_mut(len) { chunk.copy_from_slice(b); - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[cfg(any( + target_arch = "x86", + target_arch = "x86_64", + all(target_arch = "wasm32", target_feature = "simd128") + ))] scrypt_block_mix_abcd(chunk, b); - #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + #[cfg(not(any( + target_arch = "x86", + target_arch = "x86_64", + all(target_arch = "wasm32", target_feature = "simd128") + )))] scrypt_block_mix(chunk, b); } @@ -66,14 +78,26 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize) let j = integerify(b, n); xor(b, &v[j * len..(j + 1) * len], t); - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[cfg(any( + target_arch = "x86", + target_arch = "x86_64", + all(target_arch = "wasm32", target_feature = "simd128") + ))] scrypt_block_mix_abcd(t, b); - #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + #[cfg(not(any( + target_arch = "x86", + target_arch = "x86_64", + all(target_arch = "wasm32", target_feature = "simd128") + )))] scrypt_block_mix(t, b); } - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[cfg(any( + target_arch = "x86", + target_arch = "x86_64", + all(target_arch = "wasm32", target_feature = "simd128") + ))] for chunk in b.chunks_exact_mut(64) { let mut t = [0u32; 16]; for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) { @@ -88,7 +112,11 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize) /// Execute the BlockMix operation /// input - the input vector. The length must be a multiple of 128. /// output - the output vector. Must be the same length as input. -#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] +#[cfg(not(any( + target_arch = "x86", + target_arch = "x86_64", + all(target_arch = "wasm32", target_feature = "simd128") +)))] fn scrypt_block_mix(input: &[u8], output: &mut [u8]) { use salsa20::{ SalsaCore, @@ -192,6 +220,72 @@ fn scrypt_block_mix_abcd(input: &[u8], output: &mut [u8]) { } } +/// Execute the BlockMix operation with pre-shuffled input. +/// input - the input vector. The length must be a multiple of 128. +/// output - the output vector. Must be the same length as input. +#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] +fn scrypt_block_mix_abcd(input: &[u8], output: &mut [u8]) { + use core::arch::wasm32::*; + + macro_rules! u32x4_rol { + ($x:expr, $amt:literal) => { + v128_or(u32x4_shl($x, $amt), u32x4_shr($x, 32 - $amt)) + }; + } + + let last_block = &input[input.len() - 64..]; + + let mut a = unsafe { v128_load(last_block.as_ptr().cast()) }; + let mut b = unsafe { v128_load(last_block.as_ptr().add(16).cast()) }; + let mut c = unsafe { v128_load(last_block.as_ptr().add(32).cast()) }; + let mut d = unsafe { v128_load(last_block.as_ptr().add(48).cast()) }; + + for (i, chunk) in input.chunks(64).enumerate() { + let pos = if i % 2 == 0 { + (i / 2) * 64 + } else { + (i / 2) * 64 + input.len() / 2 + }; + + unsafe { + let chunk_a = v128_load(chunk.as_ptr().cast()); + let chunk_b = v128_load(chunk.as_ptr().add(16).cast()); + let chunk_c = v128_load(chunk.as_ptr().add(32).cast()); + let chunk_d = v128_load(chunk.as_ptr().add(48).cast()); + + a = v128_xor(a, chunk_a); + b = v128_xor(b, chunk_b); + c = v128_xor(c, chunk_c); + d = v128_xor(d, chunk_d); + + let saves = [a, b, c, d]; + + for _ in 0..8 { + b = v128_xor(b, u32x4_rol!(u32x4_add(a, d), 7)); + c = v128_xor(c, u32x4_rol!(u32x4_add(b, a), 9)); + d = v128_xor(d, u32x4_rol!(u32x4_add(c, b), 13)); + a = v128_xor(a, u32x4_rol!(u32x4_add(d, c), 18)); + + d = i32x4_shuffle::<1, 2, 3, 0>(d, d); + c = i32x4_shuffle::<2, 3, 0, 1>(c, c); + b = i32x4_shuffle::<3, 0, 1, 2>(b, b); + + (b, d) = (d, b); + } + + a = u32x4_add(a, saves[0]); + b = u32x4_add(b, saves[1]); + c = u32x4_add(c, saves[2]); + d = u32x4_add(d, saves[3]); + + v128_store(output.as_mut_ptr().add(pos).cast(), a); + v128_store(output.as_mut_ptr().add(pos + 16).cast(), b); + v128_store(output.as_mut_ptr().add(pos + 32).cast(), c); + v128_store(output.as_mut_ptr().add(pos + 48).cast(), d); + } + } +} + fn xor(x: &[u8], y: &[u8], output: &mut [u8]) { for ((out, &x_i), &y_i) in output.iter_mut().zip(x.iter()).zip(y.iter()) { *out = x_i ^ y_i; From 804c6224206e74dae74e892c07a1f44c449c2561 Mon Sep 17 00:00:00 2001 From: eternal-flame-AD Date: Fri, 1 Aug 2025 00:03:01 -0500 Subject: [PATCH 4/8] fix unused warning Signed-off-by: eternal-flame-AD --- scrypt/src/romix.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scrypt/src/romix.rs b/scrypt/src/romix.rs index 715d7c5b..1187de58 100644 --- a/scrypt/src/romix.rs +++ b/scrypt/src/romix.rs @@ -1,8 +1,16 @@ -#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "x86", + target_arch = "x86_64", + all(target_arch = "wasm32", target_feature = "simd128") +))] /// Permute Salsa20 block to column major order const PIVOT_ABCD: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11]; -#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "x86", + target_arch = "x86_64", + all(target_arch = "wasm32", target_feature = "simd128") +))] /// Inverse of PIVOT_ABCD const INVERSE_PIVOT_ABCD: [usize; 16] = const { let mut index = [0; 16]; From bae8e9837e5fc4af7ea68073ef2f0a83757001c1 Mon Sep 17 00:00:00 2001 From: eternal-flame-AD Date: Fri, 1 Aug 2025 07:37:38 -0500 Subject: [PATCH 5/8] move arch-dependent impls to block_mix module Signed-off-by: eternal-flame-AD --- Cargo.lock | 1 + scrypt/Cargo.toml | 1 + scrypt/src/block_mix/mod.rs | 68 ++++++++ scrypt/src/block_mix/neon.rs | 88 +++++++++++ scrypt/src/block_mix/pivot.rs | 20 +++ scrypt/src/block_mix/simd128.rs | 88 +++++++++++ scrypt/src/block_mix/soft.rs | 42 +++++ scrypt/src/block_mix/sse2.rs | 90 +++++++++++ scrypt/src/lib.rs | 1 + scrypt/src/romix.rs | 267 +------------------------------- 10 files changed, 403 insertions(+), 263 deletions(-) create mode 100644 scrypt/src/block_mix/mod.rs create mode 100644 scrypt/src/block_mix/neon.rs create mode 100644 scrypt/src/block_mix/pivot.rs create mode 100644 scrypt/src/block_mix/simd128.rs create mode 100644 scrypt/src/block_mix/soft.rs create mode 100644 scrypt/src/block_mix/sse2.rs diff --git a/Cargo.lock b/Cargo.lock index 11574624..aa3317d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -436,6 +436,7 @@ dependencies = [ name = "scrypt" version = "0.12.0-rc.1" dependencies = [ + "cfg-if", "password-hash", "pbkdf2", "salsa20", diff --git a/scrypt/Cargo.toml b/scrypt/Cargo.toml index 5d05607f..ed86407c 100644 --- a/scrypt/Cargo.toml +++ b/scrypt/Cargo.toml @@ -14,6 +14,7 @@ edition = "2024" rust-version = "1.85" [dependencies] +cfg-if = "1.0" pbkdf2 = { version = "0.13.0-rc.0", path = "../pbkdf2" } salsa20 = { version = "0.11.0-rc.0", default-features = false } sha2 = { version = "0.11.0-rc.0", default-features = false } diff --git a/scrypt/src/block_mix/mod.rs b/scrypt/src/block_mix/mod.rs new file mode 100644 index 00000000..abd55930 --- /dev/null +++ b/scrypt/src/block_mix/mod.rs @@ -0,0 +1,68 @@ +#[cfg(any( + test, + not(any( + all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "sse2" + ), + all(target_arch = "aarch64", target_feature = "neon"), + all(target_arch = "wasm32", target_feature = "simd128"), + )) +))] +mod soft; + +cfg_if::cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { + mod pivot; + mod neon; + pub(crate) use neon::{scrypt_block_mix, shuffle_in, shuffle_out}; + } else if #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] { + mod pivot; + mod simd128; + pub(crate) use simd128::{scrypt_block_mix, shuffle_in, shuffle_out}; + } else if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))] { + mod pivot; + mod sse2; + pub(crate) use sse2::{scrypt_block_mix, shuffle_in, shuffle_out}; + } else { + pub(crate) use soft::scrypt_block_mix; + + pub(crate) fn shuffle_in(_input: &mut [u8]) {} + pub(crate) fn shuffle_out(_input: &mut [u8]) {} + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_scrypt_block_mix_abcd_against_soft() { + let mut input: [u8; 128] = core::array::from_fn(|i| i as u8); + for _round in 0..10 { + let mut output = [0u8; 128]; + + let mut expected0 = [0u8; 128]; + let mut expected1 = [0u8; 128]; // check shuffle_out is a correct inverse of shuffle_in + soft::scrypt_block_mix(&input, &mut expected0); + shuffle_in(&mut input); + scrypt_block_mix(&input, &mut output); + shuffle_out(&mut input); + soft::scrypt_block_mix(&input, &mut expected1); + shuffle_out(&mut output); + assert_eq!( + expected0, expected1, + "expected0 != expected1, shuffle_out is not a correct inverse of shuffle_in?" + ); + assert_eq!( + output, expected0, + "output != expected0, scrypt_block_mix is not correct?" + ); + + input + .iter_mut() + .zip(output.iter()) + .for_each(|(a, b)| *a = a.wrapping_add(*b)); + } + } +} diff --git a/scrypt/src/block_mix/neon.rs b/scrypt/src/block_mix/neon.rs new file mode 100644 index 00000000..e1c0a319 --- /dev/null +++ b/scrypt/src/block_mix/neon.rs @@ -0,0 +1,88 @@ +use crate::block_mix::pivot::{INVERSE_PIVOT_ABCD, PIVOT_ABCD}; + +pub(crate) fn shuffle_in(b: &mut [u8]) { + for chunk in b.chunks_exact_mut(64) { + let mut t = [0u32; 16]; + for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) { + *b = u32::from_ne_bytes(c.try_into().unwrap()); + } + chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| { + b.copy_from_slice(&t[PIVOT_ABCD[i]].to_ne_bytes()); + }); + } +} + +pub(crate) fn shuffle_out(b: &mut [u8]) { + for chunk in b.chunks_exact_mut(64) { + let mut t = [0u32; 16]; + for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) { + *b = u32::from_ne_bytes(c.try_into().unwrap()); + } + chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| { + b.copy_from_slice(&t[INVERSE_PIVOT_ABCD[i]].to_ne_bytes()); + }); + } +} + +pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) { + use core::arch::aarch64::*; + + macro_rules! vrol_u32 { + ($w:expr, $amt:literal) => {{ + let w = $w; + vsraq_n_u32(vshlq_n_u32(w, $amt), w, 32 - $amt) + }}; + } + + let last_block = &input[input.len() - 64..]; + + let mut a = unsafe { vld1q_u32(last_block.as_ptr().cast()) }; + let mut b = unsafe { vld1q_u32(last_block.as_ptr().add(16).cast()) }; + let mut c = unsafe { vld1q_u32(last_block.as_ptr().add(32).cast()) }; + let mut d = unsafe { vld1q_u32(last_block.as_ptr().add(48).cast()) }; + + for (i, chunk) in input.chunks(64).enumerate() { + let pos = if i % 2 == 0 { + (i / 2) * 64 + } else { + (i / 2) * 64 + input.len() / 2 + }; + + unsafe { + let chunk_a = vld1q_u32(chunk.as_ptr().cast()); + let chunk_b = vld1q_u32(chunk.as_ptr().add(16).cast()); + let chunk_c = vld1q_u32(chunk.as_ptr().add(32).cast()); + let chunk_d = vld1q_u32(chunk.as_ptr().add(48).cast()); + + a = veorq_u32(a, chunk_a); + b = veorq_u32(b, chunk_b); + c = veorq_u32(c, chunk_c); + d = veorq_u32(d, chunk_d); + + let saves = [a, b, c, d]; + + for _ in 0..8 { + b = veorq_u32(b, vrol_u32!(vaddq_u32(a, d), 7)); + c = veorq_u32(c, vrol_u32!(vaddq_u32(b, a), 9)); + d = veorq_u32(d, vrol_u32!(vaddq_u32(c, b), 13)); + a = veorq_u32(a, vrol_u32!(vaddq_u32(d, c), 18)); + + d = vextq_u32(d, d, 1); + c = vextq_u32(c, c, 2); + b = vextq_u32(b, b, 3); + + (b, d) = (d, b); + } + + a = vaddq_u32(a, saves[0]); + b = vaddq_u32(b, saves[1]); + c = vaddq_u32(c, saves[2]); + d = vaddq_u32(d, saves[3]); + + vst1q_u32(output.as_mut_ptr().add(pos).cast(), a); + vst1q_u32(output.as_mut_ptr().add(pos + 16).cast(), b); + vst1q_u32(output.as_mut_ptr().add(pos + 32).cast(), c); + vst1q_u32(output.as_mut_ptr().add(pos + 48).cast(), d); + } + } +} diff --git a/scrypt/src/block_mix/pivot.rs b/scrypt/src/block_mix/pivot.rs new file mode 100644 index 00000000..3839ad70 --- /dev/null +++ b/scrypt/src/block_mix/pivot.rs @@ -0,0 +1,20 @@ +/// Permute Salsa20 block to column major order +pub(crate) const PIVOT_ABCD: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11]; + +/// Inverse of PIVOT_ABCD +pub(crate) const INVERSE_PIVOT_ABCD: [usize; 16] = const { + let mut index = [0; 16]; + let mut i = 0; + while i < 16 { + let mut inverse = 0; + while inverse < 16 { + if PIVOT_ABCD[inverse] == i { + index[i] = inverse; + break; + } + inverse += 1; + } + i += 1; + } + index +}; diff --git a/scrypt/src/block_mix/simd128.rs b/scrypt/src/block_mix/simd128.rs new file mode 100644 index 00000000..0757ffc2 --- /dev/null +++ b/scrypt/src/block_mix/simd128.rs @@ -0,0 +1,88 @@ +use crate::block_mix::pivot::{INVERSE_PIVOT_ABCD, PIVOT_ABCD}; + +pub(crate) fn shuffle_in(b: &mut [u8]) { + for chunk in b.chunks_exact_mut(64) { + let mut t = [0u32; 16]; + for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) { + *b = u32::from_ne_bytes(c.try_into().unwrap()); + } + chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| { + b.copy_from_slice(&t[PIVOT_ABCD[i]].to_ne_bytes()); + }); + } +} + +pub(crate) fn shuffle_out(b: &mut [u8]) { + for chunk in b.chunks_exact_mut(64) { + let mut t = [0u32; 16]; + for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) { + *b = u32::from_ne_bytes(c.try_into().unwrap()); + } + chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| { + b.copy_from_slice(&t[INVERSE_PIVOT_ABCD[i]].to_ne_bytes()); + }); + } +} + +pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) { + use core::arch::wasm32::*; + + macro_rules! u32x4_rol { + ($w:expr, $amt:literal) => {{ + let w = $w; + v128_or(u32x4_shl(w, $amt), u32x4_shr(w, 32 - $amt)) + }}; + } + + let last_block = &input[input.len() - 64..]; + + let mut a = unsafe { v128_load(last_block.as_ptr().cast()) }; + let mut b = unsafe { v128_load(last_block.as_ptr().add(16).cast()) }; + let mut c = unsafe { v128_load(last_block.as_ptr().add(32).cast()) }; + let mut d = unsafe { v128_load(last_block.as_ptr().add(48).cast()) }; + + for (i, chunk) in input.chunks(64).enumerate() { + let pos = if i % 2 == 0 { + (i / 2) * 64 + } else { + (i / 2) * 64 + input.len() / 2 + }; + + unsafe { + let chunk_a = v128_load(chunk.as_ptr().cast()); + let chunk_b = v128_load(chunk.as_ptr().add(16).cast()); + let chunk_c = v128_load(chunk.as_ptr().add(32).cast()); + let chunk_d = v128_load(chunk.as_ptr().add(48).cast()); + + a = v128_xor(a, chunk_a); + b = v128_xor(b, chunk_b); + c = v128_xor(c, chunk_c); + d = v128_xor(d, chunk_d); + + let saves = [a, b, c, d]; + + for _ in 0..8 { + b = v128_xor(b, u32x4_rol!(u32x4_add(a, d), 7)); + c = v128_xor(c, u32x4_rol!(u32x4_add(b, a), 9)); + d = v128_xor(d, u32x4_rol!(u32x4_add(c, b), 13)); + a = v128_xor(a, u32x4_rol!(u32x4_add(d, c), 18)); + + d = i32x4_shuffle::<1, 2, 3, 0>(d, d); + c = i32x4_shuffle::<2, 3, 0, 1>(c, c); + b = i32x4_shuffle::<3, 0, 1, 2>(b, b); + + (b, d) = (d, b); + } + + a = u32x4_add(a, saves[0]); + b = u32x4_add(b, saves[1]); + c = u32x4_add(c, saves[2]); + d = u32x4_add(d, saves[3]); + + v128_store(output.as_mut_ptr().add(pos).cast(), a); + v128_store(output.as_mut_ptr().add(pos + 16).cast(), b); + v128_store(output.as_mut_ptr().add(pos + 32).cast(), c); + v128_store(output.as_mut_ptr().add(pos + 48).cast(), d); + } + } +} diff --git a/scrypt/src/block_mix/soft.rs b/scrypt/src/block_mix/soft.rs new file mode 100644 index 00000000..9dbdd984 --- /dev/null +++ b/scrypt/src/block_mix/soft.rs @@ -0,0 +1,42 @@ +/// Execute the BlockMix operation +/// input - the input vector. The length must be a multiple of 128. +/// output - the output vector. Must be the same length as input. +pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) { + use salsa20::{ + SalsaCore, + cipher::{StreamCipherCore, typenum::U4}, + }; + + type Salsa20_8 = SalsaCore; + + let mut x = [0u8; 64]; + x.copy_from_slice(&input[input.len() - 64..]); + + let mut t = [0u8; 64]; + + for (i, chunk) in input.chunks(64).enumerate() { + xor(&x, chunk, &mut t); + + let mut t2 = [0u32; 16]; + + for (c, b) in t.chunks_exact(4).zip(t2.iter_mut()) { + *b = u32::from_le_bytes(c.try_into().unwrap()); + } + + Salsa20_8::from_raw_state(t2).write_keystream_block((&mut x).into()); + + let pos = if i % 2 == 0 { + (i / 2) * 64 + } else { + (i / 2) * 64 + input.len() / 2 + }; + + output[pos..pos + 64].copy_from_slice(&x); + } +} + +fn xor(x: &[u8], y: &[u8], output: &mut [u8]) { + for ((out, &x_i), &y_i) in output.iter_mut().zip(x.iter()).zip(y.iter()) { + *out = x_i ^ y_i; + } +} diff --git a/scrypt/src/block_mix/sse2.rs b/scrypt/src/block_mix/sse2.rs new file mode 100644 index 00000000..1daebe57 --- /dev/null +++ b/scrypt/src/block_mix/sse2.rs @@ -0,0 +1,90 @@ +use crate::block_mix::pivot::{INVERSE_PIVOT_ABCD, PIVOT_ABCD}; + +pub(crate) fn shuffle_in(b: &mut [u8]) { + for chunk in b.chunks_exact_mut(64) { + let mut t = [0u32; 16]; + for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) { + *b = u32::from_ne_bytes(c.try_into().unwrap()); + } + chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| { + b.copy_from_slice(&t[PIVOT_ABCD[i]].to_ne_bytes()); + }); + } +} + +pub(crate) fn shuffle_out(b: &mut [u8]) { + for chunk in b.chunks_exact_mut(64) { + let mut t = [0u32; 16]; + for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) { + *b = u32::from_ne_bytes(c.try_into().unwrap()); + } + chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| { + b.copy_from_slice(&t[INVERSE_PIVOT_ABCD[i]].to_ne_bytes()); + }); + } +} + +pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + macro_rules! mm_rol_epi32x { + ($w:expr, $amt:literal) => {{ + let w = $w; + _mm_or_si128(_mm_slli_epi32(w, $amt), _mm_srli_epi32(w, 32 - $amt)) + }}; + } + + let last_block = &input[input.len() - 64..]; + + let mut a = unsafe { _mm_loadu_si128(last_block.as_ptr().cast()) }; + let mut b = unsafe { _mm_loadu_si128(last_block.as_ptr().add(16).cast()) }; + let mut c = unsafe { _mm_loadu_si128(last_block.as_ptr().add(32).cast()) }; + let mut d = unsafe { _mm_loadu_si128(last_block.as_ptr().add(48).cast()) }; + + for (i, chunk) in input.chunks(64).enumerate() { + let pos = if i % 2 == 0 { + (i / 2) * 64 + } else { + (i / 2) * 64 + input.len() / 2 + }; + + unsafe { + a = _mm_xor_si128(a, _mm_loadu_si128(chunk.as_ptr().cast())); + b = _mm_xor_si128(b, _mm_loadu_si128(chunk.as_ptr().add(16).cast())); + c = _mm_xor_si128(c, _mm_loadu_si128(chunk.as_ptr().add(32).cast())); + d = _mm_xor_si128(d, _mm_loadu_si128(chunk.as_ptr().add(48).cast())); + + let saves = [a, b, c, d]; + + for _ in 0..8 { + b = _mm_xor_si128(b, mm_rol_epi32x!(_mm_add_epi32(a, d), 7)); + c = _mm_xor_si128(c, mm_rol_epi32x!(_mm_add_epi32(b, a), 9)); + d = _mm_xor_si128(d, mm_rol_epi32x!(_mm_add_epi32(c, b), 13)); + a = _mm_xor_si128(a, mm_rol_epi32x!(_mm_add_epi32(d, c), 18)); + + // a stays in place + // b = left shuffle d by 1 element + d = _mm_shuffle_epi32(d, 0b00_11_10_01); + // c = left shuffle c by 2 elements + c = _mm_shuffle_epi32(c, 0b01_00_11_10); + // d = left shuffle b by 3 elements + b = _mm_shuffle_epi32(b, 0b10_01_00_11); + (b, d) = (d, b); + } + + a = _mm_add_epi32(a, saves[0]); + b = _mm_add_epi32(b, saves[1]); + c = _mm_add_epi32(c, saves[2]); + d = _mm_add_epi32(d, saves[3]); + + _mm_storeu_si128(output.as_mut_ptr().add(pos).cast(), a); + _mm_storeu_si128(output.as_mut_ptr().add(pos + 16).cast(), b); + _mm_storeu_si128(output.as_mut_ptr().add(pos + 32).cast(), c); + _mm_storeu_si128(output.as_mut_ptr().add(pos + 48).cast(), d); + } + } +} diff --git a/scrypt/src/lib.rs b/scrypt/src/lib.rs index 7c5fff80..dd426499 100644 --- a/scrypt/src/lib.rs +++ b/scrypt/src/lib.rs @@ -55,6 +55,7 @@ extern crate alloc; use pbkdf2::pbkdf2_hmac; use sha2::Sha256; +mod block_mix; /// Errors for `scrypt` operations. pub mod errors; mod params; diff --git a/scrypt/src/romix.rs b/scrypt/src/romix.rs index 1187de58..1be655dc 100644 --- a/scrypt/src/romix.rs +++ b/scrypt/src/romix.rs @@ -1,34 +1,3 @@ -#[cfg(any( - target_arch = "x86", - target_arch = "x86_64", - all(target_arch = "wasm32", target_feature = "simd128") -))] -/// Permute Salsa20 block to column major order -const PIVOT_ABCD: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11]; - -#[cfg(any( - target_arch = "x86", - target_arch = "x86_64", - all(target_arch = "wasm32", target_feature = "simd128") -))] -/// Inverse of PIVOT_ABCD -const INVERSE_PIVOT_ABCD: [usize; 16] = const { - let mut index = [0; 16]; - let mut i = 0; - while i < 16 { - let mut inverse = 0; - while inverse < 16 { - if PIVOT_ABCD[inverse] == i { - index[i] = inverse; - break; - } - inverse += 1; - } - i += 1; - } - index -}; - /// Execute the ROMix operation in-place. /// b - the data to operate on /// v - a temporary variable to store the vector V @@ -49,249 +18,21 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize) let len = b.len(); - #[cfg(any( - target_arch = "x86", - target_arch = "x86_64", - all(target_arch = "wasm32", target_feature = "simd128") - ))] - for chunk in b.chunks_exact_mut(64) { - let mut t = [0u32; 16]; - for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) { - *b = u32::from_ne_bytes(c.try_into().unwrap()); - } - chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| { - b.copy_from_slice(&t[PIVOT_ABCD[i]].to_ne_bytes()); - }); - } + crate::block_mix::shuffle_in(b); for chunk in v.chunks_mut(len) { chunk.copy_from_slice(b); - - #[cfg(any( - target_arch = "x86", - target_arch = "x86_64", - all(target_arch = "wasm32", target_feature = "simd128") - ))] - scrypt_block_mix_abcd(chunk, b); - - #[cfg(not(any( - target_arch = "x86", - target_arch = "x86_64", - all(target_arch = "wasm32", target_feature = "simd128") - )))] - scrypt_block_mix(chunk, b); + crate::block_mix::scrypt_block_mix(chunk, b); } for _ in 0..n { let j = integerify(b, n); xor(b, &v[j * len..(j + 1) * len], t); - #[cfg(any( - target_arch = "x86", - target_arch = "x86_64", - all(target_arch = "wasm32", target_feature = "simd128") - ))] - scrypt_block_mix_abcd(t, b); - - #[cfg(not(any( - target_arch = "x86", - target_arch = "x86_64", - all(target_arch = "wasm32", target_feature = "simd128") - )))] - scrypt_block_mix(t, b); - } - - #[cfg(any( - target_arch = "x86", - target_arch = "x86_64", - all(target_arch = "wasm32", target_feature = "simd128") - ))] - for chunk in b.chunks_exact_mut(64) { - let mut t = [0u32; 16]; - for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) { - *b = u32::from_ne_bytes(c.try_into().unwrap()); - } - chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| { - b.copy_from_slice(&t[INVERSE_PIVOT_ABCD[i]].to_ne_bytes()); - }); - } -} - -/// Execute the BlockMix operation -/// input - the input vector. The length must be a multiple of 128. -/// output - the output vector. Must be the same length as input. -#[cfg(not(any( - target_arch = "x86", - target_arch = "x86_64", - all(target_arch = "wasm32", target_feature = "simd128") -)))] -fn scrypt_block_mix(input: &[u8], output: &mut [u8]) { - use salsa20::{ - SalsaCore, - cipher::{StreamCipherCore, typenum::U4}, - }; - - type Salsa20_8 = SalsaCore; - - let mut x = [0u8; 64]; - x.copy_from_slice(&input[input.len() - 64..]); - - let mut t = [0u8; 64]; - - for (i, chunk) in input.chunks(64).enumerate() { - xor(&x, chunk, &mut t); - - let mut t2 = [0u32; 16]; - - for (c, b) in t.chunks_exact(4).zip(t2.iter_mut()) { - *b = u32::from_le_bytes(c.try_into().unwrap()); - } - - Salsa20_8::from_raw_state(t2).write_keystream_block((&mut x).into()); - - let pos = if i % 2 == 0 { - (i / 2) * 64 - } else { - (i / 2) * 64 + input.len() / 2 - }; - - output[pos..pos + 64].copy_from_slice(&x); - } -} - -/// Execute the BlockMix operation with pre-shuffled input. -/// input - the input vector. The length must be a multiple of 128. -/// output - the output vector. Must be the same length as input. -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn scrypt_block_mix_abcd(input: &[u8], output: &mut [u8]) { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - macro_rules! mm_rol_epi32x { - ($w:expr, $amt:literal) => {{ - let w = $w; - _mm_or_si128(_mm_slli_epi32(w, $amt), _mm_srli_epi32(w, 32 - $amt)) - }}; + crate::block_mix::scrypt_block_mix(t, b); } - let last_block = &input[input.len() - 64..]; - - let mut a = unsafe { _mm_loadu_si128(last_block.as_ptr().cast()) }; - let mut b = unsafe { _mm_loadu_si128(last_block.as_ptr().add(16).cast()) }; - let mut c = unsafe { _mm_loadu_si128(last_block.as_ptr().add(32).cast()) }; - let mut d = unsafe { _mm_loadu_si128(last_block.as_ptr().add(48).cast()) }; - - for (i, chunk) in input.chunks(64).enumerate() { - let pos = if i % 2 == 0 { - (i / 2) * 64 - } else { - (i / 2) * 64 + input.len() / 2 - }; - - unsafe { - a = _mm_xor_si128(a, _mm_loadu_si128(chunk.as_ptr().cast())); - b = _mm_xor_si128(b, _mm_loadu_si128(chunk.as_ptr().add(16).cast())); - c = _mm_xor_si128(c, _mm_loadu_si128(chunk.as_ptr().add(32).cast())); - d = _mm_xor_si128(d, _mm_loadu_si128(chunk.as_ptr().add(48).cast())); - - let saves = [a, b, c, d]; - - for _ in 0..8 { - b = _mm_xor_si128(b, mm_rol_epi32x!(_mm_add_epi32(a, d), 7)); - c = _mm_xor_si128(c, mm_rol_epi32x!(_mm_add_epi32(b, a), 9)); - d = _mm_xor_si128(d, mm_rol_epi32x!(_mm_add_epi32(c, b), 13)); - a = _mm_xor_si128(a, mm_rol_epi32x!(_mm_add_epi32(d, c), 18)); - - // a stays in place - // b = left shuffle d by 1 element - d = _mm_shuffle_epi32(d, 0b00_11_10_01); - // c = left shuffle c by 2 elements - c = _mm_shuffle_epi32(c, 0b01_00_11_10); - // d = left shuffle b by 3 elements - b = _mm_shuffle_epi32(b, 0b10_01_00_11); - (b, d) = (d, b); - } - - a = _mm_add_epi32(a, saves[0]); - b = _mm_add_epi32(b, saves[1]); - c = _mm_add_epi32(c, saves[2]); - d = _mm_add_epi32(d, saves[3]); - - _mm_storeu_si128(output.as_mut_ptr().add(pos).cast(), a); - _mm_storeu_si128(output.as_mut_ptr().add(pos + 16).cast(), b); - _mm_storeu_si128(output.as_mut_ptr().add(pos + 32).cast(), c); - _mm_storeu_si128(output.as_mut_ptr().add(pos + 48).cast(), d); - } - } -} - -/// Execute the BlockMix operation with pre-shuffled input. -/// input - the input vector. The length must be a multiple of 128. -/// output - the output vector. Must be the same length as input. -#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] -fn scrypt_block_mix_abcd(input: &[u8], output: &mut [u8]) { - use core::arch::wasm32::*; - - macro_rules! u32x4_rol { - ($x:expr, $amt:literal) => { - v128_or(u32x4_shl($x, $amt), u32x4_shr($x, 32 - $amt)) - }; - } - - let last_block = &input[input.len() - 64..]; - - let mut a = unsafe { v128_load(last_block.as_ptr().cast()) }; - let mut b = unsafe { v128_load(last_block.as_ptr().add(16).cast()) }; - let mut c = unsafe { v128_load(last_block.as_ptr().add(32).cast()) }; - let mut d = unsafe { v128_load(last_block.as_ptr().add(48).cast()) }; - - for (i, chunk) in input.chunks(64).enumerate() { - let pos = if i % 2 == 0 { - (i / 2) * 64 - } else { - (i / 2) * 64 + input.len() / 2 - }; - - unsafe { - let chunk_a = v128_load(chunk.as_ptr().cast()); - let chunk_b = v128_load(chunk.as_ptr().add(16).cast()); - let chunk_c = v128_load(chunk.as_ptr().add(32).cast()); - let chunk_d = v128_load(chunk.as_ptr().add(48).cast()); - - a = v128_xor(a, chunk_a); - b = v128_xor(b, chunk_b); - c = v128_xor(c, chunk_c); - d = v128_xor(d, chunk_d); - - let saves = [a, b, c, d]; - - for _ in 0..8 { - b = v128_xor(b, u32x4_rol!(u32x4_add(a, d), 7)); - c = v128_xor(c, u32x4_rol!(u32x4_add(b, a), 9)); - d = v128_xor(d, u32x4_rol!(u32x4_add(c, b), 13)); - a = v128_xor(a, u32x4_rol!(u32x4_add(d, c), 18)); - - d = i32x4_shuffle::<1, 2, 3, 0>(d, d); - c = i32x4_shuffle::<2, 3, 0, 1>(c, c); - b = i32x4_shuffle::<3, 0, 1, 2>(b, b); - - (b, d) = (d, b); - } - - a = u32x4_add(a, saves[0]); - b = u32x4_add(b, saves[1]); - c = u32x4_add(c, saves[2]); - d = u32x4_add(d, saves[3]); - - v128_store(output.as_mut_ptr().add(pos).cast(), a); - v128_store(output.as_mut_ptr().add(pos + 16).cast(), b); - v128_store(output.as_mut_ptr().add(pos + 32).cast(), c); - v128_store(output.as_mut_ptr().add(pos + 48).cast(), d); - } - } + crate::block_mix::shuffle_out(b); } fn xor(x: &[u8], y: &[u8], output: &mut [u8]) { From 94bef2fcf8e5945739ed9e445854be3202a4c236 Mon Sep 17 00:00:00 2001 From: eternal-flame-AD Date: Fri, 1 Aug 2025 08:06:38 -0500 Subject: [PATCH 6/8] apply suggestions Signed-off-by: eternal-flame-AD --- scrypt/src/{block_mix/mod.rs => block_mix.rs} | 22 ++++++------------- 1 file changed, 7 insertions(+), 15 deletions(-) rename scrypt/src/{block_mix/mod.rs => block_mix.rs} (81%) diff --git a/scrypt/src/block_mix/mod.rs b/scrypt/src/block_mix.rs similarity index 81% rename from scrypt/src/block_mix/mod.rs rename to scrypt/src/block_mix.rs index abd55930..968a15e6 100644 --- a/scrypt/src/block_mix/mod.rs +++ b/scrypt/src/block_mix.rs @@ -1,16 +1,3 @@ -#[cfg(any( - test, - not(any( - all( - any(target_arch = "x86", target_arch = "x86_64"), - target_feature = "sse2" - ), - all(target_arch = "aarch64", target_feature = "neon"), - all(target_arch = "wasm32", target_feature = "simd128"), - )) -))] -mod soft; - cfg_if::cfg_if! { if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { mod pivot; @@ -25,6 +12,7 @@ cfg_if::cfg_if! { mod sse2; pub(crate) use sse2::{scrypt_block_mix, shuffle_in, shuffle_out}; } else { + mod soft; pub(crate) use soft::scrypt_block_mix; pub(crate) fn shuffle_in(_input: &mut [u8]) {} @@ -32,6 +20,10 @@ cfg_if::cfg_if! { } } +#[cfg(test)] +#[path = "block_mix/soft.rs"] +mod soft_test; + #[cfg(test)] mod tests { use super::*; @@ -44,11 +36,11 @@ mod tests { let mut expected0 = [0u8; 128]; let mut expected1 = [0u8; 128]; // check shuffle_out is a correct inverse of shuffle_in - soft::scrypt_block_mix(&input, &mut expected0); + soft_test::scrypt_block_mix(&input, &mut expected0); shuffle_in(&mut input); scrypt_block_mix(&input, &mut output); shuffle_out(&mut input); - soft::scrypt_block_mix(&input, &mut expected1); + soft_test::scrypt_block_mix(&input, &mut expected1); shuffle_out(&mut output); assert_eq!( expected0, expected1, From c05a6d4885ff14b4b426f30397d8533d56fa61ec Mon Sep 17 00:00:00 2001 From: eternal-flame-AD Date: Fri, 1 Aug 2025 12:25:38 -0500 Subject: [PATCH 7/8] neon: use multiple-register load/store Signed-off-by: eternal-flame-AD --- scrypt/src/block_mix/neon.rs | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/scrypt/src/block_mix/neon.rs b/scrypt/src/block_mix/neon.rs index e1c0a319..b5531f54 100644 --- a/scrypt/src/block_mix/neon.rs +++ b/scrypt/src/block_mix/neon.rs @@ -36,10 +36,7 @@ pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) { let last_block = &input[input.len() - 64..]; - let mut a = unsafe { vld1q_u32(last_block.as_ptr().cast()) }; - let mut b = unsafe { vld1q_u32(last_block.as_ptr().add(16).cast()) }; - let mut c = unsafe { vld1q_u32(last_block.as_ptr().add(32).cast()) }; - let mut d = unsafe { vld1q_u32(last_block.as_ptr().add(48).cast()) }; + let mut x = unsafe { vld1q_u32_x4(last_block.as_ptr().cast()) }; for (i, chunk) in input.chunks(64).enumerate() { let pos = if i % 2 == 0 { @@ -49,17 +46,17 @@ pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) { }; unsafe { - let chunk_a = vld1q_u32(chunk.as_ptr().cast()); - let chunk_b = vld1q_u32(chunk.as_ptr().add(16).cast()); - let chunk_c = vld1q_u32(chunk.as_ptr().add(32).cast()); - let chunk_d = vld1q_u32(chunk.as_ptr().add(48).cast()); + let chunk = vld1q_u32_x4(chunk.as_ptr().cast()); - a = veorq_u32(a, chunk_a); - b = veorq_u32(b, chunk_b); - c = veorq_u32(c, chunk_c); - d = veorq_u32(d, chunk_d); + x.0 = veorq_u32(x.0, chunk.0); + x.1 = veorq_u32(x.1, chunk.1); + x.2 = veorq_u32(x.2, chunk.2); + x.3 = veorq_u32(x.3, chunk.3); - let saves = [a, b, c, d]; + let mut a = x.0; + let mut b = x.1; + let mut c = x.2; + let mut d = x.3; for _ in 0..8 { b = veorq_u32(b, vrol_u32!(vaddq_u32(a, d), 7)); @@ -74,15 +71,12 @@ pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) { (b, d) = (d, b); } - a = vaddq_u32(a, saves[0]); - b = vaddq_u32(b, saves[1]); - c = vaddq_u32(c, saves[2]); - d = vaddq_u32(d, saves[3]); + x.0 = vaddq_u32(x.0, a); + x.1 = vaddq_u32(x.1, b); + x.2 = vaddq_u32(x.2, c); + x.3 = vaddq_u32(x.3, d); - vst1q_u32(output.as_mut_ptr().add(pos).cast(), a); - vst1q_u32(output.as_mut_ptr().add(pos + 16).cast(), b); - vst1q_u32(output.as_mut_ptr().add(pos + 32).cast(), c); - vst1q_u32(output.as_mut_ptr().add(pos + 48).cast(), d); + vst1q_u32_x4(output.as_mut_ptr().add(pos).cast(), x); } } } From aea4297697c5432d5ae575ddb6b97ed3a8e6ba13 Mon Sep 17 00:00:00 2001 From: eternal-flame-AD Date: Fri, 1 Aug 2025 14:55:44 -0500 Subject: [PATCH 8/8] remove neon backend for performance regression Signed-off-by: eternal-flame-AD --- scrypt/src/block_mix.rs | 6 +-- scrypt/src/block_mix/neon.rs | 82 ------------------------------------ 2 files changed, 1 insertion(+), 87 deletions(-) delete mode 100644 scrypt/src/block_mix/neon.rs diff --git a/scrypt/src/block_mix.rs b/scrypt/src/block_mix.rs index 968a15e6..66082888 100644 --- a/scrypt/src/block_mix.rs +++ b/scrypt/src/block_mix.rs @@ -1,9 +1,5 @@ cfg_if::cfg_if! { - if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - mod pivot; - mod neon; - pub(crate) use neon::{scrypt_block_mix, shuffle_in, shuffle_out}; - } else if #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] { + if #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] { mod pivot; mod simd128; pub(crate) use simd128::{scrypt_block_mix, shuffle_in, shuffle_out}; diff --git a/scrypt/src/block_mix/neon.rs b/scrypt/src/block_mix/neon.rs deleted file mode 100644 index b5531f54..00000000 --- a/scrypt/src/block_mix/neon.rs +++ /dev/null @@ -1,82 +0,0 @@ -use crate::block_mix::pivot::{INVERSE_PIVOT_ABCD, PIVOT_ABCD}; - -pub(crate) fn shuffle_in(b: &mut [u8]) { - for chunk in b.chunks_exact_mut(64) { - let mut t = [0u32; 16]; - for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) { - *b = u32::from_ne_bytes(c.try_into().unwrap()); - } - chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| { - b.copy_from_slice(&t[PIVOT_ABCD[i]].to_ne_bytes()); - }); - } -} - -pub(crate) fn shuffle_out(b: &mut [u8]) { - for chunk in b.chunks_exact_mut(64) { - let mut t = [0u32; 16]; - for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) { - *b = u32::from_ne_bytes(c.try_into().unwrap()); - } - chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| { - b.copy_from_slice(&t[INVERSE_PIVOT_ABCD[i]].to_ne_bytes()); - }); - } -} - -pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) { - use core::arch::aarch64::*; - - macro_rules! vrol_u32 { - ($w:expr, $amt:literal) => {{ - let w = $w; - vsraq_n_u32(vshlq_n_u32(w, $amt), w, 32 - $amt) - }}; - } - - let last_block = &input[input.len() - 64..]; - - let mut x = unsafe { vld1q_u32_x4(last_block.as_ptr().cast()) }; - - for (i, chunk) in input.chunks(64).enumerate() { - let pos = if i % 2 == 0 { - (i / 2) * 64 - } else { - (i / 2) * 64 + input.len() / 2 - }; - - unsafe { - let chunk = vld1q_u32_x4(chunk.as_ptr().cast()); - - x.0 = veorq_u32(x.0, chunk.0); - x.1 = veorq_u32(x.1, chunk.1); - x.2 = veorq_u32(x.2, chunk.2); - x.3 = veorq_u32(x.3, chunk.3); - - let mut a = x.0; - let mut b = x.1; - let mut c = x.2; - let mut d = x.3; - - for _ in 0..8 { - b = veorq_u32(b, vrol_u32!(vaddq_u32(a, d), 7)); - c = veorq_u32(c, vrol_u32!(vaddq_u32(b, a), 9)); - d = veorq_u32(d, vrol_u32!(vaddq_u32(c, b), 13)); - a = veorq_u32(a, vrol_u32!(vaddq_u32(d, c), 18)); - - d = vextq_u32(d, d, 1); - c = vextq_u32(c, c, 2); - b = vextq_u32(b, b, 3); - - (b, d) = (d, b); - } - - x.0 = vaddq_u32(x.0, a); - x.1 = vaddq_u32(x.1, b); - x.2 = vaddq_u32(x.2, c); - x.3 = vaddq_u32(x.3, d); - - vst1q_u32_x4(output.as_mut_ptr().add(pos).cast(), x); - } - } -}