4 changes: 4 additions & 0 deletions .github/workflows/checks.yaml
@@ -480,6 +480,10 @@ jobs:
run: "cargo +nightly miri nextest run -j4 -p test-libz-rs-sys --target ${{ matrix.target }} null::"
env:
RUSTFLAGS: "-Ctarget-feature=+avx2,+bmi2,+bmi1"
- name: Test avx512 crc32 implementation
run: "cargo +nightly miri nextest run -j4 -p zlib-rs --target ${{ matrix.target }} --features=vpclmulqdq crc32::"
env:
RUSTFLAGS: "-Ctarget-feature=+vpclmulqdq,+avx512f"
- name: Test allocator with miri
run: "cargo +nightly miri nextest run -j4 -p zlib-rs --target ${{ matrix.target }} allocate::"
- name: Test gz logic with miri
3 changes: 2 additions & 1 deletion Cargo.toml
@@ -33,7 +33,8 @@ opt-level = 1 # required for the tail calls in inflate to optimize

[workspace.dependencies]
libloading = "0.8.1"
libz-sys = { version = "1.1.21", default-features = false, features = ["zlib-ng"] } # use libz-ng in libz compat mode
# FIXME: later libz-sys versions change the output of medium compression slightly.
libz-sys = { version = "=1.1.21", default-features = false, features = ["zlib-ng"] } # use libz-ng in libz compat mode
arbitrary = { version = "1.0" }
quickcheck = { version = "1.0.3", default-features = false, features = [] }

2 changes: 1 addition & 1 deletion fuzz/Cargo.toml
@@ -24,7 +24,7 @@ features = ["arbitrary-derive"]

[dependencies]
libc = "0.2.151"
libz-ng-sys = "1.1.21"
libz-ng-sys = "=1.1.21"
libloading = "0.8.1"
crc32fast = "1.3.2"
rstest = "0.23.0"
1 change: 1 addition & 0 deletions zlib-rs/Cargo.toml
@@ -24,6 +24,7 @@ __internal-fuzz = ["arbitrary"]
__internal-fuzz-disable-checksum = [] # disable checksum validation on inflate
__internal-test = ["quickcheck"]
ZLIB_DEBUG = []
vpclmulqdq = [] # use avx512 to speed up crc32. Only stable from 1.89.0 onwards


[dependencies]
3 changes: 3 additions & 0 deletions zlib-rs/src/crc32.rs
@@ -6,6 +6,9 @@ mod braid;
mod combine;
#[cfg(target_arch = "x86_64")]
mod pclmulqdq;
#[cfg(target_arch = "x86_64")]
#[cfg(feature = "vpclmulqdq")]
mod vpclmulqdq;

pub use combine::crc32_combine;

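The new module is compiled only for `x86_64` and only when the opt-in `vpclmulqdq` Cargo feature is enabled; the call site added in `pclmulqdq.rs` below additionally requires the `vpclmulqdq` and `avx512f` target features, so selection is entirely a compile-time decision. As a hedged sketch (not part of this PR, assuming `std` is available, and using the made-up name `wide_crc_available`), a runtime-dispatch alternative in the spirit of the previously commented-out code would look roughly like this:

```rust
// Illustrative only: runtime detection as an alternative to the compile-time
// cfg gating this PR actually uses. `wide_crc_available` is not part of zlib-rs.
#[cfg(all(target_arch = "x86_64", feature = "vpclmulqdq"))]
fn wide_crc_available() -> bool {
    // Both CPU features must be present before calling the AVX-512 fold routines.
    std::arch::is_x86_feature_detected!("vpclmulqdq")
        && std::arch::is_x86_feature_detected!("avx512f")
}

#[cfg(not(all(target_arch = "x86_64", feature = "vpclmulqdq")))]
fn wide_crc_available() -> bool {
    false
}
```

Compile-time gating avoids any runtime check in the hot loop, but it means the wide path only exists in binaries built with `-Ctarget-feature=+vpclmulqdq,+avx512f`, which is exactly what the new CI step above sets.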
51 changes: 24 additions & 27 deletions zlib-rs/src/crc32/pclmulqdq.rs
@@ -1,8 +1,7 @@
use core::arch::x86_64::__m128i;
use core::arch::x86_64::{
_mm_and_si128, _mm_clmulepi64_si128, _mm_extract_epi32, _mm_load_si128, _mm_loadu_si128,
_mm_or_si128, _mm_shuffle_epi8, _mm_slli_si128, _mm_srli_si128, _mm_storeu_si128,
_mm_xor_si128,
__m128i, _mm_and_si128, _mm_clmulepi64_si128, _mm_extract_epi32, _mm_load_si128,
_mm_loadu_si128, _mm_or_si128, _mm_shuffle_epi8, _mm_slli_si128, _mm_srli_si128,
_mm_storeu_si128, _mm_xor_si128,
};

use crate::CRC32_INITIAL_VALUE;
@@ -24,7 +23,7 @@ const fn reg(input: [u32; 4]) -> __m128i {
#[derive(Debug, Clone, Copy)]
#[cfg(target_arch = "x86_64")]
pub(crate) struct Accumulator {
fold: [__m128i; 4],
pub(super) fold: [__m128i; 4],
}

#[cfg(target_arch = "x86_64")]
@@ -249,21 +248,22 @@ impl Accumulator {
// bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
// carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
// by definition can be up to 15 bytes + one full vector load. */
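// Worked example (added for clarity): if `src` starts one byte past a 16-byte
// boundary, the aligning load needs 16 - 1 = 15 bytes, and carrying an initial
// CRC forward needs one more full 16-byte vector, so up to 15 + 16 = 31 bytes
// must be available. That is the `src.len() >= 31` bound asserted below.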
assert!(src.len() >= 31 || init_crc == CRC32_INITIAL_VALUE);
let xmm_initial = reg([init_crc, 0, 0, 0]);
let first = init_crc != CRC32_INITIAL_VALUE;
assert!(src.len() >= 31 || !first);

if COPY {
assert_eq!(dst.len(), src.len(), "dst and src must be the same length")
}

if src.len() < 16 {
if COPY {
if src.is_empty() {
return;
}
if src.is_empty() {
return;
}

partial_buf.0[..src.len()].copy_from_slice(src);
xmm_crc_part =
unsafe { _mm_load_si128(partial_buf.0.as_mut_ptr() as *mut __m128i) };
partial_buf.0[..src.len()].copy_from_slice(src);
xmm_crc_part = unsafe { _mm_load_si128(partial_buf.0.as_mut_ptr() as *mut __m128i) };
if COPY {
dst[..src.len()].copy_from_slice(&partial_buf.0[..src.len()]);
}
} else {
@@ -280,7 +280,6 @@ impl Accumulator {
let is_initial = init_crc == CRC32_INITIAL_VALUE;

if !is_initial {
let xmm_initial = reg([init_crc, 0, 0, 0]);
xmm_crc_part = unsafe { _mm_xor_si128(xmm_crc_part, xmm_initial) };
init_crc = CRC32_INITIAL_VALUE;
}
@@ -302,19 +301,17 @@ impl Accumulator {
src = &src[before.len()..];
}

// if is_x86_feature_detected!("vpclmulqdq") {
// if src.len() >= 256 {
// if COPY {
// // size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
// // dst += n;
// } else {
// // size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len, xmm_initial, first);
// // first = false;
// }
// // len -= n;
// // src += n;
// }
// }
#[cfg(feature = "vpclmulqdq")]
#[cfg(all(target_feature = "vpclmulqdq", target_feature = "avx512f"))]
if src.len() >= 256 {
let n;
if COPY {
n = unsafe { self.fold_16_vpclmulqdq_copy(dst, &mut src) };
dst = &mut dst[n..];
} else {
unsafe { self.fold_16_vpclmulqdq(dst, &mut src, &mut init_crc) };
}
}

while src.len() >= 64 {
let n = unsafe { self.progress::<4, COPY>(dst, &mut src, &mut init_crc) };
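The replacement above turns the old commented-out sketch into a real call: inputs of at least 256 bytes are first consumed by the AVX-512 fold-16 routine, and whatever remains falls through to the existing 64-byte folds and the partial-buffer handling. As a hedged illustration (not from the PR; `fold_plan` is a made-up name, and the single-lane 16-byte stage is assumed from the surrounding code), the way an input length is carved up across the stages looks like this:

```rust
// Illustrative only: shows how many bytes each folding stage consumes.
// The real code folds CRC state with carry-less multiplies; this sketch
// only mirrors the control flow of the loop structure above.
fn fold_plan(mut len: usize) -> (usize, usize, usize, usize) {
    let by_256 = (len / 256) * 256; // fold_16_vpclmulqdq: sixteen 16-byte lanes per iteration
    len -= by_256;
    let by_64 = (len / 64) * 64; // progress::<4, COPY>: four 16-byte lanes per iteration
    len -= by_64;
    let by_16 = (len / 16) * 16; // assumed single-lane stage (progress::<1, COPY>)
    len -= by_16;
    (by_256, by_64, by_16, len) // the final `len` bytes go through the partial buffer
}

fn main() {
    // 1000 bytes: three wide folds, three four-lane folds, two single-lane folds, 8-byte tail.
    assert_eq!(fold_plan(1000), (768, 192, 32, 8));
}
```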
136 changes: 136 additions & 0 deletions zlib-rs/src/crc32/vpclmulqdq.rs
@@ -0,0 +1,136 @@
use crate::CRC32_INITIAL_VALUE;
use core::arch::x86_64::{
__m512i, _mm512_clmulepi64_epi128, _mm512_extracti32x4_epi32, _mm512_inserti32x4,
_mm512_loadu_si512, _mm512_set4_epi32, _mm512_setzero_si512, _mm512_storeu_si512,
_mm512_ternarylogic_epi32, _mm512_xor_si512, _mm512_zextsi128_si512, _mm_cvtsi32_si128,
};

impl super::pclmulqdq::Accumulator {
#[target_feature(enable = "vpclmulqdq", enable = "avx512f")]
pub(super) unsafe fn fold_16_vpclmulqdq(
&mut self,
dst: &mut [u8],
src: &mut &[u8],
init_crc: &mut u32,
) -> usize {
unsafe { self.fold_help_vpclmulqdq::<false>(dst, src, init_crc) }
}

#[target_feature(enable = "vpclmulqdq", enable = "avx512f")]
pub(super) unsafe fn fold_16_vpclmulqdq_copy(
&mut self,
dst: &mut [u8],
src: &mut &[u8],
) -> usize {
let mut init_crc = CRC32_INITIAL_VALUE;
unsafe { self.fold_help_vpclmulqdq::<true>(dst, src, &mut init_crc) }
}

#[target_feature(enable = "vpclmulqdq", enable = "avx512f")]
unsafe fn fold_help_vpclmulqdq<const COPY: bool>(
&mut self,
mut dst: &mut [u8],
src: &mut &[u8],
init_crc: &mut u32,
) -> usize {
let [xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3] = &mut self.fold;
let start_len = src.len();

unsafe {
let zmm_fold4 =
_mm512_set4_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596u32 as i32);
let zmm_fold16 = _mm512_set4_epi32(0x00000001, 0x1542778a, 0x00000001, 0x322d1430);

// zmm register init
let zmm_crc0 = _mm512_setzero_si512();
let mut zmm_t0 = _mm512_loadu_si512(src.as_ptr().cast::<__m512i>());

if !COPY && *init_crc != CRC32_INITIAL_VALUE {
let xmm_initial = _mm_cvtsi32_si128(*init_crc as i32);
let zmm_initial = _mm512_zextsi128_si512(xmm_initial);
zmm_t0 = _mm512_xor_si512(zmm_t0, zmm_initial);
*init_crc = CRC32_INITIAL_VALUE;
}

let mut zmm_crc1 = _mm512_loadu_si512(src.as_ptr().cast::<__m512i>().add(1));
let mut zmm_crc2 = _mm512_loadu_si512(src.as_ptr().cast::<__m512i>().add(2));
let mut zmm_crc3 = _mm512_loadu_si512(src.as_ptr().cast::<__m512i>().add(3));

/* already have intermediate CRC in xmm registers
* fold4 with 4 xmm_crc to get zmm_crc0
*/
let mut zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
let mut z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
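// Note: immediate 0x96 is the truth table of a three-way XOR, so this
// single instruction computes zmm_crc0 ^ z0 ^ zmm_t0.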

if COPY {
_mm512_storeu_si512(dst.as_mut_ptr().cast::<__m512i>(), zmm_t0);
_mm512_storeu_si512(dst.as_mut_ptr().cast::<__m512i>().add(1), zmm_crc1);
_mm512_storeu_si512(dst.as_mut_ptr().cast::<__m512i>().add(2), zmm_crc2);
_mm512_storeu_si512(dst.as_mut_ptr().cast::<__m512i>().add(3), zmm_crc3);
dst = &mut dst[256..];
}

*src = &src[256..];

// fold-16 loops
while src.len() >= 256 {
let zmm_t0 = _mm512_loadu_si512(src.as_ptr().cast::<__m512i>());
let zmm_t1 = _mm512_loadu_si512(src.as_ptr().cast::<__m512i>().add(1));
let zmm_t2 = _mm512_loadu_si512(src.as_ptr().cast::<__m512i>().add(2));
let zmm_t3 = _mm512_loadu_si512(src.as_ptr().cast::<__m512i>().add(3));

let z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
let z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
let z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
let z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);

zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);

zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
zmm_crc1 = _mm512_ternarylogic_epi32(zmm_crc1, z1, zmm_t1, 0x96);
zmm_crc2 = _mm512_ternarylogic_epi32(zmm_crc2, z2, zmm_t2, 0x96);
zmm_crc3 = _mm512_ternarylogic_epi32(zmm_crc3, z3, zmm_t3, 0x96);

if COPY {
_mm512_storeu_si512(dst.as_mut_ptr().cast::<__m512i>(), zmm_t0);
_mm512_storeu_si512(dst.as_mut_ptr().cast::<__m512i>().add(1), zmm_t1);
_mm512_storeu_si512(dst.as_mut_ptr().cast::<__m512i>().add(2), zmm_t2);
_mm512_storeu_si512(dst.as_mut_ptr().cast::<__m512i>().add(3), zmm_t3);
dst = &mut dst[256..];
}

*src = &src[256..];
}

// zmm_crc[0,1,2,3] -> zmm_crc0
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc1, 0x96);

z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc2, 0x96);

z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc3, 0x96);

// zmm_crc0 -> xmm_crc[0, 1, 2, 3]
*xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
*xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
*xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
*xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);

// return n bytes processed
start_len - src.len()
}
}
}
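A hedged sketch of how this path can be sanity-checked (not part of the PR): hash buffers long enough to reach the fold-16 loop and compare against an independent CRC-32 implementation. `crc32fast` already appears in the workspace's fuzz dependencies; the `zlib_rs::crc32::crc32(start, buf)` call and the starting value of 0 (the zlib convention) are assumptions and may not match the crate's actual public API.

```rust
// Hypothetical test, illustrative only. Build with the flags from the CI step:
//   RUSTFLAGS="-Ctarget-feature=+vpclmulqdq,+avx512f" \
//   cargo test -p zlib-rs --features vpclmulqdq
#[test]
fn wide_fold_matches_reference() {
    // Lengths chosen to exercise the 256-byte fold-16 loop, the 64- and
    // 16-byte folds, and the partial-buffer tail.
    for len in [0usize, 15, 31, 64, 255, 256, 257, 1024, 4099] {
        let data: Vec<u8> = (0..len).map(|i| (i * 7 + 1) as u8).collect();
        let expected = crc32fast::hash(&data); // independent reference CRC-32
        let got = zlib_rs::crc32::crc32(0, &data); // assumed signature
        assert_eq!(got, expected, "mismatch at len = {len}");
    }
}
```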