|
+use crate::CRC32_INITIAL_VALUE;
+use core::arch::x86_64::{
+    __m512i, _mm512_clmulepi64_epi128, _mm512_extracti32x4_epi32, _mm512_inserti32x4,
+    _mm512_loadu_si512, _mm512_set4_epi32, _mm512_setzero_si512, _mm512_storeu_si512,
+    _mm512_ternarylogic_epi32, _mm512_xor_si512, _mm512_zextsi128_si512, _mm_cvtsi32_si128,
+};
+
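+// CRC-32 folding via 512-bit carry-less multiplication (VPCLMULQDQ).
+//
+// The running state is four 128-bit lanes (`self.fold`). These helpers widen
+// it into 512-bit registers, fold input in 256-byte blocks, and narrow back
+// to the four xmm lanes so the existing PCLMULQDQ tail code can finish up.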
+impl super::pclmulqdq::Accumulator {
+    #[target_feature(enable = "vpclmulqdq", enable = "avx512f")]
+    pub(super) unsafe fn fold_16_vpclmulqdq(
+        &mut self,
+        dst: &mut [u8],
+        src: &mut &[u8],
+        init_crc: &mut u32,
+    ) -> usize {
+        unsafe { self.fold_help_vpclmulqdq::<false>(dst, src, init_crc) }
+    }
+
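+    // As above, but additionally copies each processed 256-byte block from
+    // `src` into `dst` (checksum-and-copy in a single pass). The CRC is
+    // assumed to start fresh, so no initial value is folded in.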
+    #[target_feature(enable = "vpclmulqdq", enable = "avx512f")]
+    pub(super) unsafe fn fold_16_vpclmulqdq_copy(
+        &mut self,
+        dst: &mut [u8],
+        src: &mut &[u8],
+    ) -> usize {
+        unsafe { self.fold_help_vpclmulqdq::<true>(dst, src, &mut CRC32_INITIAL_VALUE) }
+    }
+
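+    // Shared implementation; the `COPY` const generic selects between the
+    // checksum-only and checksum-plus-copy variants at compile time, so the
+    // stores below are compiled out when they are not needed.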
+    #[target_feature(enable = "vpclmulqdq", enable = "avx512f")]
+    unsafe fn fold_help_vpclmulqdq<const COPY: bool>(
+        &mut self,
+        mut dst: &mut [u8],
+        src: &mut &[u8],
+        init_crc: &mut u32,
+    ) -> usize {
+        let [xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3] = &mut self.fold;
+        let start_len = src.len();
+
+        unsafe {
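+            // Folding constants for the zlib CRC-32 polynomial: `zmm_fold4`
+            // advances each 128-bit lane past 512 bits of input, `zmm_fold16`
+            // past 2048 bits. The same values appear in zlib-ng's VPCLMULQDQ
+            // implementation.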
+            let zmm_fold4 =
+                _mm512_set4_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596u32 as i32);
+            let zmm_fold16 = _mm512_set4_epi32(0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
+
+            // zmm register init
+            let zmm_crc0 = _mm512_setzero_si512();
+            let mut zmm_t0 = _mm512_loadu_si512(src.as_ptr().cast::<__m512i>());
+
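+            // XOR the caller's intermediate CRC into the low 32 bits of the
+            // first block (the usual CLMUL-folding seed). In copy mode the CRC
+            // always starts fresh, so this branch disappears at compile time.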
+            if !COPY && *init_crc != CRC32_INITIAL_VALUE {
+                let xmm_initial = _mm_cvtsi32_si128(*init_crc as i32);
+                let zmm_initial = _mm512_zextsi128_si512(xmm_initial);
+                zmm_t0 = _mm512_xor_si512(zmm_t0, zmm_initial);
+                *init_crc = CRC32_INITIAL_VALUE;
+            }
+
+            let mut zmm_crc1 = _mm512_loadu_si512(src.as_ptr().cast::<__m512i>().add(1));
+            let mut zmm_crc2 = _mm512_loadu_si512(src.as_ptr().cast::<__m512i>().add(2));
+            let mut zmm_crc3 = _mm512_loadu_si512(src.as_ptr().cast::<__m512i>().add(3));
+
+            /* The intermediate CRC already lives in the four xmm lanes;
+             * gather them into zmm_crc0 and fold-4 it over the first block.
+             */
+            let mut zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
+            zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
+            zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
+            zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
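+            // One fold-by-4 step: carry-less multiply the high and low 64-bit
+            // halves of each lane against the fold constant, then combine both
+            // products with the new data in one three-way XOR (0x96 is the
+            // truth table of A ^ B ^ C for `_mm512_ternarylogic_epi32`).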
+            let mut z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+            zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+            zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
+
+            if COPY {
+                _mm512_storeu_si512(dst.as_mut_ptr().cast::<__m512i>(), zmm_t0);
+                _mm512_storeu_si512(dst.as_mut_ptr().cast::<__m512i>().add(1), zmm_crc1);
+                _mm512_storeu_si512(dst.as_mut_ptr().cast::<__m512i>().add(2), zmm_crc2);
+                _mm512_storeu_si512(dst.as_mut_ptr().cast::<__m512i>().add(3), zmm_crc3);
+                dst = &mut dst[256..];
+            }
+
+            *src = &src[256..];
+
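+            // fold-16 main loop: each iteration folds the four 512-bit
+            // accumulators forward by 16 lanes (2048 bits) and XORs in the
+            // next 256 bytes of input.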
+            while src.len() >= 256 {
+                let zmm_t0 = _mm512_loadu_si512(src.as_ptr().cast::<__m512i>());
+                let zmm_t1 = _mm512_loadu_si512(src.as_ptr().cast::<__m512i>().add(1));
+                let zmm_t2 = _mm512_loadu_si512(src.as_ptr().cast::<__m512i>().add(2));
+                let zmm_t3 = _mm512_loadu_si512(src.as_ptr().cast::<__m512i>().add(3));
+
+                let z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
+                let z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
+                let z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
+                let z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);
+
+                zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
+                zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
+                zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
+                zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
+
+                zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
+                zmm_crc1 = _mm512_ternarylogic_epi32(zmm_crc1, z1, zmm_t1, 0x96);
+                zmm_crc2 = _mm512_ternarylogic_epi32(zmm_crc2, z2, zmm_t2, 0x96);
+                zmm_crc3 = _mm512_ternarylogic_epi32(zmm_crc3, z3, zmm_t3, 0x96);
+
+                if COPY {
+                    _mm512_storeu_si512(dst.as_mut_ptr().cast::<__m512i>(), zmm_t0);
+                    _mm512_storeu_si512(dst.as_mut_ptr().cast::<__m512i>().add(1), zmm_t1);
+                    _mm512_storeu_si512(dst.as_mut_ptr().cast::<__m512i>().add(2), zmm_t2);
+                    _mm512_storeu_si512(dst.as_mut_ptr().cast::<__m512i>().add(3), zmm_t3);
+                    dst = &mut dst[256..];
+                }
+
+                *src = &src[256..];
+            }
+
+            // zmm_crc[0,1,2,3] -> zmm_crc0
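+            // (three fold-by-4 steps: fold zmm_crc0 forward by 512 bits, then
+            // XOR in the next accumulator, until everything is in zmm_crc0)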
+            z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+            zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+            zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc1, 0x96);
+
+            z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+            zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+            zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc2, 0x96);
+
+            z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+            zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+            zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc3, 0x96);
+
+            // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
+            *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
+            *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
+            *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
+            *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);
+
+            // return the number of bytes processed
+            start_len - src.len()
+        }
+    }
+}