diff --git a/Cargo.toml b/Cargo.toml
index f9a95a9667..2fccc4c6f7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -49,9 +49,10 @@ include = [
     "crypto/curve25519/curve25519_64_adx.c",
     "crypto/curve25519/curve25519_tables.h",
     "crypto/curve25519/internal.h",
-    "crypto/fipsmodule/aes/asm/aesni-x86.pl",
     "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl",
+    "crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl",
     "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl",
+    "crypto/fipsmodule/aes/asm/aesni-x86.pl",
     "crypto/fipsmodule/aes/asm/aesni-x86_64.pl",
     "crypto/fipsmodule/aes/asm/aesv8-armx.pl",
     "crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl",
diff --git a/build.rs b/build.rs
index ca7a7d4860..a1fd06f6b2 100644
--- a/build.rs
+++ b/build.rs
@@ -113,6 +113,7 @@ const RING_SRCS: &[(&[&str], &str)] = &[
     (&[X86_64], "crypto/chacha/asm/chacha-x86_64.pl"),
     (&[X86_64], "crypto/curve25519/curve25519_64_adx.c"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl"),
+    (&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/ghash-x86_64.pl"),
@@ -912,8 +913,10 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "LIMBS_window5_unsplit_window",
         "aes_gcm_dec_kernel",
         "aes_gcm_dec_update_vaes_avx2",
+        "aes_gcm_dec_update_vaes_avx512",
         "aes_gcm_enc_kernel",
         "aes_gcm_enc_update_vaes_avx2",
+        "aes_gcm_enc_update_vaes_avx512",
         "aes_hw_ctr32_encrypt_blocks",
         "aes_hw_set_encrypt_key",
         "aes_hw_set_encrypt_key_128",
@@ -971,6 +974,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "gcm_ghash_clmul",
         "gcm_ghash_neon",
         "gcm_ghash_vpclmulqdq_avx2_16",
+        "gcm_ghash_vpclmulqdq_avx512_16",
         "gcm_gmult_clmul",
         "gcm_gmult_neon",
         "gcm_gmult_v8",
@@ -979,6 +983,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "gcm_init_neon",
         "gcm_init_v8",
         "gcm_init_vpclmulqdq_avx2",
+        "gcm_init_vpclmulqdq_avx512",
         "k25519Precomp",
         "limbs_mul_add_limb",
         "little_endian_bytes_from_scalar",
diff --git a/crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl b/crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl
index ea73594d09..23cd756577 100644
--- a/crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl
@@ -588,10 +588,11 @@ sub _ghash_4x {
     return $code;
 }
 
-# void gcm_gmult_vpclmulqdq_avx512(uint8_t Xi[16], const u128 Htable[16]);
-$code .= _begin_func "gcm_gmult_vpclmulqdq_avx512", 1;
+# void gcm_ghash_vpclmulqdq_avx512_16(uint8_t Xi[16], const u128 Htable[16],
+#                                     const uint8_t aad[16], size_t aad_len_16);
+$code .= _begin_func "gcm_ghash_vpclmulqdq_avx512_16", 1;
 {
-    my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ];
+    my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AAD_LEN_16 ) = @argregs[ 0 .. 3 ];
     my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
       map( "%xmm$_", ( 0 .. 6 ) );
 
@@ -599,7 +600,12 @@ sub _ghash_4x {
     @{[ _save_xmmregs (6) ]}
     .seh_endprologue
 
+    # Load the GHASH accumulator.
     vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC
+
+    # XOR the AAD into the accumulator.
+    vpxor ($AAD), $GHASH_ACC, $GHASH_ACC
+
     vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK
     vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1
     vmovdqu .Lgfpoly(%rip), $GFPOLY
@@ -615,127 +621,6 @@ sub _ghash_4x {
 }
 $code .= _end_func;
 
-# void gcm_ghash_vpclmulqdq_avx512(uint8_t Xi[16], const u128 Htable[16],
-#                                  const uint8_t *in, size_t len);
-#
-# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given
-# by |in| and |len|. |len| must be a multiple of 16.
-#
-# This function handles large amounts of AAD efficiently, while also keeping the
-# overhead low for small amounts of AAD which is the common case. TLS uses less
-# than one block of AAD, but (uncommonly) other use cases may use much more.
-$code .= _begin_func "gcm_ghash_vpclmulqdq_avx512", 1;
-{
-    # Function arguments
-    my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];
-
-    # Additional local variables
-    my ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( "%zmm0", "%xmm0" );
-    my ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( "%zmm1", "%xmm1" );
-    my ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( "%zmm2", "%xmm2" );
-    my ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( "%zmm3", "%xmm3" );
-    my @GHASHDATA = ( $GHASHDATA0, $GHASHDATA1, $GHASHDATA2, $GHASHDATA3 );
-    my @GHASHDATA_XMM =
-      ( $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM, $GHASHDATA3_XMM );
-    my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%zmm4", "%xmm4" );
-    my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%zmm5", "%xmm5" );
-    my ( $H_POW4, $H_POW3, $H_POW2 ) = ( "%zmm6", "%zmm7", "%zmm8" );
-    my ( $H_POW1, $H_POW1_XMM ) = ( "%zmm9", "%xmm9" );
-    my ( $GFPOLY, $GFPOLY_XMM ) = ( "%zmm10", "%xmm10" );
-    my ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) =
-      ( "%zmm11", "%zmm12", "%zmm13" );
-
-    $code .= <<___;
-    @{[ _save_xmmregs (6 .. 13) ]}
-    .seh_endprologue
-
-    # Load the bswap_mask and gfpoly constants. Since AADLEN is usually small,
-    # usually only 128-bit vectors will be used. So as an optimization, don't
-    # broadcast these constants to all 128-bit lanes quite yet.
-    vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK_XMM
-    vmovdqu .Lgfpoly(%rip), $GFPOLY_XMM
-
-    # Load the GHASH accumulator.
-    vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM
-    vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-
-    # Optimize for AADLEN < 64 by checking for AADLEN < 64 before AADLEN < 256.
-    cmp \$64, $AADLEN
-    jb .Laad_blockbyblock
-
-    # AADLEN >= 64, so we'll operate on full vectors. Broadcast bswap_mask and
-    # gfpoly to all 128-bit lanes.
-    vshufi64x2 \$0, $BSWAP_MASK, $BSWAP_MASK, $BSWAP_MASK
-    vshufi64x2 \$0, $GFPOLY, $GFPOLY, $GFPOLY
-
-    # Load the lowest set of key powers.
-    vmovdqu8 $OFFSETOFEND_H_POWERS-1*64($HTABLE), $H_POW1
-
-    cmp \$256, $AADLEN
-    jb .Laad_loop_1x
-
-    # AADLEN >= 256. Load the higher key powers.
-    vmovdqu8 $OFFSETOFEND_H_POWERS-4*64($HTABLE), $H_POW4
-    vmovdqu8 $OFFSETOFEND_H_POWERS-3*64($HTABLE), $H_POW3
-    vmovdqu8 $OFFSETOFEND_H_POWERS-2*64($HTABLE), $H_POW2
-
-    # Update GHASH with 256 bytes of AAD at a time.
-.Laad_loop_4x:
-    vmovdqu8 0*64($AAD), $GHASHDATA0
-    vmovdqu8 1*64($AAD), $GHASHDATA1
-    vmovdqu8 2*64($AAD), $GHASHDATA2
-    vmovdqu8 3*64($AAD), $GHASHDATA3
-    @{[ _ghash_4x $BSWAP_MASK, @GHASHDATA, @GHASHDATA_XMM, $H_POW4, $H_POW3,
-                  $H_POW2, $H_POW1, $GFPOLY, $GHASHTMP0, $GHASHTMP1,
-                  $GHASHTMP2, $GHASH_ACC, $GHASH_ACC_XMM ]}
-    add \$256, $AAD
-    sub \$256, $AADLEN
-    cmp \$256, $AADLEN
-    jae .Laad_loop_4x
-
-    # Update GHASH with 64 bytes of AAD at a time.
-    cmp \$64, $AADLEN
-    jb .Laad_large_done
-.Laad_loop_1x:
-    vmovdqu8 ($AAD), $GHASHDATA0
-    vpshufb $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
-    vpxord $GHASHDATA0, $GHASH_ACC, $GHASH_ACC
-    @{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY,
-                   $GHASHDATA0, $GHASHDATA1, $GHASHDATA2 ]}
-    @{[ _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
-                        $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-    add \$64, $AAD
-    sub \$64, $AADLEN
-    cmp \$64, $AADLEN
-    jae .Laad_loop_1x
-
-.Laad_large_done:
-
-    # GHASH the remaining data 16 bytes at a time, using xmm registers only.
-.Laad_blockbyblock:
-    test $AADLEN, $AADLEN
-    jz .Laad_done
-    vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1_XMM
-.Laad_loop_blockbyblock:
-    vmovdqu ($AAD), $GHASHDATA0_XMM
-    vpshufb $BSWAP_MASK_XMM, $GHASHDATA0_XMM, $GHASHDATA0_XMM
-    vpxor $GHASHDATA0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-    @{[ _ghash_mul $H_POW1_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
-                   $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-    add \$16, $AAD
-    sub \$16, $AADLEN
-    jnz .Laad_loop_blockbyblock
-
-.Laad_done:
-    # Store the updated GHASH accumulator back to memory.
-    vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-    vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
-
-    vzeroupper # This is needed after using ymm or zmm registers.
-___
-}
-$code .= _end_func;
-
 # Do one non-last round of AES encryption on the counter blocks in aesdata[0-3]
 # using the round key that has been broadcast to all 128-bit lanes of round_key.
 sub _vaesenc_4x {
@@ -1292,11 +1177,6 @@ sub filter_and_print {
         my $postspace = $+{postspace};
         if (exists $asmMap{$trimmed}) {
             $line = ${prespace} . $asmMap{$trimmed} . ${postspace};
-        } else {
-            if($trimmed =~ /(vpclmulqdq|vaes).*%[yz]mm/) {
-                die ("found instruction not supported under old binutils, please update asmMap with the results of running\n" .
-                     'find target -name "*aes-gcm-avx512*.o" -exec python3 crypto/fipsmodule/aes/asm/make-avx-map-for-old-binutils.py \{\} \; | LC_ALL=C sort | uniq');
-            }
         }
     }
     print $line,"\n";
diff --git a/src/aead/aes_gcm.rs b/src/aead/aes_gcm.rs
index 4f10af12a5..61ce3655b6 100644
--- a/src/aead/aes_gcm.rs
+++ b/src/aead/aes_gcm.rs
@@ -38,6 +38,7 @@ use cpu::GetFeature as _;
 mod aarch64;
 mod aeshwclmulmovbe;
 mod vaesclmulavx2;
+mod vaesclmulavx512;
 
 #[derive(Clone)]
 pub(super) struct Key(DynKey);
@@ -77,6 +78,12 @@ enum DynKey {
     #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
     AesHwClMul(Combo<aes::hw::Key, gcm::clmul_aarch64::Key>),
 
+    #[cfg(all(
+        target_arch = "x86_64",
+        not(any(target_os = "macos", target_vendor = "apple"))
+    ))]
+    VAesClMulAvx512(Combo<aes::hw::Key, gcm::vclmulavx512::Key>),
+
     #[cfg(target_arch = "x86_64")]
     VAesClMulAvx2(Combo<aes::hw::Key, gcm::vclmulavx2::Key>),
 
@@ -117,16 +124,25 @@ impl DynKey {
         if let Some((aes, gcm)) = cpu.get_feature() {
             let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature());
             let gcm_key_value = derive_gcm_key_value(&aes_key);
-            return if let Some(cpu) = cpu.get_feature() {
+
+            #[cfg(not(any(target_os = "macos", target_vendor = "apple")))]
+            if let Some(cpu) = cpu.get_feature() {
+                let gcm_key = gcm::vclmulavx512::Key::new(gcm_key_value, cpu);
+                return Self::VAesClMulAvx512(Combo { aes_key, gcm_key });
+            }
+
+            if let Some(cpu) = cpu.get_feature() {
                 let gcm_key = gcm::vclmulavx2::Key::new(gcm_key_value, cpu);
-                Self::VAesClMulAvx2(Combo { aes_key, gcm_key })
-            } else if let Some(cpu) = cpu.get_feature() {
+                return Self::VAesClMulAvx2(Combo { aes_key, gcm_key });
+            }
+
+            if let Some(cpu) = cpu.get_feature() {
                 let gcm_key = gcm::clmulavxmovbe::Key::new(gcm_key_value, cpu);
-                Self::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key })
-            } else {
-                let gcm_key = gcm::clmul_x86_64::Key::new(gcm_key_value, gcm);
-                Self::AesHwClMul(Combo { aes_key, gcm_key })
-            };
+                return Self::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key });
+            }
+
+            let gcm_key = gcm::clmul_x86_64::Key::new(gcm_key_value, gcm);
+            return Self::AesHwClMul(Combo { aes_key, gcm_key });
         }
 
         #[cfg(target_arch = "x86")]
@@ -227,6 +243,12 @@ fn seal(
             seal_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::seal_whole)
         }
 
+        #[cfg(target_arch = "x86_64")]
+        #[cfg(not(any(target_os = "macos", target_vendor = "apple")))]
+        DynKey::VAesClMulAvx512(c) => {
+            seal_whole_partial(c, aad, in_out, ctr, tag_iv, vaesclmulavx512::seal_whole)
+        }
+
         #[cfg(target_arch = "x86_64")]
         DynKey::VAesClMulAvx2(c) => seal_whole_partial(
             c,
@@ -352,6 +374,12 @@ fn open(
             open_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::open_whole)
         }
 
+        #[cfg(target_arch = "x86_64")]
+        #[cfg(not(any(target_os = "macos", target_vendor = "apple")))]
+        DynKey::VAesClMulAvx512(c) => {
+            open_whole_partial(c, aad, in_out, ctr, tag_iv, vaesclmulavx512::open_whole)
+        }
+
         #[cfg(target_arch = "x86_64")]
         DynKey::VAesClMulAvx2(c) => open_whole_partial(
             c,
diff --git a/src/aead/aes_gcm/vaesclmulavx512.rs b/src/aead/aes_gcm/vaesclmulavx512.rs
new file mode 100644
index 0000000000..6e23b63beb
--- /dev/null
+++ b/src/aead/aes_gcm/vaesclmulavx512.rs
@@ -0,0 +1,90 @@
+// Copyright 2015-2025 Brian Smith.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+#![cfg(target_arch = "x86_64")]
+#![cfg_attr(any(target_os = "macos", target_vendor = "apple"), allow(dead_code))]
+
+use super::{aes, gcm, Counter, Overlapping, BLOCK_LEN};
+use crate::{c, polyfill::slice::AsChunksMut};
+use core::num::{NonZeroU32, NonZeroUsize};
+
+pub(super) fn seal_whole(
+    aes_key: &aes::hw::Key,
+    auth: &mut gcm::Context<gcm::vclmulavx512::Key>,
+    ctr: &mut Counter,
+    mut in_out: AsChunksMut<u8, BLOCK_LEN>,
+) {
+    prefixed_extern! {
+        fn aes_gcm_enc_update_vaes_avx512(
+            input: *const u8,
+            output: *mut u8,
+            len: c::NonZero_size_t, // TODO? zero OK?
+            key: &aes::hw::Key,
+            ivec: &Counter,
+            Htable: &gcm::vclmulavx512::Key,
+            Xi: &mut gcm::Xi);
+    }
+
+    let in_out = in_out.as_flattened_mut();
+
+    // Precondition: Since we have a `gcm::Context`, the number of blocks
+    // must fit in `u32`.
+    let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();
+
+    if let Some(len) = NonZeroUsize::new(in_out.len()) {
+        let (htable, xi) = auth.inner();
+        let input = in_out.as_ptr();
+        let output = in_out.as_mut_ptr();
+        unsafe { aes_gcm_enc_update_vaes_avx512(input, output, len, aes_key, ctr, htable, xi) };
+        let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
+            unreachable!() // Due to previous checks.
+        });
+        ctr.increment_by_less_safe(blocks);
+    }
+}
+
+pub(super) fn open_whole(
+    aes_key: &aes::hw::Key,
+    auth: &mut gcm::Context<gcm::vclmulavx512::Key>,
+    in_out: Overlapping,
+    ctr: &mut Counter,
+) {
+    prefixed_extern! {
+        fn aes_gcm_dec_update_vaes_avx512(
+            input: *const u8,
+            output: *mut u8,
+            len: c::NonZero_size_t, // TODO? zero OK?
+            key: &aes::hw::Key,
+            ivec: &mut Counter,
+            Htable: &gcm::vclmulavx512::Key,
+            Xi: &mut gcm::Xi);
+    }
+
+    // Precondition. TODO: Create an overlapping::AsChunks for this.
+    assert_eq!(in_out.len() % BLOCK_LEN, 0);
+    // Precondition: Since we have a `gcm::Context`, the number of blocks
+    // must fit in `u32`.
+    let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();
+
+    in_out.with_input_output_len(|input, output, len| {
+        if let Some(len) = NonZeroUsize::new(len) {
+            let (htable, xi) = auth.inner();
+            unsafe { aes_gcm_dec_update_vaes_avx512(input, output, len, aes_key, ctr, htable, xi) };
+            let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
+                unreachable!() // Due to previous checks.
+            });
+            ctr.increment_by_less_safe(blocks);
+        }
+    })
+}
diff --git a/src/aead/gcm.rs b/src/aead/gcm.rs
index 341d12406e..c66fc1be08 100644
--- a/src/aead/gcm.rs
+++ b/src/aead/gcm.rs
@@ -45,6 +45,7 @@ pub(super) mod clmulavxmovbe;
 pub(super) mod fallback;
 pub(super) mod neon;
 pub(super) mod vclmulavx2;
+pub(super) mod vclmulavx512;
 
 pub(super) struct Context<'key, K> {
     Xi: Xi,
@@ -128,6 +129,15 @@ impl Context<'_, vclmulavx2::Key> {
     }
 }
 
+#[cfg(target_arch = "x86_64")]
+impl Context<'_, vclmulavx512::Key> {
+    /// Access to `inner` for the integrated AES-GCM implementations only.
+    #[inline]
+    pub(super) fn inner(&mut self) -> (&vclmulavx512::Key, &mut Xi) {
+        (self.key, &mut self.Xi)
+    }
+}
+
 impl<K: UpdateBlocks> Context<'_, K> {
     #[inline(always)]
     pub fn update_blocks(&mut self, input: AsChunks<u8, BLOCK_LEN>) {
diff --git a/src/aead/gcm/vclmulavx512.rs b/src/aead/gcm/vclmulavx512.rs
new file mode 100644
index 0000000000..5043b568d6
--- /dev/null
+++ b/src/aead/gcm/vclmulavx512.rs
@@ -0,0 +1,62 @@
+// Copyright 2018-2025 Brian Smith.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+#![cfg(target_arch = "x86_64")]
+#![cfg_attr(any(target_os = "macos", target_vendor = "apple"), allow(dead_code))]
+
+use super::{ffi, ffi::KeyValue, UpdateBlock, Xi};
+use crate::{
+    aead::gcm::ffi::BLOCK_LEN,
+    c,
+    cpu::intel::{Avx2, Avx512_BW_VL_ZMM, Bmi2, VAesClmul},
+    polyfill::slice::AsChunks,
+};
+use core::mem::MaybeUninit;
+
+#[derive(Clone)]
+#[repr(transparent)]
+pub struct Key([ffi::U128; 16]);
+
+impl Key {
+    pub(in super::super) fn new(
+        value: KeyValue,
+        _cpu: (Avx2, Avx512_BW_VL_ZMM, Bmi2, VAesClmul),
+    ) -> Self {
+        prefixed_extern! {
+            fn gcm_init_vpclmulqdq_avx512(HTable: *mut Key, h: &KeyValue);
+        }
+        let mut uninit = MaybeUninit::<Key>::uninit();
+        unsafe {
+            gcm_init_vpclmulqdq_avx512(uninit.as_mut_ptr(), &value);
+        }
+        unsafe { uninit.assume_init() }
+    }
+}
+
+impl UpdateBlock for Key {
+    fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) {
+        prefixed_extern! {
+            fn gcm_ghash_vpclmulqdq_avx512_16(
+                xi: &mut Xi,
+                Htable: &Key,
+                inp: *const u8,
+                len: c::NonZero_size_t,
+            );
+        }
+        let input: AsChunks<u8, 16> = (&a).into();
+        ffi::with_non_dangling_ptr(input, |input, len| unsafe {
+            gcm_ghash_vpclmulqdq_avx512_16(xi, self, input, len)
+        })
+    }
+}
diff --git a/src/cpu.rs b/src/cpu.rs
index 2bb68c55b9..3944c2b8e9 100644
--- a/src/cpu.rs
+++ b/src/cpu.rs
@@ -114,6 +114,20 @@ where
     }
 }
 
+impl<A, B, C, D> GetFeature<(A, B, C, D)> for features::Values
+where
+    features::Values: GetFeature<(A, B)>,
+    features::Values: GetFeature<(C, D)>,
+{
+    #[inline(always)]
+    fn get_feature(&self) -> Option<(A, B, C, D)> {
+        match (self.get_feature(), self.get_feature()) {
+            (Some((a, b)), Some((c, d))) => Some((a, b, c, d)),
+            _ => None,
+        }
+    }
+}
+
 impl<T> GetFeature<T> for Features
 where
     features::Values: GetFeature<T>,
diff --git a/src/cpu/x86_64.rs b/src/cpu/x86_64.rs
index a0c86c51b1..33572a1ba9 100644
--- a/src/cpu/x86_64.rs
+++ b/src/cpu/x86_64.rs
@@ -13,7 +13,7 @@
 // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
 // "Intel" citations are for "Intel 64 and IA-32 Architectures Software
-// Developer’s Manual", Combined Volumes, December 2024.
+// Developer’s Manual", Combined Volumes, June 2025.
 
 // "AMD" citations are for "AMD64 Technology AMD64 Architecture
 // Programmer’s Manual, Volumes 1-5" Revision 4.08 April 2024.
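An aside on the `GetFeature<(A, B, C, D)>` impl added to src/cpu.rs above: it composes the two existing pair impls, so the single `cpu.get_feature()` call in `DynKey::new` can demand all four feature tokens (`Avx2`, `Avx512_BW_VL_ZMM`, `Bmi2`, `VAesClmul`) that `gcm::vclmulavx512::Key::new` takes. Below is a minimal, self-contained sketch of the same composition trick; the names (`Values`, `Get`, the example tuple types) are stand-ins, not part of the patch.

// Stand-ins for `features::Values` and the zero-sized feature tokens.
struct Values;
trait Get<T> {
    fn get(&self) -> Option<T>;
}
impl Get<(u8, u16)> for Values {
    fn get(&self) -> Option<(u8, u16)> { Some((1, 2)) }
}
impl Get<(u32, u64)> for Values {
    fn get(&self) -> Option<(u32, u64)> { Some((3, 4)) }
}
// Mirrors the patch: a 4-tuple lookup succeeds only if both pair lookups do.
impl<A, B, C, D> Get<(A, B, C, D)> for Values
where
    Values: Get<(A, B)> + Get<(C, D)>,
{
    fn get(&self) -> Option<(A, B, C, D)> {
        match (self.get(), self.get()) {
            (Some((a, b)), Some((c, d))) => Some((a, b, c, d)),
            _ => None,
        }
    }
}

fn main() {
    // Type inference selects the pair impls from the requested tuple type.
    let abcd: Option<(u8, u16, u32, u64)> = Values.get();
    assert_eq!(abcd, Some((1, 2, 3, 4)));
}

Because inference picks the impl from the type the caller requests, adding the 4-tuple case requires no per-feature dispatch code at the call sites.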
@@ -81,13 +81,18 @@ pub(super) mod featureflags {
 }
 
 struct CpuidSummary {
-    is_intel: bool,
+    model: Model,
     leaf1_ecx: u32,
     extended_features_ecx: u32,
     extended_features_ebx: u32,
     xcr0: u64,
 }
 
+enum Model {
+    Intel { family6_model: Option<u32> },
+    Other,
+}
+
 // SAFETY: This unconditionally uses CPUID because we don't have a good
 // way to detect CPUID and because we don't know of a CPU that supports
 // SSE2 (that we currently statically require) but doesn't support
@@ -106,14 +111,39 @@ unsafe fn cpuid_all() -> CpuidSummary {
     let is_intel = (leaf0.ebx == 0x756e6547)
         && (leaf0.edx == 0x49656e69)
        && (leaf0.ecx == 0x6c65746e);
 
-    let leaf1_ecx = if leaf0.eax >= 1 {
-        // SAFETY: `leaf0.eax >= 1` indicates leaf 1 is available.
+    let (model, leaf1_ecx) = if leaf0.eax >= 1 {
+        // SAFETY: `leaf0.eax >= 1` indicates leaf 1 is available.
         let leaf1 = unsafe { arch::__cpuid(1) };
-        leaf1.ecx
+        let model = (leaf1.eax >> 4) & 0x0f;
+        let family_id = (leaf1.eax >> 8) & 0x0f;
+        let model = if family_id == 6 || family_id == 15 {
+            let model_extended = (leaf1.eax >> 16) & 0x0f;
+            (model_extended << 4) | model
+        } else {
+            model
+        };
+
+        let model = match (is_intel, family_id) {
+            (true, 6) => Model::Intel {
+                family6_model: Some(model),
+            },
+            (true, _) => Model::Intel {
+                family6_model: None,
+            },
+            (false, _) => Model::Other,
+        };
+        (model, leaf1.ecx)
     } else {
         // Expected to be unreachable on any environment we currently
         // support.
-        0
+        let model = if is_intel {
+            Model::Intel {
+                family6_model: None,
+            }
+        } else {
+            Model::Other
+        };
+        (model, 0)
     };
 
     let (extended_features_ecx, extended_features_ebx) = if leaf0.eax >= 7 {
@@ -131,7 +161,7 @@ unsafe fn cpuid_all() -> CpuidSummary {
     };
 
     CpuidSummary {
-        is_intel,
+        model,
         leaf1_ecx,
         extended_features_ecx,
         extended_features_ebx,
@@ -143,7 +173,7 @@ fn cpuid_to_caps_and_set_c_flags(r: CpuidSummary) -> u32 {
     use core::{mem::align_of, sync::atomic::AtomicU32};
 
     let CpuidSummary {
-        is_intel,
+        model,
         leaf1_ecx,
         extended_features_ecx,
         extended_features_ebx,
@@ -218,6 +248,58 @@ fn cpuid_to_caps_and_set_c_flags(r: CpuidSummary) -> u32 {
         // calling into the C code.
         let flag = unsafe { &avx2_available };
         flag.store(1, core::sync::atomic::Ordering::Relaxed);
+
+        // Intel: The OS isn't allowed to enable ZMM state w/o enabling YMM/XMM
+        // state.
+        let os_supports_zmm_ymm_xmm =
+            check(r.xcr0, 7) && check(r.xcr0, 6) && check(r.xcr0, 5) && os_supports_ymm_xmm;
+
+        // Intel: "15.2 DETECTION OF AVX-512 FOUNDATION INSTRUCTIONS".
+        let f = os_supports_zmm_ymm_xmm && check(extended_features_ebx, 16);
+
+        // Intel: "15.3 DETECTION OF 512-BIT INSTRUCTION GROUPS OF THE INTEL
+        // AVX-512 FAMILY"
+        let bw = os_supports_zmm_ymm_xmm && check(extended_features_ebx, 30);
+
+        // Intel: "15.4 DETECTION OF INTEL AVX-512 INSTRUCTION GROUPS OPERATING
+        // AT 256 AND 128-BIT VECTOR LENGTHS".
+        let vl = os_supports_zmm_ymm_xmm && check(extended_features_ebx, 31);
+
+        // Initial releases of macOS 12 had a serious bug w.r.t. AVX-512
+        // support; see https://go-review.googlesource.com/c/sys/+/620256.
+        // Given that, plus Apple's transition to ARM, AVX-512 isn't worth
+        // supporting for their targets.
+        let is_darwin = cfg!(any(target_os = "macos", target_vendor = "apple"));
+
+        // Linux 'intel.c': `zmm_exclusion_list`/`X86_FEATURE_PREFER_YMM`.
+        // LLVM `X86.td`: `SKXTuning`/`TGLTuning`/`ICLTuning`.
+        // (family_id == 6 implies family_id != 15, so the extended family
+        // ID is not relevant.)
+        let avoid_zmm = match model {
+            Model::Intel {
+                family6_model: Some(model),
+            } => {
+                matches!(
+                    model,
+                    0x55 // Skylake X
+                        | 0x6a // Ice Lake X
+                        | 0x6c // Ice Lake D
+                        | 0x7d // Ice Lake
+                        | 0x7e // Ice Lake L
+                        | 0x9d // Ice Lake NNPI
+                        | 0x8c // Tiger Lake L
+                        | 0x8d // Tiger Lake
+                )
+            }
+            Model::Intel {
+                family6_model: None,
+            } => false,
+            Model::Other => false,
+        };
+
+        if f && bw && vl && !avoid_zmm && !is_darwin {
+            set(&mut caps, Shift::Avx512_BW_VL_ZMM);
+        }
     }
 
     // Intel: "12.13.4 Checking for Intel AES-NI Support"
@@ -265,6 +347,10 @@ fn cpuid_to_caps_and_set_c_flags(r: CpuidSummary) -> u32 {
         set(&mut caps, Shift::Sha);
     }
 
+    let is_intel = match model {
+        Model::Intel { .. } => true,
+        Model::Other => false,
+    };
     if is_intel {
         set(&mut caps, Shift::IntelCpu);
     }
@@ -326,6 +412,8 @@ fn check<T: BitAnd<Output = T> + Copy + Eq + From<u8> + Shl<u32, Output = T>>(
 }
 
 impl_get_feature! {
+    Avx512_BW_VL_ZMM,
+
     VAesClmul,
     ClMul,
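For reference on the `family6_model` values matched above: CPUID leaf-1 EAX encodes stepping, model, family, extended model, and extended family, and the extended-model nibble extends the 4-bit model number only when the base family ID is 6 or 15 (Intel SDM, CPUID leaf 01H). Below is a standalone sketch of the same decoding; the helper name `family6_model` is hypothetical and not part of the patch.

fn family6_model(leaf1_eax: u32) -> Option<u32> {
    let family_id = (leaf1_eax >> 8) & 0x0f;
    let model = (leaf1_eax >> 4) & 0x0f;
    // The extended model bits only apply when the base family ID is 6 or 15.
    let model = if family_id == 6 || family_id == 15 {
        let model_extended = (leaf1_eax >> 16) & 0x0f;
        (model_extended << 4) | model
    } else {
        model
    };
    (family_id == 6).then_some(model)
}

fn main() {
    // EAX = 0x806C1 is a Tiger Lake L part: family 6, model 0x8c, stepping 1.
    assert_eq!(family6_model(0x000806C1), Some(0x8c));
}

Model 0x8c is in the `avoid_zmm` list, so such a CPU does not get `Avx512_BW_VL_ZMM` set and keeps using the AVX2 kernels even though it implements AVX-512.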