3 changes: 2 additions & 1 deletion Cargo.toml
@@ -49,9 +49,10 @@ include = [
"crypto/curve25519/curve25519_64_adx.c",
"crypto/curve25519/curve25519_tables.h",
"crypto/curve25519/internal.h",
"crypto/fipsmodule/aes/asm/aesni-x86.pl",
"crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl",
"crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl",
"crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl",
"crypto/fipsmodule/aes/asm/aesni-x86.pl",
"crypto/fipsmodule/aes/asm/aesni-x86_64.pl",
"crypto/fipsmodule/aes/asm/aesv8-armx.pl",
"crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl",
5 changes: 5 additions & 0 deletions build.rs
@@ -113,6 +113,7 @@ const RING_SRCS: &[(&[&str], &str)] = &[
(&[X86_64], "crypto/chacha/asm/chacha-x86_64.pl"),
(&[X86_64], "crypto/curve25519/curve25519_64_adx.c"),
(&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl"),
(&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl"),
(&[X86_64], "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl"),
(&[X86_64], "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"),
(&[X86_64], "crypto/fipsmodule/aes/asm/ghash-x86_64.pl"),
@@ -912,8 +913,10 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
"LIMBS_window5_unsplit_window",
"aes_gcm_dec_kernel",
"aes_gcm_dec_update_vaes_avx2",
"aes_gcm_dec_update_vaes_avx512",
"aes_gcm_enc_kernel",
"aes_gcm_enc_update_vaes_avx2",
"aes_gcm_enc_update_vaes_avx512",
"aes_hw_ctr32_encrypt_blocks",
"aes_hw_set_encrypt_key",
"aes_hw_set_encrypt_key_128",
@@ -971,6 +974,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
"gcm_ghash_clmul",
"gcm_ghash_neon",
"gcm_ghash_vpclmulqdq_avx2_16",
"gcm_ghash_vpclmulqdq_avx512_16",
"gcm_gmult_clmul",
"gcm_gmult_neon",
"gcm_gmult_v8",
@@ -979,6 +983,7 @@
"gcm_init_neon",
"gcm_init_v8",
"gcm_init_vpclmulqdq_avx2",
"gcm_init_vpclmulqdq_avx512",
"k25519Precomp",
"limbs_mul_add_limb",
"little_endian_bytes_from_scalar",
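The symbol list above feeds ring's build-time symbol prefixing: every assembly entry point, including the new `*_avx512` ones, is renamed so that two different versions of ring can be linked into one binary without clashing, and the Rust side refers to the renamed symbols via `prefixed_extern!`. A minimal sketch of the idea, with a purely illustrative prefix (not ring's actual value):

```rust
// Illustrative sketch only: the real prefix and renaming machinery live in
// build.rs (prefix_all_symbols); the prefix below is a placeholder.
fn prefix_symbol(prefix: &str, symbol: &str) -> String {
    format!("{prefix}{symbol}")
}

fn main() {
    let prefix = "ring_core_x_y_z_"; // hypothetical prefix
    let renamed = prefix_symbol(prefix, "aes_gcm_enc_update_vaes_avx512");
    assert_eq!(renamed, "ring_core_x_y_z_aes_gcm_enc_update_vaes_avx512");
    // A symbol missing from the list in build.rs would keep its unprefixed
    // name in the assembly and fail to link against the prefixed name that
    // the Rust-side prefixed_extern! declaration expects.
}
```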
138 changes: 9 additions & 129 deletions crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl
@@ -588,18 +588,24 @@ sub _ghash_4x {
return $code;
}

# void gcm_gmult_vpclmulqdq_avx512(uint8_t Xi[16], const u128 Htable[16]);
$code .= _begin_func "gcm_gmult_vpclmulqdq_avx512", 1;
# void gcm_ghash_vpclmulqdq_avx512_16(uint8_t Xi[16], const u128 Htable[16],
# const uint8_t aad[16], size_t aad_len_16);
$code .= _begin_func "gcm_ghash_vpclmulqdq_avx512_16", 1;
{
my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ];
my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AAD_LEN_16 ) = @argregs[ 0 .. 3 ];
my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
map( "%xmm$_", ( 0 .. 6 ) );

$code .= <<___;
@{[ _save_xmmregs (6) ]}
.seh_endprologue

# Load the GHASH accumulator.
vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC

# XOR the AAD into the accumulator.
vpxor ($AAD), $GHASH_ACC, $GHASH_ACC

vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK
vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1
vmovdqu .Lgfpoly(%rip), $GFPOLY
@@ -615,127 +621,6 @@ sub _ghash_4x {
}
$code .= _end_func;

# void gcm_ghash_vpclmulqdq_avx512(uint8_t Xi[16], const u128 Htable[16],
# const uint8_t *in, size_t len);
#
# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given
# by |in| and |len|. |len| must be a multiple of 16.
#
# This function handles large amounts of AAD efficiently, while also keeping the
# overhead low for small amounts of AAD which is the common case. TLS uses less
# than one block of AAD, but (uncommonly) other use cases may use much more.
$code .= _begin_func "gcm_ghash_vpclmulqdq_avx512", 1;
{
# Function arguments
my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];

# Additional local variables
my ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( "%zmm0", "%xmm0" );
my ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( "%zmm1", "%xmm1" );
my ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( "%zmm2", "%xmm2" );
my ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( "%zmm3", "%xmm3" );
my @GHASHDATA = ( $GHASHDATA0, $GHASHDATA1, $GHASHDATA2, $GHASHDATA3 );
my @GHASHDATA_XMM =
( $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM, $GHASHDATA3_XMM );
my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%zmm4", "%xmm4" );
my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%zmm5", "%xmm5" );
my ( $H_POW4, $H_POW3, $H_POW2 ) = ( "%zmm6", "%zmm7", "%zmm8" );
my ( $H_POW1, $H_POW1_XMM ) = ( "%zmm9", "%xmm9" );
my ( $GFPOLY, $GFPOLY_XMM ) = ( "%zmm10", "%xmm10" );
my ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) =
( "%zmm11", "%zmm12", "%zmm13" );

$code .= <<___;
@{[ _save_xmmregs (6 .. 13) ]}
.seh_endprologue

# Load the bswap_mask and gfpoly constants. Since AADLEN is usually small,
# usually only 128-bit vectors will be used. So as an optimization, don't
# broadcast these constants to all 128-bit lanes quite yet.
vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK_XMM
vmovdqu .Lgfpoly(%rip), $GFPOLY_XMM

# Load the GHASH accumulator.
vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM
vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM

# Optimize for AADLEN < 64 by checking for AADLEN < 64 before AADLEN < 256.
cmp \$64, $AADLEN
jb .Laad_blockbyblock

# AADLEN >= 64, so we'll operate on full vectors. Broadcast bswap_mask and
# gfpoly to all 128-bit lanes.
vshufi64x2 \$0, $BSWAP_MASK, $BSWAP_MASK, $BSWAP_MASK
vshufi64x2 \$0, $GFPOLY, $GFPOLY, $GFPOLY

# Load the lowest set of key powers.
vmovdqu8 $OFFSETOFEND_H_POWERS-1*64($HTABLE), $H_POW1

cmp \$256, $AADLEN
jb .Laad_loop_1x

# AADLEN >= 256. Load the higher key powers.
vmovdqu8 $OFFSETOFEND_H_POWERS-4*64($HTABLE), $H_POW4
vmovdqu8 $OFFSETOFEND_H_POWERS-3*64($HTABLE), $H_POW3
vmovdqu8 $OFFSETOFEND_H_POWERS-2*64($HTABLE), $H_POW2

# Update GHASH with 256 bytes of AAD at a time.
.Laad_loop_4x:
vmovdqu8 0*64($AAD), $GHASHDATA0
vmovdqu8 1*64($AAD), $GHASHDATA1
vmovdqu8 2*64($AAD), $GHASHDATA2
vmovdqu8 3*64($AAD), $GHASHDATA3
@{[ _ghash_4x $BSWAP_MASK, @GHASHDATA, @GHASHDATA_XMM, $H_POW4, $H_POW3,
$H_POW2, $H_POW1, $GFPOLY, $GHASHTMP0, $GHASHTMP1,
$GHASHTMP2, $GHASH_ACC, $GHASH_ACC_XMM ]}
add \$256, $AAD
sub \$256, $AADLEN
cmp \$256, $AADLEN
jae .Laad_loop_4x

# Update GHASH with 64 bytes of AAD at a time.
cmp \$64, $AADLEN
jb .Laad_large_done
.Laad_loop_1x:
vmovdqu8 ($AAD), $GHASHDATA0
vpshufb $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
vpxord $GHASHDATA0, $GHASH_ACC, $GHASH_ACC
@{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY,
$GHASHDATA0, $GHASHDATA1, $GHASHDATA2 ]}
@{[ _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
$GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
add \$64, $AAD
sub \$64, $AADLEN
cmp \$64, $AADLEN
jae .Laad_loop_1x

.Laad_large_done:

# GHASH the remaining data 16 bytes at a time, using xmm registers only.
.Laad_blockbyblock:
test $AADLEN, $AADLEN
jz .Laad_done
vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1_XMM
.Laad_loop_blockbyblock:
vmovdqu ($AAD), $GHASHDATA0_XMM
vpshufb $BSWAP_MASK_XMM, $GHASHDATA0_XMM, $GHASHDATA0_XMM
vpxor $GHASHDATA0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
@{[ _ghash_mul $H_POW1_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
$GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
add \$16, $AAD
sub \$16, $AADLEN
jnz .Laad_loop_blockbyblock

.Laad_done:
# Store the updated GHASH accumulator back to memory.
vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR)

vzeroupper # This is needed after using ymm or zmm registers.
___
}
$code .= _end_func;

# Do one non-last round of AES encryption on the counter blocks in aesdata[0-3]
# using the round key that has been broadcast to all 128-bit lanes of round_key.
sub _vaesenc_4x {
@@ -1292,11 +1177,6 @@ sub filter_and_print {
my $postspace = $+{postspace};
if (exists $asmMap{$trimmed}) {
$line = ${prespace} . $asmMap{$trimmed} . ${postspace};
} else {
if($trimmed =~ /(vpclmulqdq|vaes).*%[yz]mm/) {
die ("found instruction not supported under old binutils, please update asmMap with the results of running\n" .
'find target -name "*aes-gcm-avx512*.o" -exec python3 crypto/fipsmodule/aes/asm/make-avx-map-for-old-binutils.py \{\} \; | LC_ALL=C sort | uniq');
}
}
}
print $line,"\n";
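This file's change removes the variable-length `gcm_ghash_vpclmulqdq_avx512` routine and replaces `gcm_gmult_vpclmulqdq_avx512` with `gcm_ghash_vpclmulqdq_avx512_16`, which folds exactly one 16-byte block of AAD into the accumulator. For reference, the operation performed per block is the standard GHASH update, Xi = (Xi ^ block) * H in GF(2^128). A slow, hedged reference sketch in Rust, bit by bit in GCM's reflected bit order (the real code does this with VPCLMULQDQ):

```rust
// Hedged reference sketch, not the optimized code. Blocks are assumed to have
// been loaded big-endian (u128::from_be_bytes), which is what the
// .Lbswap_mask shuffles accomplish in the assembly.
fn gf128_mul(x: u128, h: u128) -> u128 {
    // GCM's reduction polynomial x^128 + x^7 + x^2 + x + 1, written with the
    // x^0 coefficient in the most-significant bit: 0xE1 in the top byte.
    const R: u128 = 0xE1u128 << 120;
    let mut z = 0u128;
    let mut v = h;
    for i in 0..128 {
        // Bit i of x, where bit 0 is the leftmost (most significant) bit.
        if (x >> (127 - i)) & 1 == 1 {
            z ^= v;
        }
        let carry = v & 1;
        v >>= 1;
        if carry == 1 {
            v ^= R;
        }
    }
    z
}

// What gcm_ghash_vpclmulqdq_avx512_16 computes for its single 16-byte input.
fn ghash_update_one_block(xi: &mut u128, h: u128, block: u128) {
    *xi = gf128_mul(*xi ^ block, h);
}
```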
44 changes: 36 additions & 8 deletions src/aead/aes_gcm.rs
@@ -38,6 +38,7 @@ use cpu::GetFeature as _;
mod aarch64;
mod aeshwclmulmovbe;
mod vaesclmulavx2;
mod vaesclmulavx512;

#[derive(Clone)]
pub(super) struct Key(DynKey);
@@ -77,6 +78,12 @@ enum DynKey {
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
AesHwClMul(Combo<aes::hw::Key, gcm::clmul_aarch64::Key>),

#[cfg(all(
target_arch = "x86_64",
not(any(target_os = "macos", target_vendor = "apple"))
))]
VAesClMulAvx512(Combo<aes::hw::Key, gcm::vclmulavx512::Key>),

#[cfg(target_arch = "x86_64")]
VAesClMulAvx2(Combo<aes::hw::Key, gcm::vclmulavx2::Key>),

@@ -117,16 +124,25 @@ impl DynKey {
if let Some((aes, gcm)) = cpu.get_feature() {
let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature());
let gcm_key_value = derive_gcm_key_value(&aes_key);
return if let Some(cpu) = cpu.get_feature() {

#[cfg(not(any(target_os = "macos", target_vendor = "apple")))]
if let Some(cpu) = cpu.get_feature() {
let gcm_key = gcm::vclmulavx512::Key::new(gcm_key_value, cpu);
return Self::VAesClMulAvx512(Combo { aes_key, gcm_key });
}

if let Some(cpu) = cpu.get_feature() {
let gcm_key = gcm::vclmulavx2::Key::new(gcm_key_value, cpu);
Self::VAesClMulAvx2(Combo { aes_key, gcm_key })
} else if let Some(cpu) = cpu.get_feature() {
return Self::VAesClMulAvx2(Combo { aes_key, gcm_key });
}

if let Some(cpu) = cpu.get_feature() {
let gcm_key = gcm::clmulavxmovbe::Key::new(gcm_key_value, cpu);
Self::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key })
} else {
let gcm_key = gcm::clmul_x86_64::Key::new(gcm_key_value, gcm);
Self::AesHwClMul(Combo { aes_key, gcm_key })
};
return Self::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key });
}

let gcm_key = gcm::clmul_x86_64::Key::new(gcm_key_value, gcm);
return Self::AesHwClMul(Combo { aes_key, gcm_key });
}

#[cfg(target_arch = "x86")]
@@ -227,6 +243,12 @@ fn seal(
seal_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::seal_whole)
}

#[cfg(target_arch = "x86_64")]
#[cfg(not(any(target_os = "macos", target_vendor = "apple")))]
DynKey::VAesClMulAvx512(c) => {
seal_whole_partial(c, aad, in_out, ctr, tag_iv, vaesclmulavx512::seal_whole)
}

#[cfg(target_arch = "x86_64")]
DynKey::VAesClMulAvx2(c) => seal_whole_partial(
c,
@@ -352,6 +374,12 @@ fn open(
open_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::open_whole)
}

#[cfg(target_arch = "x86_64")]
#[cfg(not(any(target_os = "macos", target_vendor = "apple")))]
DynKey::VAesClMulAvx512(c) => {
open_whole_partial(c, aad, in_out, ctr, tag_iv, vaesclmulavx512::open_whole)
}

#[cfg(target_arch = "x86_64")]
DynKey::VAesClMulAvx2(c) => open_whole_partial(
c,
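The key-construction refactor above turns the single `if let ... else if ... else` expression into early returns, so the new AVX-512 arm can be tried first; it is compiled out on Apple targets by the `cfg(not(any(target_os = "macos", target_vendor = "apple")))` gates. The resulting priority order, as a hedged sketch with illustrative types (the real code drives selection through `cpu.get_feature()` and each implementation's `Key::new` signature):

```rust
// Illustrative-only model of the x86_64 dispatch order after this change.
#[derive(Clone, Copy)]
struct Features {
    vaes_avx512: bool, // gated off on Apple targets in the real code
    vaes_avx2: bool,
    clmul_avx_movbe: bool,
}

#[derive(Debug, PartialEq)]
enum GcmImpl {
    VAesClMulAvx512,
    VAesClMulAvx2,
    AesHwClMulAvxMovbe,
    AesHwClMul,
}

fn select(f: Features) -> GcmImpl {
    if f.vaes_avx512 {
        return GcmImpl::VAesClMulAvx512;
    }
    if f.vaes_avx2 {
        return GcmImpl::VAesClMulAvx2;
    }
    if f.clmul_avx_movbe {
        return GcmImpl::AesHwClMulAvxMovbe;
    }
    // Reaching here still implies AES-NI + CLMUL, which this whole path required.
    GcmImpl::AesHwClMul
}
```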
90 changes: 90 additions & 0 deletions src/aead/aes_gcm/vaesclmulavx512.rs
@@ -0,0 +1,90 @@
// Copyright 2015-2025 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

#![cfg(target_arch = "x86_64")]
#![cfg_attr(any(target_os = "macos", target_vendor = "apple"), allow(dead_code))]

use super::{aes, gcm, Counter, Overlapping, BLOCK_LEN};
use crate::{c, polyfill::slice::AsChunksMut};
use core::num::{NonZeroU32, NonZeroUsize};

pub(super) fn seal_whole(
aes_key: &aes::hw::Key,
auth: &mut gcm::Context<gcm::vclmulavx512::Key>,
ctr: &mut Counter,
mut in_out: AsChunksMut<u8, BLOCK_LEN>,
) {
prefixed_extern! {
fn aes_gcm_enc_update_vaes_avx512(
input: *const u8,
output: *mut u8,
len: c::NonZero_size_t, // TODO? zero OK?
key: &aes::hw::Key,
ivec: &Counter,
Htable: &gcm::vclmulavx512::Key,
Xi: &mut gcm::Xi);
}

let in_out = in_out.as_flattened_mut();

// Precondition: Since we have a `gcm::Context` then the number of blocks
// must fit in `u32`.
let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();

if let Some(len) = NonZeroUsize::new(in_out.len()) {
let (htable, xi) = auth.inner();
let input = in_out.as_ptr();
let output = in_out.as_mut_ptr();
unsafe { aes_gcm_enc_update_vaes_avx512(input, output, len, aes_key, ctr, htable, xi) };
let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
unreachable!() // Due to previous checks.
});
ctr.increment_by_less_safe(blocks);
}
}

pub(super) fn open_whole(
aes_key: &aes::hw::Key,
auth: &mut gcm::Context<gcm::vclmulavx512::Key>,
in_out: Overlapping,
ctr: &mut Counter,
) {
prefixed_extern! {
fn aes_gcm_dec_update_vaes_avx512(
input: *const u8,
output: *mut u8,
len: c::NonZero_size_t, // TODO? zero OK?
key: &aes::hw::Key,
ivec: &mut Counter,
Htable: &gcm::vclmulavx512::Key,
Xi: &mut gcm::Xi);
}

// Precondition. TODO: Create an overlapping::AsChunks for this.
assert_eq!(in_out.len() % BLOCK_LEN, 0);
// Precondition: Since we have a `gcm::Context` then the number of blocks
// must fit in `u32`.
let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();

in_out.with_input_output_len(|input, output, len| {
if let Some(len) = NonZeroUsize::new(len) {
let (htable, xi) = auth.inner();
unsafe { aes_gcm_dec_update_vaes_avx512(input, output, len, aes_key, ctr, htable, xi) };
let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
unreachable!() // Due to previous checks.
});
ctr.increment_by_less_safe(blocks);
}
})
}
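Both entry points above accept only whole 16-byte blocks (`AsChunksMut<u8, BLOCK_LEN>` for sealing, and the `len % BLOCK_LEN == 0` assertion for opening); the trailing partial block is handled by the generic `seal_whole_partial` / `open_whole_partial` wrappers seen in the dispatch arms. A hedged sketch of that split, with a hypothetical helper name (not ring's actual code):

```rust
// Hypothetical helper illustrating the whole/partial split the wrappers
// perform before calling seal_whole/open_whole.
const BLOCK_LEN: usize = 16;

fn split_whole_partial(in_out: &mut [u8]) -> (&mut [u8], &mut [u8]) {
    let whole_len = in_out.len() - (in_out.len() % BLOCK_LEN);
    // The first slice goes to the AVX-512 kernel; the second (0..15 bytes)
    // is encrypted with a single separately generated keystream block.
    in_out.split_at_mut(whole_len)
}

fn main() {
    let mut buf = [0u8; 37];
    let (whole, partial) = split_whole_partial(&mut buf);
    assert_eq!(whole.len(), 32);
    assert_eq!(partial.len(), 5);
}
```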