diff --git a/Cargo.toml b/Cargo.toml
index f9a95a9667..2fccc4c6f7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -49,9 +49,10 @@ include = [
     "crypto/curve25519/curve25519_64_adx.c",
     "crypto/curve25519/curve25519_tables.h",
     "crypto/curve25519/internal.h",
-    "crypto/fipsmodule/aes/asm/aesni-x86.pl",
     "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl",
+    "crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl",
     "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl",
+    "crypto/fipsmodule/aes/asm/aesni-x86.pl",
     "crypto/fipsmodule/aes/asm/aesni-x86_64.pl",
     "crypto/fipsmodule/aes/asm/aesv8-armx.pl",
     "crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl",
diff --git a/build.rs b/build.rs
index ca7a7d4860..a1fd06f6b2 100644
--- a/build.rs
+++ b/build.rs
@@ -113,6 +113,7 @@ const RING_SRCS: &[(&[&str], &str)] = &[
     (&[X86_64], "crypto/chacha/asm/chacha-x86_64.pl"),
     (&[X86_64], "crypto/curve25519/curve25519_64_adx.c"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl"),
+    (&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/ghash-x86_64.pl"),
@@ -912,8 +913,10 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "LIMBS_window5_unsplit_window",
         "aes_gcm_dec_kernel",
         "aes_gcm_dec_update_vaes_avx2",
+        "aes_gcm_dec_update_vaes_avx512",
         "aes_gcm_enc_kernel",
         "aes_gcm_enc_update_vaes_avx2",
+        "aes_gcm_enc_update_vaes_avx512",
         "aes_hw_ctr32_encrypt_blocks",
         "aes_hw_set_encrypt_key",
         "aes_hw_set_encrypt_key_128",
@@ -971,6 +974,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "gcm_ghash_clmul",
         "gcm_ghash_neon",
         "gcm_ghash_vpclmulqdq_avx2_16",
+        "gcm_ghash_vpclmulqdq_avx512_16",
         "gcm_gmult_clmul",
         "gcm_gmult_neon",
         "gcm_gmult_v8",
@@ -979,6 +983,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "gcm_init_neon",
         "gcm_init_v8",
         "gcm_init_vpclmulqdq_avx2",
+        "gcm_init_vpclmulqdq_avx512",
         "k25519Precomp",
         "limbs_mul_add_limb",
         "little_endian_bytes_from_scalar",
diff --git a/crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl b/crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl
index ea73594d09..23cd756577 100644
--- a/crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl
@@ -588,10 +588,11 @@ sub _ghash_4x {
     return $code;
 }
 
-# void gcm_gmult_vpclmulqdq_avx512(uint8_t Xi[16], const u128 Htable[16]);
-$code .= _begin_func "gcm_gmult_vpclmulqdq_avx512", 1;
+# void gcm_ghash_vpclmulqdq_avx512_16(uint8_t Xi[16], const u128 Htable[16],
+#                                     const uint8_t aad[16], size_t aad_len_16);
+$code .= _begin_func "gcm_ghash_vpclmulqdq_avx512_16", 1;
 {
-    my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ];
+    my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AAD_LEN_16 ) = @argregs[ 0 .. 3 ];
     my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
       map( "%xmm$_", ( 0 .. 6 ) );
 
@@ -599,7 +600,12 @@ sub _ghash_4x {
     @{[ _save_xmmregs (6) ]}
     .seh_endprologue
 
+    # Load the GHASH accumulator.
     vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC
+
+    # XOR the AAD into the accumulator.
+    vpxor ($AAD), $GHASH_ACC, $GHASH_ACC
+
     vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK
     vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1
     vmovdqu .Lgfpoly(%rip), $GFPOLY
@@ -615,127 +621,6 @@ sub _ghash_4x {
 }
 $code .= _end_func;
 
-# void gcm_ghash_vpclmulqdq_avx512(uint8_t Xi[16], const u128 Htable[16],
-#                                  const uint8_t *in, size_t len);
-#
-# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given
-# by |in| and |len|. |len| must be a multiple of 16.
-#
-# This function handles large amounts of AAD efficiently, while also keeping the
-# overhead low for small amounts of AAD which is the common case. TLS uses less
-# than one block of AAD, but (uncommonly) other use cases may use much more.
-$code .= _begin_func "gcm_ghash_vpclmulqdq_avx512", 1;
-{
-    # Function arguments
-    my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];
-
-    # Additional local variables
-    my ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( "%zmm0", "%xmm0" );
-    my ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( "%zmm1", "%xmm1" );
-    my ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( "%zmm2", "%xmm2" );
-    my ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( "%zmm3", "%xmm3" );
-    my @GHASHDATA = ( $GHASHDATA0, $GHASHDATA1, $GHASHDATA2, $GHASHDATA3 );
-    my @GHASHDATA_XMM =
-      ( $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM, $GHASHDATA3_XMM );
-    my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%zmm4", "%xmm4" );
-    my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%zmm5", "%xmm5" );
-    my ( $H_POW4, $H_POW3, $H_POW2 ) = ( "%zmm6", "%zmm7", "%zmm8" );
-    my ( $H_POW1, $H_POW1_XMM ) = ( "%zmm9", "%xmm9" );
-    my ( $GFPOLY, $GFPOLY_XMM ) = ( "%zmm10", "%xmm10" );
-    my ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) =
-      ( "%zmm11", "%zmm12", "%zmm13" );
-
-    $code .= <<___;
-    @{[ _save_xmmregs (6 .. 13) ]}
-    .seh_endprologue
-
-    # Load the bswap_mask and gfpoly constants. Since AADLEN is usually small,
-    # usually only 128-bit vectors will be used. So as an optimization, don't
-    # broadcast these constants to all 128-bit lanes quite yet.
-    vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK_XMM
-    vmovdqu .Lgfpoly(%rip), $GFPOLY_XMM
-
-    # Load the GHASH accumulator.
-    vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM
-    vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-
-    # Optimize for AADLEN < 64 by checking for AADLEN < 64 before AADLEN < 256.
-    cmp \$64, $AADLEN
-    jb .Laad_blockbyblock
-
-    # AADLEN >= 64, so we'll operate on full vectors. Broadcast bswap_mask and
-    # gfpoly to all 128-bit lanes.
-    vshufi64x2 \$0, $BSWAP_MASK, $BSWAP_MASK, $BSWAP_MASK
-    vshufi64x2 \$0, $GFPOLY, $GFPOLY, $GFPOLY
-
-    # Load the lowest set of key powers.
-    vmovdqu8 $OFFSETOFEND_H_POWERS-1*64($HTABLE), $H_POW1
-
-    cmp \$256, $AADLEN
-    jb .Laad_loop_1x
-
-    # AADLEN >= 256. Load the higher key powers.
-    vmovdqu8 $OFFSETOFEND_H_POWERS-4*64($HTABLE), $H_POW4
-    vmovdqu8 $OFFSETOFEND_H_POWERS-3*64($HTABLE), $H_POW3
-    vmovdqu8 $OFFSETOFEND_H_POWERS-2*64($HTABLE), $H_POW2
-
-    # Update GHASH with 256 bytes of AAD at a time.
-.Laad_loop_4x:
-    vmovdqu8 0*64($AAD), $GHASHDATA0
-    vmovdqu8 1*64($AAD), $GHASHDATA1
-    vmovdqu8 2*64($AAD), $GHASHDATA2
-    vmovdqu8 3*64($AAD), $GHASHDATA3
-    @{[ _ghash_4x $BSWAP_MASK, @GHASHDATA, @GHASHDATA_XMM, $H_POW4, $H_POW3,
-                  $H_POW2, $H_POW1, $GFPOLY, $GHASHTMP0, $GHASHTMP1,
-                  $GHASHTMP2, $GHASH_ACC, $GHASH_ACC_XMM ]}
-    add \$256, $AAD
-    sub \$256, $AADLEN
-    cmp \$256, $AADLEN
-    jae .Laad_loop_4x
-
-    # Update GHASH with 64 bytes of AAD at a time.
-    cmp \$64, $AADLEN
-    jb .Laad_large_done
-.Laad_loop_1x:
-    vmovdqu8 ($AAD), $GHASHDATA0
-    vpshufb $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
-    vpxord $GHASHDATA0, $GHASH_ACC, $GHASH_ACC
-    @{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY,
-                   $GHASHDATA0, $GHASHDATA1, $GHASHDATA2 ]}
-    @{[ _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
-                        $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-    add \$64, $AAD
-    sub \$64, $AADLEN
-    cmp \$64, $AADLEN
-    jae .Laad_loop_1x
-
-.Laad_large_done:
-
-    # GHASH the remaining data 16 bytes at a time, using xmm registers only.
-.Laad_blockbyblock:
-    test $AADLEN, $AADLEN
-    jz .Laad_done
-    vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1_XMM
-.Laad_loop_blockbyblock:
-    vmovdqu ($AAD), $GHASHDATA0_XMM
-    vpshufb $BSWAP_MASK_XMM, $GHASHDATA0_XMM, $GHASHDATA0_XMM
-    vpxor $GHASHDATA0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-    @{[ _ghash_mul $H_POW1_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
-                   $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-    add \$16, $AAD
-    sub \$16, $AADLEN
-    jnz .Laad_loop_blockbyblock
-
-.Laad_done:
-    # Store the updated GHASH accumulator back to memory.
-    vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-    vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
-
-    vzeroupper # This is needed after using ymm or zmm registers.
-___
-}
-$code .= _end_func;
-
 # Do one non-last round of AES encryption on the counter blocks in aesdata[0-3]
 # using the round key that has been broadcast to all 128-bit lanes of round_key.
 sub _vaesenc_4x {
@@ -1292,11 +1177,6 @@ sub filter_and_print {
         my $postspace = $+{postspace};
         if (exists $asmMap{$trimmed}) {
             $line = ${prespace} . $asmMap{$trimmed} . ${postspace};
-        } else {
-            if($trimmed =~ /(vpclmulqdq|vaes).*%[yz]mm/) {
-                die ("found instruction not supported under old binutils, please update asmMap with the results of running\n" .
-                     'find target -name "*aes-gcm-avx512*.o" -exec python3 crypto/fipsmodule/aes/asm/make-avx-map-for-old-binutils.py \{\} \; | LC_ALL=C sort | uniq');
-            }
         }
     }
     print $line,"\n";
diff --git a/src/aead/aes_gcm.rs b/src/aead/aes_gcm.rs
index 4f10af12a5..61ce3655b6 100644
--- a/src/aead/aes_gcm.rs
+++ b/src/aead/aes_gcm.rs
@@ -38,6 +38,7 @@ use cpu::GetFeature as _;
 mod aarch64;
 mod aeshwclmulmovbe;
 mod vaesclmulavx2;
+mod vaesclmulavx512;
 
 #[derive(Clone)]
 pub(super) struct Key(DynKey);
@@ -77,6 +78,12 @@ enum DynKey {
     #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
     AesHwClMul(Combo<aes::hw::Key, gcm::clmul_aarch64::Key>),
 
+    #[cfg(all(
+        target_arch = "x86_64",
+        not(any(target_os = "macos", target_vendor = "apple"))
+    ))]
+    VAesClMulAvx512(Combo<aes::hw::Key, gcm::vclmulavx512::Key>),
+
     #[cfg(target_arch = "x86_64")]
     VAesClMulAvx2(Combo<aes::hw::Key, gcm::vclmulavx2::Key>),
 
@@ -117,16 +124,25 @@ impl DynKey {
         if let Some((aes, gcm)) = cpu.get_feature() {
             let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature());
             let gcm_key_value = derive_gcm_key_value(&aes_key);
-            return if let Some(cpu) = cpu.get_feature() {
+
+            #[cfg(not(any(target_os = "macos", target_vendor = "apple")))]
+            if let Some(cpu) = cpu.get_feature() {
+                let gcm_key = gcm::vclmulavx512::Key::new(gcm_key_value, cpu);
+                return Self::VAesClMulAvx512(Combo { aes_key, gcm_key });
+            }
+
+            if let Some(cpu) = cpu.get_feature() {
                 let gcm_key = gcm::vclmulavx2::Key::new(gcm_key_value, cpu);
-                Self::VAesClMulAvx2(Combo { aes_key, gcm_key })
-            } else if let Some(cpu) = cpu.get_feature() {
+                return Self::VAesClMulAvx2(Combo { aes_key, gcm_key });
+            }
+
+            if let Some(cpu) = cpu.get_feature() {
                 let gcm_key = gcm::clmulavxmovbe::Key::new(gcm_key_value, cpu);
-                Self::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key })
-            } else {
-                let gcm_key = gcm::clmul_x86_64::Key::new(gcm_key_value, gcm);
-                Self::AesHwClMul(Combo { aes_key, gcm_key })
-            };
+                return Self::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key });
+            }
+
+            let gcm_key = gcm::clmul_x86_64::Key::new(gcm_key_value, gcm);
+            return Self::AesHwClMul(Combo { aes_key, gcm_key });
         }
 
         #[cfg(target_arch = "x86")]
@@ -227,6 +243,12 @@ fn seal(
             seal_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::seal_whole)
         }
 
+        #[cfg(target_arch = "x86_64")]
+        #[cfg(not(any(target_os = "macos", target_vendor = "apple")))]
+        DynKey::VAesClMulAvx512(c) => {
+            seal_whole_partial(c, aad, in_out, ctr, tag_iv, vaesclmulavx512::seal_whole)
+        }
+
         #[cfg(target_arch = "x86_64")]
         DynKey::VAesClMulAvx2(c) => seal_whole_partial(
             c,
@@ -352,6 +374,12 @@ fn open(
             open_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::open_whole)
         }
 
+        #[cfg(target_arch = "x86_64")]
+        #[cfg(not(any(target_os = "macos", target_vendor = "apple")))]
+        DynKey::VAesClMulAvx512(c) => {
+            open_whole_partial(c, aad, in_out, ctr, tag_iv, vaesclmulavx512::open_whole)
+        }
+
         #[cfg(target_arch = "x86_64")]
         DynKey::VAesClMulAvx2(c) => open_whole_partial(
             c,
diff --git a/src/aead/aes_gcm/vaesclmulavx512.rs b/src/aead/aes_gcm/vaesclmulavx512.rs
new file mode 100644
index 0000000000..6e23b63beb
--- /dev/null
+++ b/src/aead/aes_gcm/vaesclmulavx512.rs
@@ -0,0 +1,90 @@
+// Copyright 2015-2025 Brian Smith.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+#![cfg(target_arch = "x86_64")]
+#![cfg_attr(any(target_os = "macos", target_vendor = "apple"), allow(dead_code))]
+
+use super::{aes, gcm, Counter, Overlapping, BLOCK_LEN};
+use crate::{c, polyfill::slice::AsChunksMut};
+use core::num::{NonZeroU32, NonZeroUsize};
+
+pub(super) fn seal_whole(
+    aes_key: &aes::hw::Key,
+    auth: &mut gcm::Context<gcm::vclmulavx512::Key>,
+    ctr: &mut Counter,
+    mut in_out: AsChunksMut<u8, BLOCK_LEN>,
+) {
+    prefixed_extern! {
+        fn aes_gcm_enc_update_vaes_avx512(
+            input: *const u8,
+            output: *mut u8,
+            len: c::NonZero_size_t, // TODO? zero OK?
+            key: &aes::hw::Key,
+            ivec: &Counter,
+            Htable: &gcm::vclmulavx512::Key,
+            Xi: &mut gcm::Xi);
+    }
+
+    let in_out = in_out.as_flattened_mut();
+
+    // Precondition: Since we have a `gcm::Context`, the number of blocks
+    // must fit in `u32`.
+    let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();
+
+    if let Some(len) = NonZeroUsize::new(in_out.len()) {
+        let (htable, xi) = auth.inner();
+        let input = in_out.as_ptr();
+        let output = in_out.as_mut_ptr();
+        unsafe { aes_gcm_enc_update_vaes_avx512(input, output, len, aes_key, ctr, htable, xi) };
+        let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
+            unreachable!() // Due to previous checks.
+        });
+        ctr.increment_by_less_safe(blocks);
+    }
+}
+
+pub(super) fn open_whole(
+    aes_key: &aes::hw::Key,
+    auth: &mut gcm::Context<gcm::vclmulavx512::Key>,
+    in_out: Overlapping,
+    ctr: &mut Counter,
+) {
+    prefixed_extern! {
+        fn aes_gcm_dec_update_vaes_avx512(
+            input: *const u8,
+            output: *mut u8,
+            len: c::NonZero_size_t, // TODO? zero OK?
+            key: &aes::hw::Key,
+            ivec: &mut Counter,
+            Htable: &gcm::vclmulavx512::Key,
+            Xi: &mut gcm::Xi);
+    }
+
+    // Precondition. TODO: Create an overlapping::AsChunks for this.
+    assert_eq!(in_out.len() % BLOCK_LEN, 0);
+    // Precondition: Since we have a `gcm::Context`, the number of blocks
+    // must fit in `u32`.
+    let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();
+
+    in_out.with_input_output_len(|input, output, len| {
+        if let Some(len) = NonZeroUsize::new(len) {
+            let (htable, xi) = auth.inner();
+            unsafe { aes_gcm_dec_update_vaes_avx512(input, output, len, aes_key, ctr, htable, xi) };
+            let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
+                unreachable!() // Due to previous checks.
+            });
+            ctr.increment_by_less_safe(blocks);
+        }
+    })
+}
diff --git a/src/aead/gcm.rs b/src/aead/gcm.rs
index 341d12406e..c66fc1be08 100644
--- a/src/aead/gcm.rs
+++ b/src/aead/gcm.rs
@@ -45,6 +45,7 @@ pub(super) mod clmulavxmovbe;
 pub(super) mod fallback;
 pub(super) mod neon;
 pub(super) mod vclmulavx2;
+pub(super) mod vclmulavx512;
 
 pub(super) struct Context<'key, K> {
     Xi: Xi,
@@ -128,6 +129,15 @@ impl Context<'_, vclmulavx2::Key> {
     }
 }
 
+#[cfg(target_arch = "x86_64")]
+impl Context<'_, vclmulavx512::Key> {
+    /// Access to `inner` for the integrated AES-GCM implementations only.
+    #[inline]
+    pub(super) fn inner(&mut self) -> (&vclmulavx512::Key, &mut Xi) {
+        (self.key, &mut self.Xi)
+    }
+}
+
 impl<K: UpdateBlocks> Context<'_, K> {
     #[inline(always)]
     pub fn update_blocks(&mut self, input: AsChunks<u8, BLOCK_LEN>) {
diff --git a/src/aead/gcm/vclmulavx512.rs b/src/aead/gcm/vclmulavx512.rs
new file mode 100644
index 0000000000..5043b568d6
--- /dev/null
+++ b/src/aead/gcm/vclmulavx512.rs
@@ -0,0 +1,62 @@
+// Copyright 2018-2025 Brian Smith.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+#![cfg(target_arch = "x86_64")]
+#![cfg_attr(any(target_os = "macos", target_vendor = "apple"), allow(dead_code))]
+
+use super::{ffi, ffi::KeyValue, UpdateBlock, Xi};
+use crate::{
+    aead::gcm::ffi::BLOCK_LEN,
+    c,
+    cpu::intel::{Avx2, Avx512_BW_VL_ZMM, Bmi2, VAesClmul},
+    polyfill::slice::AsChunks,
+};
+use core::mem::MaybeUninit;
+
+#[derive(Clone)]
+#[repr(transparent)]
+pub struct Key([ffi::U128; 16]);
+
+impl Key {
+    pub(in super::super) fn new(
+        value: KeyValue,
+        _cpu: (Avx2, Avx512_BW_VL_ZMM, Bmi2, VAesClmul),
+    ) -> Self {
+        prefixed_extern! {
+            fn gcm_init_vpclmulqdq_avx512(HTable: *mut Key, h: &KeyValue);
+        }
+        let mut uninit = MaybeUninit::<Key>::uninit();
+        unsafe {
+            gcm_init_vpclmulqdq_avx512(uninit.as_mut_ptr(), &value);
+        }
+        unsafe { uninit.assume_init() }
+    }
+}
+
+impl UpdateBlock for Key {
+    fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) {
+        prefixed_extern! {
+            fn gcm_ghash_vpclmulqdq_avx512_16(
+                xi: &mut Xi,
+                Htable: &Key,
+                inp: *const u8,
+                len: c::NonZero_size_t,
+            );
+        }
+        let input: AsChunks<u8, 16> = (&a).into();
+        ffi::with_non_dangling_ptr(input, |input, len| unsafe {
+            gcm_ghash_vpclmulqdq_avx512_16(xi, self, input, len)
+        })
+    }
+}
diff --git a/src/cpu.rs b/src/cpu.rs
index 2bb68c55b9..3944c2b8e9 100644
--- a/src/cpu.rs
+++ b/src/cpu.rs
@@ -114,6 +114,20 @@ where
     }
 }
 
+impl<A, B, C, D> GetFeature<(A, B, C, D)> for features::Values
+where
+    features::Values: GetFeature<(A, B)>,
+    features::Values: GetFeature<(C, D)>,
+{
+    #[inline(always)]
+    fn get_feature(&self) -> Option<(A, B, C, D)> {
+        match (self.get_feature(), self.get_feature()) {
+            (Some((a, b)), Some((c, d))) => Some((a, b, c, d)),
+            _ => None,
+        }
+    }
+}
+
 impl<T> GetFeature<T> for Features
 where
     features::Values: GetFeature<T>,
diff --git a/src/cpu/x86_64.rs b/src/cpu/x86_64.rs
index a0c86c51b1..33572a1ba9 100644
--- a/src/cpu/x86_64.rs
+++ b/src/cpu/x86_64.rs
@@ -13,7 +13,7 @@
 // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
 // "Intel" citations are for "Intel 64 and IA-32 Architectures Software
-// Developer’s Manual", Combined Volumes, December 2024.
+// Developer’s Manual", Combined Volumes, June 2025.
 
 // "AMD" citations are for "AMD64 Technology AMD64 Architecture
 // Programmer’s Manual, Volumes 1-5" Revision 4.08 April 2024.
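An aside on the `GetFeature<(A, B, C, D)>` impl added to src/cpu.rs above: it composes the two existing pair impls, so the single `cpu.get_feature()` call in `DynKey::new` can demand all four feature tokens (`Avx2`, `Avx512_BW_VL_ZMM`, `Bmi2`, `VAesClmul`) that `gcm::vclmulavx512::Key::new` takes. Below is a minimal, self-contained sketch of the same composition trick; the names (`Values`, `Get`, the example tuple types) are stand-ins, not part of the patch.

// Stand-ins for `features::Values` and the zero-sized feature tokens.
struct Values;
trait Get<T> {
    fn get(&self) -> Option<T>;
}
impl Get<(u8, u16)> for Values {
    fn get(&self) -> Option<(u8, u16)> { Some((1, 2)) }
}
impl Get<(u32, u64)> for Values {
    fn get(&self) -> Option<(u32, u64)> { Some((3, 4)) }
}
// Mirrors the patch: a 4-tuple lookup succeeds only if both pair lookups do.
impl<A, B, C, D> Get<(A, B, C, D)> for Values
where
    Values: Get<(A, B)> + Get<(C, D)>,
{
    fn get(&self) -> Option<(A, B, C, D)> {
        match (self.get(), self.get()) {
            (Some((a, b)), Some((c, d))) => Some((a, b, c, d)),
            _ => None,
        }
    }
}

fn main() {
    // Type inference selects the pair impls from the requested tuple type.
    let abcd: Option<(u8, u16, u32, u64)> = Values.get();
    assert_eq!(abcd, Some((1, 2, 3, 4)));
}

Because inference picks the impl from the type the caller requests, adding the 4-tuple case requires no per-feature dispatch code at the call sites.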
@@ -81,13 +81,18 @@ pub(super) mod featureflags {
 }
 
 struct CpuidSummary {
-    is_intel: bool,
+    model: Model,
     leaf1_ecx: u32,
     extended_features_ecx: u32,
     extended_features_ebx: u32,
     xcr0: u64,
 }
 
+enum Model {
+    Intel { family6_model: Option<u32> },
+    Other,
+}
+
 // SAFETY: This unconditionally uses CPUID because we don't have a good
 // way to detect CPUID and because we don't know of a CPU that supports
 // SSE2 (that we currently statically require) but doesn't support
@@ -106,14 +111,39 @@ unsafe fn cpuid_all() -> CpuidSummary {
     let is_intel = (leaf0.ebx == 0x756e6547)
         && (leaf0.edx == 0x49656e69)
        && (leaf0.ecx == 0x6c65746e);
 
-    let leaf1_ecx = if leaf0.eax >= 1 {
-        // SAFETY: `leaf0.eax >= 1` indicates leaf 1 is available.
+    let (model, leaf1_ecx) = if leaf0.eax >= 1 {
+        // SAFETY: `leaf0.eax >= 1` indicates leaf 1 is available.
         let leaf1 = unsafe { arch::__cpuid(1) };
-        leaf1.ecx
+        let model = (leaf1.eax >> 4) & 0x0f;
+        let family_id = (leaf1.eax >> 8) & 0x0f;
+        let model = if family_id == 6 || family_id == 15 {
+            let model_extended = (leaf1.eax >> 16) & 0x0f;
+            (model_extended << 4) | model
+        } else {
+            model
+        };
+
+        let model = match (is_intel, family_id) {
+            (true, 6) => Model::Intel {
+                family6_model: Some(model),
+            },
+            (true, _) => Model::Intel {
+                family6_model: None,
+            },
+            (false, _) => Model::Other,
+        };
+        (model, leaf1.ecx)
     } else {
         // Expected to be unreachable on any environment we currently
         // support.
-        0
+        let model = if is_intel {
+            Model::Intel {
+                family6_model: None,
+            }
+        } else {
+            Model::Other
+        };
+        (model, 0)
     };
 
     let (extended_features_ecx, extended_features_ebx) = if leaf0.eax >= 7 {
@@ -131,7 +161,7 @@ unsafe fn cpuid_all() -> CpuidSummary {
     };
 
     CpuidSummary {
-        is_intel,
+        model,
         leaf1_ecx,
         extended_features_ecx,
         extended_features_ebx,
@@ -143,7 +173,7 @@ fn cpuid_to_caps_and_set_c_flags(r: CpuidSummary) -> u32 {
     use core::{mem::align_of, sync::atomic::AtomicU32};
 
     let CpuidSummary {
-        is_intel,
+        model,
         leaf1_ecx,
         extended_features_ecx,
         extended_features_ebx,
@@ -218,6 +248,58 @@ fn cpuid_to_caps_and_set_c_flags(r: CpuidSummary) -> u32 {
         // calling into the C code.
         let flag = unsafe { &avx2_available };
         flag.store(1, core::sync::atomic::Ordering::Relaxed);
+
+        // Intel: The OS isn't allowed to enable ZMM state w/o enabling YMM/XMM
+        // state.
+        let os_supports_zmm_ymm_xmm =
+            check(r.xcr0, 7) && check(r.xcr0, 6) && check(r.xcr0, 5) && os_supports_ymm_xmm;
+
+        // Intel: "15.2 DETECTION OF AVX-512 FOUNDATION INSTRUCTIONS".
+        let f = os_supports_zmm_ymm_xmm && check(extended_features_ebx, 16);
+
+        // Intel: "15.3 DETECTION OF 512-BIT INSTRUCTION GROUPS OF THE INTEL
+        // AVX-512 FAMILY"
+        let bw = os_supports_zmm_ymm_xmm && check(extended_features_ebx, 30);
+
+        // Intel: "15.4 DETECTION OF INTEL AVX-512 INSTRUCTION GROUPS OPERATING
+        // AT 256 AND 128-BIT VECTOR LENGTHS".
+        let vl = os_supports_zmm_ymm_xmm && check(extended_features_ebx, 31);
+
+        // Initial releases of macOS 12 had a serious bug w.r.t. AVX-512
+        // support; see https://go-review.googlesource.com/c/sys/+/620256.
+        // Given that, plus Apple's transition to ARM, AVX-512 isn't worth
+        // supporting for their targets.
+        let is_darwin = cfg!(any(target_os = "macos", target_vendor = "apple"));
+
+        // Linux 'intel.c': `zmm_exclusion_list`/`X86_FEATURE_PREFER_YMM`.
+        // LLVM `X86.td`: `SKXTuning`/`TGLTuning`/`ICLTuning`.
+        // (family_id == 6 implies family_id != 15, so the extended family
+        // ID is not relevant.)
+        let avoid_zmm = match model {
+            Model::Intel {
+                family6_model: Some(model),
+            } => {
+                matches!(
+                    model,
+                    0x55 // Skylake X
+                        | 0x6a // Ice Lake X
+                        | 0x6c // Ice Lake D
+                        | 0x7d // Ice Lake
+                        | 0x7e // Ice Lake L
+                        | 0x9d // Ice Lake NNPI
+                        | 0x8c // Tiger Lake L
+                        | 0x8d // Tiger Lake
+                )
+            }
+            Model::Intel {
+                family6_model: None,
+            } => false,
+            Model::Other => false,
+        };
+
+        if f && bw && vl && !avoid_zmm && !is_darwin {
+            set(&mut caps, Shift::Avx512_BW_VL_ZMM);
+        }
     }
 
     // Intel: "12.13.4 Checking for Intel AES-NI Support"
@@ -265,6 +347,10 @@ fn cpuid_to_caps_and_set_c_flags(r: CpuidSummary) -> u32 {
         set(&mut caps, Shift::Sha);
     }
 
+    let is_intel = match model {
+        Model::Intel { .. } => true,
+        Model::Other => false,
+    };
     if is_intel {
         set(&mut caps, Shift::IntelCpu);
     }
@@ -326,6 +412,8 @@ fn check<T: BitAnd<Output = T> + Copy + Eq + From<u8> + Shl<u32, Output = T>>(
 }
 
 impl_get_feature! {
+    Avx512_BW_VL_ZMM,
+
     VAesClmul,
     ClMul,
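For reference on the `family6_model` values matched above: CPUID leaf-1 EAX encodes stepping, model, family, extended model, and extended family, and the extended-model nibble extends the 4-bit model number only when the base family ID is 6 or 15 (Intel SDM, CPUID leaf 01H). Below is a standalone sketch of the same decoding; the helper name `family6_model` is hypothetical and not part of the patch.

fn family6_model(leaf1_eax: u32) -> Option<u32> {
    let family_id = (leaf1_eax >> 8) & 0x0f;
    let model = (leaf1_eax >> 4) & 0x0f;
    // The extended model bits only apply when the base family ID is 6 or 15.
    let model = if family_id == 6 || family_id == 15 {
        let model_extended = (leaf1_eax >> 16) & 0x0f;
        (model_extended << 4) | model
    } else {
        model
    };
    (family_id == 6).then_some(model)
}

fn main() {
    // EAX = 0x806C1 is a Tiger Lake L part: family 6, model 0x8c, stepping 1.
    assert_eq!(family6_model(0x000806C1), Some(0x8c));
}

Model 0x8c is in the `avoid_zmm` list, so such a CPU does not get `Avx512_BW_VL_ZMM` set and keeps using the AVX2 kernels even though it implements AVX-512.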