Commit e4d1361

aes-gcm: Enable AVX-512 implementation.

Parent: 89a67b4
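
In short: this change turns on the VAES + VPCLMULQDQ AVX-512 AES-GCM kernels for x86_64 targets, except Apple ones, which the `cfg` guards below exclude. The generated assembly file is added to the package and build lists, the new entry points are registered for symbol prefixing, the variable-length `gcm_ghash_vpclmulqdq_avx512` helper is replaced by a single-block `gcm_ghash_vpclmulqdq_avx512_16`, and a new `vaesclmulavx512` glue module provides the Rust wrappers, selected ahead of the AVX2 implementation when the CPU supports it.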

File tree: 9 files changed, +303 −142 lines

Cargo.toml (2 additions, 1 deletion)

@@ -49,9 +49,10 @@ include = [
     "crypto/curve25519/curve25519_64_adx.c",
     "crypto/curve25519/curve25519_tables.h",
     "crypto/curve25519/internal.h",
-    "crypto/fipsmodule/aes/asm/aesni-x86.pl",
     "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl",
+    "crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl",
     "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl",
+    "crypto/fipsmodule/aes/asm/aesni-x86.pl",
     "crypto/fipsmodule/aes/asm/aesni-x86_64.pl",
     "crypto/fipsmodule/aes/asm/aesv8-armx.pl",
     "crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl",

build.rs (5 additions, 0 deletions)

@@ -113,6 +113,7 @@ const RING_SRCS: &[(&[&str], &str)] = &[
     (&[X86_64], "crypto/chacha/asm/chacha-x86_64.pl"),
     (&[X86_64], "crypto/curve25519/curve25519_64_adx.c"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl"),
+    (&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/ghash-x86_64.pl"),

@@ -912,8 +913,10 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
     "LIMBS_window5_unsplit_window",
     "aes_gcm_dec_kernel",
     "aes_gcm_dec_update_vaes_avx2",
+    "aes_gcm_dec_update_vaes_avx512",
     "aes_gcm_enc_kernel",
     "aes_gcm_enc_update_vaes_avx2",
+    "aes_gcm_enc_update_vaes_avx512",
     "aes_hw_ctr32_encrypt_blocks",
     "aes_hw_set_encrypt_key",
     "aes_hw_set_encrypt_key_alt",

@@ -969,6 +972,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
     "gcm_ghash_clmul",
     "gcm_ghash_neon",
     "gcm_ghash_vpclmulqdq_avx2_16",
+    "gcm_ghash_vpclmulqdq_avx512_16",
     "gcm_gmult_clmul",
     "gcm_gmult_neon",
     "gcm_gmult_v8",

@@ -977,6 +981,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
     "gcm_init_neon",
     "gcm_init_v8",
     "gcm_init_vpclmulqdq_avx2",
+    "gcm_init_vpclmulqdq_avx512",
     "k25519Precomp",
     "limbs_mul_add_limb",
     "little_endian_bytes_from_scalar",

crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl (9 additions, 129 deletions)

@@ -588,18 +588,24 @@ sub _ghash_4x {
     return $code;
 }

-# void gcm_gmult_vpclmulqdq_avx512(uint8_t Xi[16], const u128 Htable[16]);
-$code .= _begin_func "gcm_gmult_vpclmulqdq_avx512", 1;
+# void gcm_ghash_vpclmulqdq_avx512_16(uint8_t Xi[16], const u128 Htable[16],
+#                                     const uint8_t aad[16], size_t aad_len_16);
+$code .= _begin_func "gcm_ghash_vpclmulqdq_avx512_16", 1;
 {
-    my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ];
+    my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AAD_LEN_16 ) = @argregs[ 0 .. 3 ];
     my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
       map( "%xmm$_", ( 0 .. 6 ) );

     $code .= <<___;
     @{[ _save_xmmregs (6) ]}
     .seh_endprologue

+    # Load the GHASH accumulator.
     vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC
+
+    # XOR the AAD into the accumulator.
+    vpxor           ($AAD), $GHASH_ACC, $GHASH_ACC
+
     vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK
     vmovdqu         $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1
     vmovdqu         .Lgfpoly(%rip), $GFPOLY

@@ -615,127 +621,6 @@ sub _ghash_4x {
 }
 $code .= _end_func;

-# void gcm_ghash_vpclmulqdq_avx512(uint8_t Xi[16], const u128 Htable[16],
-#                                  const uint8_t *in, size_t len);
-#
-# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given
-# by |in| and |len|. |len| must be a multiple of 16.
-#
-# This function handles large amounts of AAD efficiently, while also keeping the
-# overhead low for small amounts of AAD which is the common case. TLS uses less
-# than one block of AAD, but (uncommonly) other use cases may use much more.
-$code .= _begin_func "gcm_ghash_vpclmulqdq_avx512", 1;
-{
-    # Function arguments
-    my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];
-
-    # Additional local variables
-    my ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( "%zmm0", "%xmm0" );
-    my ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( "%zmm1", "%xmm1" );
-    my ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( "%zmm2", "%xmm2" );
-    my ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( "%zmm3", "%xmm3" );
-    my @GHASHDATA = ( $GHASHDATA0, $GHASHDATA1, $GHASHDATA2, $GHASHDATA3 );
-    my @GHASHDATA_XMM =
-      ( $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM, $GHASHDATA3_XMM );
-    my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%zmm4", "%xmm4" );
-    my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%zmm5", "%xmm5" );
-    my ( $H_POW4, $H_POW3, $H_POW2 ) = ( "%zmm6", "%zmm7", "%zmm8" );
-    my ( $H_POW1, $H_POW1_XMM ) = ( "%zmm9", "%xmm9" );
-    my ( $GFPOLY, $GFPOLY_XMM ) = ( "%zmm10", "%xmm10" );
-    my ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) =
-      ( "%zmm11", "%zmm12", "%zmm13" );
-
-    $code .= <<___;
-    @{[ _save_xmmregs (6 .. 13) ]}
-    .seh_endprologue
-
-    # Load the bswap_mask and gfpoly constants. Since AADLEN is usually small,
-    # usually only 128-bit vectors will be used. So as an optimization, don't
-    # broadcast these constants to all 128-bit lanes quite yet.
-    vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK_XMM
-    vmovdqu         .Lgfpoly(%rip), $GFPOLY_XMM
-
-    # Load the GHASH accumulator.
-    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
-    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-
-    # Optimize for AADLEN < 64 by checking for AADLEN < 64 before AADLEN < 256.
-    cmp             \$64, $AADLEN
-    jb              .Laad_blockbyblock
-
-    # AADLEN >= 64, so we'll operate on full vectors. Broadcast bswap_mask and
-    # gfpoly to all 128-bit lanes.
-    vshufi64x2      \$0, $BSWAP_MASK, $BSWAP_MASK, $BSWAP_MASK
-    vshufi64x2      \$0, $GFPOLY, $GFPOLY, $GFPOLY
-
-    # Load the lowest set of key powers.
-    vmovdqu8        $OFFSETOFEND_H_POWERS-1*64($HTABLE), $H_POW1
-
-    cmp             \$256, $AADLEN
-    jb              .Laad_loop_1x
-
-    # AADLEN >= 256. Load the higher key powers.
-    vmovdqu8        $OFFSETOFEND_H_POWERS-4*64($HTABLE), $H_POW4
-    vmovdqu8        $OFFSETOFEND_H_POWERS-3*64($HTABLE), $H_POW3
-    vmovdqu8        $OFFSETOFEND_H_POWERS-2*64($HTABLE), $H_POW2
-
-    # Update GHASH with 256 bytes of AAD at a time.
-.Laad_loop_4x:
-    vmovdqu8        0*64($AAD), $GHASHDATA0
-    vmovdqu8        1*64($AAD), $GHASHDATA1
-    vmovdqu8        2*64($AAD), $GHASHDATA2
-    vmovdqu8        3*64($AAD), $GHASHDATA3
-    @{[ _ghash_4x   $BSWAP_MASK, @GHASHDATA, @GHASHDATA_XMM, $H_POW4, $H_POW3,
-                    $H_POW2, $H_POW1, $GFPOLY, $GHASHTMP0, $GHASHTMP1,
-                    $GHASHTMP2, $GHASH_ACC, $GHASH_ACC_XMM ]}
-    add             \$256, $AAD
-    sub             \$256, $AADLEN
-    cmp             \$256, $AADLEN
-    jae             .Laad_loop_4x
-
-    # Update GHASH with 64 bytes of AAD at a time.
-    cmp             \$64, $AADLEN
-    jb              .Laad_large_done
-.Laad_loop_1x:
-    vmovdqu8        ($AAD), $GHASHDATA0
-    vpshufb         $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
-    vpxord          $GHASHDATA0, $GHASH_ACC, $GHASH_ACC
-    @{[ _ghash_mul  $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY,
-                    $GHASHDATA0, $GHASHDATA1, $GHASHDATA2 ]}
-    @{[ _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
-                    $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-    add             \$64, $AAD
-    sub             \$64, $AADLEN
-    cmp             \$64, $AADLEN
-    jae             .Laad_loop_1x
-
-.Laad_large_done:
-
-    # GHASH the remaining data 16 bytes at a time, using xmm registers only.
-.Laad_blockbyblock:
-    test            $AADLEN, $AADLEN
-    jz              .Laad_done
-    vmovdqu         $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1_XMM
-.Laad_loop_blockbyblock:
-    vmovdqu         ($AAD), $GHASHDATA0_XMM
-    vpshufb         $BSWAP_MASK_XMM, $GHASHDATA0_XMM, $GHASHDATA0_XMM
-    vpxor           $GHASHDATA0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-    @{[ _ghash_mul  $H_POW1_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
-                    $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-    add             \$16, $AAD
-    sub             \$16, $AADLEN
-    jnz             .Laad_loop_blockbyblock
-
-.Laad_done:
-    # Store the updated GHASH accumulator back to memory.
-    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-    vmovdqu         $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
-
-    vzeroupper      # This is needed after using ymm or zmm registers.
-___
-}
-$code .= _end_func;

 # Do one non-last round of AES encryption on the counter blocks in aesdata[0-3]
 # using the round key that has been broadcast to all 128-bit lanes of round_key.
 sub _vaesenc_4x {

@@ -1292,11 +1177,6 @@ sub filter_and_print {
         my $postspace = $+{postspace};
         if (exists $asmMap{$trimmed}) {
             $line = ${prespace} . $asmMap{$trimmed} . ${postspace};
-        } else {
-            if($trimmed =~ /(vpclmulqdq|vaes).*%[yz]mm/) {
-                die ("found instruction not supported under old binutils, please update asmMap with the results of running\n" .
-                     'find target -name "*aes-gcm-avx512*.o" -exec python3 crypto/fipsmodule/aes/asm/make-avx-map-for-old-binutils.py \{\} \; | LC_ALL=C sort | uniq');
-            }
         }
     }
     print $line,"\n";
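
Net effect of this file's changes: the general variable-length GHASH AAD routine (`gcm_ghash_vpclmulqdq_avx512`), with its 256-byte, 64-byte, and block-by-block loops, is removed; the exported routine is now `gcm_ghash_vpclmulqdq_avx512_16`, which folds a single 16-byte AAD block into the accumulator, computing Xi ← (Xi ⊕ aad) · H in GF(2^128). A scalar Rust sketch of that operation, using the textbook bit-at-a-time multiply (NIST SP 800-38D, Algorithm 1) where the assembly uses VPCLMULQDQ:

    /// GF(2^128) multiply in GHASH bit order (NIST SP 800-38D, Algorithm 1).
    /// Blocks are big-endian; R is the reduction constant 0xE1 || 0^120.
    fn gf_mul(x: [u8; 16], y: [u8; 16]) -> [u8; 16] {
        const R: u128 = 0xe1 << 120;
        let x = u128::from_be_bytes(x);
        let mut v = u128::from_be_bytes(y);
        let mut z: u128 = 0;
        for i in 0..128 {
            if (x >> (127 - i)) & 1 == 1 {
                z ^= v; // Accumulate Y * x^i for each set coefficient of X.
            }
            // Multiply V by x, reducing modulo x^128 + x^7 + x^2 + x + 1.
            v = if v & 1 == 1 { (v >> 1) ^ R } else { v >> 1 };
        }
        z.to_be_bytes()
    }

    /// What the new single-block routine computes: Xi <- (Xi ^ aad) * H.
    fn ghash_update_16(xi: &mut [u8; 16], h: [u8; 16], aad: [u8; 16]) {
        for (a, b) in xi.iter_mut().zip(aad.iter()) {
            *a ^= *b;
        }
        *xi = gf_mul(*xi, h);
    }

    fn main() {
        // The multiplicative identity in GHASH bit order is 0x80 00...00.
        let mut one = [0u8; 16];
        one[0] = 0x80;
        let x = [0x42u8; 16];
        assert_eq!(gf_mul(x, one), x);

        let mut xi = [0u8; 16];
        ghash_update_16(&mut xi, one, x); // (0 ^ x) * 1 == x
        assert_eq!(xi, x);
    }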

src/aead/aes_gcm.rs (36 additions, 8 deletions)

@@ -35,6 +35,7 @@ use cpu::GetFeature as _;
 mod aarch64;
 mod aeshwclmulmovbe;
 mod vaesclmulavx2;
+mod vaesclmulavx512;

 #[derive(Clone)]
 pub(super) struct Key(DynKey);

@@ -74,6 +75,12 @@
     #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
     AesHwClMul(Combo<aes::hw::Key, gcm::clmul_aarch64::Key>),

+    #[cfg(all(
+        target_arch = "x86_64",
+        not(any(target_os = "macos", target_vendor = "apple"))
+    ))]
+    VAesClMulAvx512(Combo<aes::hw::Key, gcm::vclmulavx512::Key>),
+
     #[cfg(target_arch = "x86_64")]
     VAesClMulAvx2(Combo<aes::hw::Key, gcm::vclmulavx2::Key>),

@@ -114,16 +121,25 @@
         if let Some((aes, gcm)) = cpu.get_feature() {
             let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature());
             let gcm_key_value = derive_gcm_key_value(&aes_key);
-            return if let Some(cpu) = cpu.get_feature() {
+
+            #[cfg(not(any(target_os = "macos", target_vendor = "apple")))]
+            if let Some(cpu) = cpu.get_feature() {
+                let gcm_key = gcm::vclmulavx512::Key::new(gcm_key_value, cpu);
+                return Self::VAesClMulAvx512(Combo { aes_key, gcm_key });
+            }
+
+            if let Some(cpu) = cpu.get_feature() {
                 let gcm_key = gcm::vclmulavx2::Key::new(gcm_key_value, cpu);
-                Self::VAesClMulAvx2(Combo { aes_key, gcm_key })
-            } else if let Some(cpu) = cpu.get_feature() {
+                return Self::VAesClMulAvx2(Combo { aes_key, gcm_key });
+            }
+
+            if let Some(cpu) = cpu.get_feature() {
                 let gcm_key = gcm::clmulavxmovbe::Key::new(gcm_key_value, cpu);
-                Self::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key })
-            } else {
-                let gcm_key = gcm::clmul_x86_64::Key::new(gcm_key_value, gcm);
-                Self::AesHwClMul(Combo { aes_key, gcm_key })
-            };
+                return Self::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key });
+            }
+
+            let gcm_key = gcm::clmul_x86_64::Key::new(gcm_key_value, gcm);
+            return Self::AesHwClMul(Combo { aes_key, gcm_key });
         }

         #[cfg(target_arch = "x86")]

@@ -224,6 +240,12 @@ fn seal(
             seal_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::seal_whole)
         }

+        #[cfg(target_arch = "x86_64")]
+        #[cfg(not(any(target_os = "macos", target_vendor = "apple")))]
+        DynKey::VAesClMulAvx512(c) => {
+            seal_whole_partial(c, aad, in_out, ctr, tag_iv, vaesclmulavx512::seal_whole)
+        }
+
         #[cfg(target_arch = "x86_64")]
         DynKey::VAesClMulAvx2(c) => seal_whole_partial(
             c,

@@ -349,6 +371,12 @@ fn open(
             open_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::open_whole)
         }

+        #[cfg(target_arch = "x86_64")]
+        #[cfg(not(any(target_os = "macos", target_vendor = "apple")))]
+        DynKey::VAesClMulAvx512(c) => {
+            open_whole_partial(c, aad, in_out, ctr, tag_iv, vaesclmulavx512::open_whole)
+        }
+
         #[cfg(target_arch = "x86_64")]
         DynKey::VAesClMulAvx2(c) => open_whole_partial(
             c,
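
The `DynKey::new` refactor above replaces the `if let … else if let … else` expression with a chain of early returns. One reason this shape is needed: a `#[cfg]` attribute can remove a whole statement, such as the AVX-512 probe, but it cannot remove one branch out of the middle of an `if`/`else` expression. A minimal, self-contained sketch of the resulting priority chain, with plain booleans standing in for ring's `cpu.get_feature()` probes:

    // Booleans stand in for CPU-feature probes; variants for the key combos.
    #[derive(Debug)]
    enum Impl {
        VAesClMulAvx512, // preferred when available; compiled out on Apple targets
        VAesClMulAvx2,
        AesHwClMulAvxMovbe,
        AesHwClMul, // baseline AES-NI + CLMUL fallback
    }

    fn select(avx512: bool, avx2: bool, avx_movbe: bool) -> Impl {
        let _ = avx512; // used only when the AVX-512 arm is compiled in
        #[cfg(not(any(target_os = "macos", target_vendor = "apple")))]
        if avx512 {
            return Impl::VAesClMulAvx512;
        }
        if avx2 {
            return Impl::VAesClMulAvx2;
        }
        if avx_movbe {
            return Impl::AesHwClMulAvxMovbe;
        }
        Impl::AesHwClMul
    }

    fn main() {
        // The first supported candidate in priority order wins.
        println!("{:?}", select(true, true, true));
    }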
src/aead/aes_gcm/vaesclmulavx512.rs (new file; 90 additions, 0 deletions)

@@ -0,0 +1,90 @@
+// Copyright 2015-2025 Brian Smith.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+#![cfg(target_arch = "x86_64")]
+#![cfg_attr(any(target_os = "macos", target_vendor = "apple"), allow(dead_code))]
+
+use super::{aes, gcm, Counter, Overlapping, BLOCK_LEN};
+use crate::{c, polyfill::slice::AsChunksMut};
+use core::num::{NonZeroU32, NonZeroUsize};
+
+pub(super) fn seal_whole(
+    aes_key: &aes::hw::Key,
+    auth: &mut gcm::Context<gcm::vclmulavx512::Key>,
+    ctr: &mut Counter,
+    mut in_out: AsChunksMut<u8, BLOCK_LEN>,
+) {
+    prefixed_extern! {
+        fn aes_gcm_enc_update_vaes_avx512(
+            input: *const u8,
+            output: *mut u8,
+            len: c::NonZero_size_t, // TODO? zero OK?
+            key: &aes::hw::Key,
+            ivec: &Counter,
+            Htable: &gcm::vclmulavx512::Key,
+            Xi: &mut gcm::Xi);
+    }
+
+    let in_out = in_out.as_flattened_mut();
+
+    // Precondition: Since we have a `gcm::Context` then the number of blocks
+    // must fit in `u32`.
+    let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();
+
+    if let Some(len) = NonZeroUsize::new(in_out.len()) {
+        let (htable, xi) = auth.inner();
+        let input = in_out.as_ptr();
+        let output = in_out.as_mut_ptr();
+        unsafe { aes_gcm_enc_update_vaes_avx512(input, output, len, aes_key, ctr, htable, xi) };
+        let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
+            unreachable!() // Due to previous checks.
+        });
+        ctr.increment_by_less_safe(blocks);
+    }
+}
+
+pub(super) fn open_whole(
+    aes_key: &aes::hw::Key,
+    auth: &mut gcm::Context<gcm::vclmulavx512::Key>,
+    in_out: Overlapping,
+    ctr: &mut Counter,
+) {
+    prefixed_extern! {
+        fn aes_gcm_dec_update_vaes_avx512(
+            input: *const u8,
+            output: *mut u8,
+            len: c::NonZero_size_t, // TODO? zero OK?
+            key: &aes::hw::Key,
+            ivec: &mut Counter,
+            Htable: &gcm::vclmulavx512::Key,
+            Xi: &mut gcm::Xi);
+    }
+
+    // Precondition. TODO: Create an overlapping::AsChunks for this.
+    assert_eq!(in_out.len() % BLOCK_LEN, 0);
+    // Precondition: Since we have a `gcm::Context` then the number of blocks
+    // must fit in `u32`.
+    let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();
+
+    in_out.with_input_output_len(|input, output, len| {
+        if let Some(len) = NonZeroUsize::new(len) {
+            let (htable, xi) = auth.inner();
+            unsafe { aes_gcm_dec_update_vaes_avx512(input, output, len, aes_key, ctr, htable, xi) };
+            let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
+                unreachable!() // Due to previous checks.
+            });
+            ctr.increment_by_less_safe(blocks);
+        }
+    })
+}
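
A note on the `NonZeroUsize`/`NonZeroU32` handling in both wrappers above: the extern kernels take a `c::NonZero_size_t` length, so a zero-length update must be filtered out on the Rust side, and the counter is advanced only when the kernel actually ran. A stripped-down sketch of the same pattern, with a hypothetical `Counter` standing in for ring's CTR state:

    use core::num::{NonZeroU32, NonZeroUsize};

    const BLOCK_LEN: usize = 16;

    // Hypothetical stand-in for ring's CTR state.
    struct Counter(u32);

    impl Counter {
        fn increment_by_less_safe(&mut self, blocks: NonZeroU32) {
            self.0 = self.0.wrapping_add(blocks.get());
        }
    }

    // The kernel requires len > 0, so encode that in the type and skip both
    // the call and the counter bump for empty input.
    fn update(ctr: &mut Counter, in_out: &mut [u8]) {
        debug_assert_eq!(in_out.len() % BLOCK_LEN, 0);
        if let Some(_len) = NonZeroUsize::new(in_out.len()) {
            // ... the unsafe call into the AVX-512 kernel would go here ...
            let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();
            let blocks = NonZeroU32::new(blocks).unwrap(); // nonzero since len > 0
            ctr.increment_by_less_safe(blocks);
        }
    }

    fn main() {
        let mut ctr = Counter(1);
        let mut two_blocks = [0u8; 32];
        update(&mut ctr, &mut two_blocks); // counter advances by 2
        assert_eq!(ctr.0, 3);
        update(&mut ctr, &mut []); // empty: no kernel call, no advance
        assert_eq!(ctr.0, 3);
    }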
