3 changes: 2 additions & 1 deletion Cargo.toml
@@ -49,9 +49,10 @@ include = [
"crypto/curve25519/curve25519_64_adx.c",
"crypto/curve25519/curve25519_tables.h",
"crypto/curve25519/internal.h",
"crypto/fipsmodule/aes/asm/aesni-x86.pl",
"crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl",
"crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl",
"crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl",
"crypto/fipsmodule/aes/asm/aesni-x86.pl",
"crypto/fipsmodule/aes/asm/aesni-x86_64.pl",
"crypto/fipsmodule/aes/asm/aesv8-armx.pl",
"crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl",
5 changes: 5 additions & 0 deletions build.rs
@@ -113,6 +113,7 @@ const RING_SRCS: &[(&[&str], &str)] = &[
(&[X86_64], "crypto/chacha/asm/chacha-x86_64.pl"),
(&[X86_64], "crypto/curve25519/curve25519_64_adx.c"),
(&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl"),
(&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl"),
(&[X86_64], "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl"),
(&[X86_64], "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"),
(&[X86_64], "crypto/fipsmodule/aes/asm/ghash-x86_64.pl"),
@@ -912,8 +913,10 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
"LIMBS_window5_unsplit_window",
"aes_gcm_dec_kernel",
"aes_gcm_dec_update_vaes_avx2",
"aes_gcm_dec_update_vaes_avx512",
"aes_gcm_enc_kernel",
"aes_gcm_enc_update_vaes_avx2",
"aes_gcm_enc_update_vaes_avx512",
"aes_hw_ctr32_encrypt_blocks",
"aes_hw_set_encrypt_key",
"aes_hw_set_encrypt_key_128",
@@ -971,6 +974,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
"gcm_ghash_clmul",
"gcm_ghash_neon",
"gcm_ghash_vpclmulqdq_avx2_16",
"gcm_ghash_vpclmulqdq_avx512_16",
"gcm_gmult_clmul",
"gcm_gmult_neon",
"gcm_gmult_v8",
@@ -979,6 +983,7 @@
"gcm_init_neon",
"gcm_init_v8",
"gcm_init_vpclmulqdq_avx2",
"gcm_init_vpclmulqdq_avx512",
"k25519Precomp",
"limbs_mul_add_limb",
"little_endian_bytes_from_scalar",
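The symbol list above feeds ring's build-time symbol prefixing: every assembly entry point, including the new `*_avx512` ones, is renamed so that two different versions of ring can be linked into one binary without clashing, and the Rust side refers to the renamed symbols via `prefixed_extern!`. A minimal sketch of the idea, with a purely illustrative prefix (not ring's actual value):

```rust
// Illustrative sketch only: the real prefix and renaming machinery live in
// build.rs (prefix_all_symbols); the prefix below is a placeholder.
fn prefix_symbol(prefix: &str, symbol: &str) -> String {
    format!("{prefix}{symbol}")
}

fn main() {
    let prefix = "ring_core_x_y_z_"; // hypothetical prefix
    let renamed = prefix_symbol(prefix, "aes_gcm_enc_update_vaes_avx512");
    assert_eq!(renamed, "ring_core_x_y_z_aes_gcm_enc_update_vaes_avx512");
    // A symbol missing from the list in build.rs would keep its unprefixed
    // name in the assembly and fail to link against the prefixed name that
    // the Rust-side prefixed_extern! declaration expects.
}
```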
138 changes: 9 additions & 129 deletions crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl
@@ -588,18 +588,24 @@ sub _ghash_4x {
return $code;
}

# void gcm_gmult_vpclmulqdq_avx512(uint8_t Xi[16], const u128 Htable[16]);
$code .= _begin_func "gcm_gmult_vpclmulqdq_avx512", 1;
# void gcm_ghash_vpclmulqdq_avx512_16(uint8_t Xi[16], const u128 Htable[16],
# const uint8_t aad[16], size_t aad_len_16);
$code .= _begin_func "gcm_ghash_vpclmulqdq_avx512_16", 1;
{
my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ];
my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AAD_LEN_16 ) = @argregs[ 0 .. 3 ];
my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
map( "%xmm$_", ( 0 .. 6 ) );

$code .= <<___;
@{[ _save_xmmregs (6) ]}
.seh_endprologue

# Load the GHASH accumulator.
vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC

# XOR the AAD into the accumulator.
vpxor ($AAD), $GHASH_ACC, $GHASH_ACC

vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK
vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1
vmovdqu .Lgfpoly(%rip), $GFPOLY
@@ -615,127 +621,6 @@ sub _ghash_4x {
}
$code .= _end_func;

# void gcm_ghash_vpclmulqdq_avx512(uint8_t Xi[16], const u128 Htable[16],
# const uint8_t *in, size_t len);
#
# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given
# by |in| and |len|. |len| must be a multiple of 16.
#
# This function handles large amounts of AAD efficiently, while also keeping the
# overhead low for small amounts of AAD which is the common case. TLS uses less
# than one block of AAD, but (uncommonly) other use cases may use much more.
$code .= _begin_func "gcm_ghash_vpclmulqdq_avx512", 1;
{
# Function arguments
my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];

# Additional local variables
my ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( "%zmm0", "%xmm0" );
my ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( "%zmm1", "%xmm1" );
my ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( "%zmm2", "%xmm2" );
my ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( "%zmm3", "%xmm3" );
my @GHASHDATA = ( $GHASHDATA0, $GHASHDATA1, $GHASHDATA2, $GHASHDATA3 );
my @GHASHDATA_XMM =
( $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM, $GHASHDATA3_XMM );
my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%zmm4", "%xmm4" );
my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%zmm5", "%xmm5" );
my ( $H_POW4, $H_POW3, $H_POW2 ) = ( "%zmm6", "%zmm7", "%zmm8" );
my ( $H_POW1, $H_POW1_XMM ) = ( "%zmm9", "%xmm9" );
my ( $GFPOLY, $GFPOLY_XMM ) = ( "%zmm10", "%xmm10" );
my ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) =
( "%zmm11", "%zmm12", "%zmm13" );

$code .= <<___;
@{[ _save_xmmregs (6 .. 13) ]}
.seh_endprologue

# Load the bswap_mask and gfpoly constants. Since AADLEN is usually small,
# usually only 128-bit vectors will be used. So as an optimization, don't
# broadcast these constants to all 128-bit lanes quite yet.
vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK_XMM
vmovdqu .Lgfpoly(%rip), $GFPOLY_XMM

# Load the GHASH accumulator.
vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM
vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM

# Optimize for AADLEN < 64 by checking for AADLEN < 64 before AADLEN < 256.
cmp \$64, $AADLEN
jb .Laad_blockbyblock

# AADLEN >= 64, so we'll operate on full vectors. Broadcast bswap_mask and
# gfpoly to all 128-bit lanes.
vshufi64x2 \$0, $BSWAP_MASK, $BSWAP_MASK, $BSWAP_MASK
vshufi64x2 \$0, $GFPOLY, $GFPOLY, $GFPOLY

# Load the lowest set of key powers.
vmovdqu8 $OFFSETOFEND_H_POWERS-1*64($HTABLE), $H_POW1

cmp \$256, $AADLEN
jb .Laad_loop_1x

# AADLEN >= 256. Load the higher key powers.
vmovdqu8 $OFFSETOFEND_H_POWERS-4*64($HTABLE), $H_POW4
vmovdqu8 $OFFSETOFEND_H_POWERS-3*64($HTABLE), $H_POW3
vmovdqu8 $OFFSETOFEND_H_POWERS-2*64($HTABLE), $H_POW2

# Update GHASH with 256 bytes of AAD at a time.
.Laad_loop_4x:
vmovdqu8 0*64($AAD), $GHASHDATA0
vmovdqu8 1*64($AAD), $GHASHDATA1
vmovdqu8 2*64($AAD), $GHASHDATA2
vmovdqu8 3*64($AAD), $GHASHDATA3
@{[ _ghash_4x $BSWAP_MASK, @GHASHDATA, @GHASHDATA_XMM, $H_POW4, $H_POW3,
$H_POW2, $H_POW1, $GFPOLY, $GHASHTMP0, $GHASHTMP1,
$GHASHTMP2, $GHASH_ACC, $GHASH_ACC_XMM ]}
add \$256, $AAD
sub \$256, $AADLEN
cmp \$256, $AADLEN
jae .Laad_loop_4x

# Update GHASH with 64 bytes of AAD at a time.
cmp \$64, $AADLEN
jb .Laad_large_done
.Laad_loop_1x:
vmovdqu8 ($AAD), $GHASHDATA0
vpshufb $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
vpxord $GHASHDATA0, $GHASH_ACC, $GHASH_ACC
@{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY,
$GHASHDATA0, $GHASHDATA1, $GHASHDATA2 ]}
@{[ _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
$GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
add \$64, $AAD
sub \$64, $AADLEN
cmp \$64, $AADLEN
jae .Laad_loop_1x

.Laad_large_done:

# GHASH the remaining data 16 bytes at a time, using xmm registers only.
.Laad_blockbyblock:
test $AADLEN, $AADLEN
jz .Laad_done
vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1_XMM
.Laad_loop_blockbyblock:
vmovdqu ($AAD), $GHASHDATA0_XMM
vpshufb $BSWAP_MASK_XMM, $GHASHDATA0_XMM, $GHASHDATA0_XMM
vpxor $GHASHDATA0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
@{[ _ghash_mul $H_POW1_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
$GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
add \$16, $AAD
sub \$16, $AADLEN
jnz .Laad_loop_blockbyblock

.Laad_done:
# Store the updated GHASH accumulator back to memory.
vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR)

vzeroupper # This is needed after using ymm or zmm registers.
___
}
$code .= _end_func;

# Do one non-last round of AES encryption on the counter blocks in aesdata[0-3]
# using the round key that has been broadcast to all 128-bit lanes of round_key.
sub _vaesenc_4x {
@@ -1292,11 +1177,6 @@ sub filter_and_print {
my $postspace = $+{postspace};
if (exists $asmMap{$trimmed}) {
$line = ${prespace} . $asmMap{$trimmed} . ${postspace};
} else {
if($trimmed =~ /(vpclmulqdq|vaes).*%[yz]mm/) {
die ("found instruction not supported under old binutils, please update asmMap with the results of running\n" .
'find target -name "*aes-gcm-avx512*.o" -exec python3 crypto/fipsmodule/aes/asm/make-avx-map-for-old-binutils.py \{\} \; | LC_ALL=C sort | uniq');
}
}
}
print $line,"\n";
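This file's change removes the variable-length `gcm_ghash_vpclmulqdq_avx512` routine and replaces `gcm_gmult_vpclmulqdq_avx512` with `gcm_ghash_vpclmulqdq_avx512_16`, which folds exactly one 16-byte block of AAD into the accumulator. For reference, the operation performed per block is the standard GHASH update, Xi = (Xi ^ block) * H in GF(2^128). A slow, hedged reference sketch in Rust, bit by bit in GCM's reflected bit order (the real code does this with VPCLMULQDQ):

```rust
// Hedged reference sketch, not the optimized code. Blocks are assumed to have
// been loaded big-endian (u128::from_be_bytes), which is what the
// .Lbswap_mask shuffles accomplish in the assembly.
fn gf128_mul(x: u128, h: u128) -> u128 {
    // GCM's reduction polynomial x^128 + x^7 + x^2 + x + 1, written with the
    // x^0 coefficient in the most-significant bit: 0xE1 in the top byte.
    const R: u128 = 0xE1u128 << 120;
    let mut z = 0u128;
    let mut v = h;
    for i in 0..128 {
        // Bit i of x, where bit 0 is the leftmost (most significant) bit.
        if (x >> (127 - i)) & 1 == 1 {
            z ^= v;
        }
        let carry = v & 1;
        v >>= 1;
        if carry == 1 {
            v ^= R;
        }
    }
    z
}

// What gcm_ghash_vpclmulqdq_avx512_16 computes for its single 16-byte input.
fn ghash_update_one_block(xi: &mut u128, h: u128, block: u128) {
    *xi = gf128_mul(*xi ^ block, h);
}
```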
44 changes: 36 additions & 8 deletions src/aead/aes_gcm.rs
@@ -38,6 +38,7 @@ use cpu::GetFeature as _;
mod aarch64;
mod aeshwclmulmovbe;
mod vaesclmulavx2;
mod vaesclmulavx512;

#[derive(Clone)]
pub(super) struct Key(DynKey);
@@ -77,6 +78,12 @@ enum DynKey {
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
AesHwClMul(Combo<aes::hw::Key, gcm::clmul_aarch64::Key>),

#[cfg(all(
target_arch = "x86_64",
not(any(target_os = "macos", target_vendor = "apple"))
))]
VAesClMulAvx512(Combo<aes::hw::Key, gcm::vclmulavx512::Key>),

#[cfg(target_arch = "x86_64")]
VAesClMulAvx2(Combo<aes::hw::Key, gcm::vclmulavx2::Key>),

@@ -117,16 +124,25 @@ impl DynKey {
if let Some((aes, gcm)) = cpu.get_feature() {
let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature());
let gcm_key_value = derive_gcm_key_value(&aes_key);
return if let Some(cpu) = cpu.get_feature() {

#[cfg(not(any(target_os = "macos", target_vendor = "apple")))]
if let Some(cpu) = cpu.get_feature() {
let gcm_key = gcm::vclmulavx512::Key::new(gcm_key_value, cpu);
return Self::VAesClMulAvx512(Combo { aes_key, gcm_key });
}

if let Some(cpu) = cpu.get_feature() {
let gcm_key = gcm::vclmulavx2::Key::new(gcm_key_value, cpu);
Self::VAesClMulAvx2(Combo { aes_key, gcm_key })
} else if let Some(cpu) = cpu.get_feature() {
return Self::VAesClMulAvx2(Combo { aes_key, gcm_key });
}

if let Some(cpu) = cpu.get_feature() {
let gcm_key = gcm::clmulavxmovbe::Key::new(gcm_key_value, cpu);
Self::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key })
} else {
let gcm_key = gcm::clmul_x86_64::Key::new(gcm_key_value, gcm);
Self::AesHwClMul(Combo { aes_key, gcm_key })
};
return Self::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key });
}

let gcm_key = gcm::clmul_x86_64::Key::new(gcm_key_value, gcm);
return Self::AesHwClMul(Combo { aes_key, gcm_key });
}

#[cfg(target_arch = "x86")]
@@ -227,6 +243,12 @@ fn seal(
seal_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::seal_whole)
}

#[cfg(target_arch = "x86_64")]
#[cfg(not(any(target_os = "macos", target_vendor = "apple")))]
DynKey::VAesClMulAvx512(c) => {
seal_whole_partial(c, aad, in_out, ctr, tag_iv, vaesclmulavx512::seal_whole)
}

#[cfg(target_arch = "x86_64")]
DynKey::VAesClMulAvx2(c) => seal_whole_partial(
c,
@@ -352,6 +374,12 @@ fn open(
open_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::open_whole)
}

#[cfg(target_arch = "x86_64")]
#[cfg(not(any(target_os = "macos", target_vendor = "apple")))]
DynKey::VAesClMulAvx512(c) => {
open_whole_partial(c, aad, in_out, ctr, tag_iv, vaesclmulavx512::open_whole)
}

#[cfg(target_arch = "x86_64")]
DynKey::VAesClMulAvx2(c) => open_whole_partial(
c,
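The key-construction refactor above turns the single `if let ... else if ... else` expression into early returns, so the new AVX-512 arm can be tried first; it is compiled out on Apple targets by the `cfg(not(any(target_os = "macos", target_vendor = "apple")))` gates. The resulting priority order, as a hedged sketch with illustrative types (the real code drives selection through `cpu.get_feature()` and each implementation's `Key::new` signature):

```rust
// Illustrative-only model of the x86_64 dispatch order after this change.
#[derive(Clone, Copy)]
struct Features {
    vaes_avx512: bool, // gated off on Apple targets in the real code
    vaes_avx2: bool,
    clmul_avx_movbe: bool,
}

#[derive(Debug, PartialEq)]
enum GcmImpl {
    VAesClMulAvx512,
    VAesClMulAvx2,
    AesHwClMulAvxMovbe,
    AesHwClMul,
}

fn select(f: Features) -> GcmImpl {
    if f.vaes_avx512 {
        return GcmImpl::VAesClMulAvx512;
    }
    if f.vaes_avx2 {
        return GcmImpl::VAesClMulAvx2;
    }
    if f.clmul_avx_movbe {
        return GcmImpl::AesHwClMulAvxMovbe;
    }
    // Reaching here still implies AES-NI + CLMUL, which this whole path required.
    GcmImpl::AesHwClMul
}
```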
90 changes: 90 additions & 0 deletions src/aead/aes_gcm/vaesclmulavx512.rs
@@ -0,0 +1,90 @@
// Copyright 2015-2025 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

#![cfg(target_arch = "x86_64")]
#![cfg_attr(any(target_os = "macos", target_vendor = "apple"), allow(dead_code))]

use super::{aes, gcm, Counter, Overlapping, BLOCK_LEN};
use crate::{c, polyfill::slice::AsChunksMut};
use core::num::{NonZeroU32, NonZeroUsize};

pub(super) fn seal_whole(
aes_key: &aes::hw::Key,
auth: &mut gcm::Context<gcm::vclmulavx512::Key>,
ctr: &mut Counter,
mut in_out: AsChunksMut<u8, BLOCK_LEN>,
) {
prefixed_extern! {
fn aes_gcm_enc_update_vaes_avx512(
input: *const u8,
output: *mut u8,
len: c::NonZero_size_t, // TODO? zero OK?
key: &aes::hw::Key,
ivec: &Counter,
Htable: &gcm::vclmulavx512::Key,
Xi: &mut gcm::Xi);
}

let in_out = in_out.as_flattened_mut();

// Precondition: Since we have a `gcm::Context` then the number of blocks
// must fit in `u32`.
let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();

if let Some(len) = NonZeroUsize::new(in_out.len()) {
let (htable, xi) = auth.inner();
let input = in_out.as_ptr();
let output = in_out.as_mut_ptr();
unsafe { aes_gcm_enc_update_vaes_avx512(input, output, len, aes_key, ctr, htable, xi) };
let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
unreachable!() // Due to previous checks.
});
ctr.increment_by_less_safe(blocks);
}
}

pub(super) fn open_whole(
aes_key: &aes::hw::Key,
auth: &mut gcm::Context<gcm::vclmulavx512::Key>,
in_out: Overlapping,
ctr: &mut Counter,
) {
prefixed_extern! {
fn aes_gcm_dec_update_vaes_avx512(
input: *const u8,
output: *mut u8,
len: c::NonZero_size_t, // TODO? zero OK?
key: &aes::hw::Key,
ivec: &mut Counter,
Htable: &gcm::vclmulavx512::Key,
Xi: &mut gcm::Xi);
}

// Precondition. TODO: Create an overlapping::AsChunks for this.
assert_eq!(in_out.len() % BLOCK_LEN, 0);
// Precondition: Since we have a `gcm::Context` then the number of blocks
// must fit in `u32`.
let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();

in_out.with_input_output_len(|input, output, len| {
if let Some(len) = NonZeroUsize::new(len) {
let (htable, xi) = auth.inner();
unsafe { aes_gcm_dec_update_vaes_avx512(input, output, len, aes_key, ctr, htable, xi) };
let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
unreachable!() // Due to previous checks.
});
ctr.increment_by_less_safe(blocks);
}
})
}
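Both entry points above accept only whole 16-byte blocks (`AsChunksMut<u8, BLOCK_LEN>` for sealing, and the `len % BLOCK_LEN == 0` assertion for opening); the trailing partial block is handled by the generic `seal_whole_partial` / `open_whole_partial` wrappers seen in the dispatch arms. A hedged sketch of that split, with a hypothetical helper name (not ring's actual code):

```rust
// Hypothetical helper illustrating the whole/partial split the wrappers
// perform before calling seal_whole/open_whole.
const BLOCK_LEN: usize = 16;

fn split_whole_partial(in_out: &mut [u8]) -> (&mut [u8], &mut [u8]) {
    let whole_len = in_out.len() - (in_out.len() % BLOCK_LEN);
    // The first slice goes to the AVX-512 kernel; the second (0..15 bytes)
    // is encrypted with a single separately generated keystream block.
    in_out.split_at_mut(whole_len)
}

fn main() {
    let mut buf = [0u8; 37];
    let (whole, partial) = split_whole_partial(&mut buf);
    assert_eq!(whole.len(), 32);
    assert_eq!(partial.len(), 5);
}
```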