Skip to content

Commit bcf68dd

Browse files
committed
aes_gcm/x86_64: Tweak gcm_ghash_vpclmulqdq_avx2_16.
Instead of starting with the body of the original `gcm_ghash_vpclmulqdq_avx2` and removing the multi-block support, start with `gcm_gmult_vpclmulqdq_avx2` and add the XOR of `aad`. The instruction scheduling seems a bit better. Also, this computes `bswap(Xi ^ aad)` instead of `bswap(Xi) ^ bswap(aad)`, saving one pshufb.

Rename the function to `gcm_ghash_vpclmulqdq_avx2_16` to better reflect its constraint on `aad_len_16`.

This is the diff between this function and BoringSSL's `gcm_gmult_vpclmulqdq_avx2`, as of 14d05a3.

```diff
--- a/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl
@@ -436,10 +436,17 @@ sub _ghash_4x {
     return $code;
 }

-# void gcm_gmult_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16]);
-$code .= _begin_func "gcm_gmult_vpclmulqdq_avx2", 1;
+# void gcm_ghash_vpclmulqdq_avx2_16(uint8_t Xi[16], const u128 Htable[16],
+#                                   const uint8_t aad[16], size_t aad_len_16);
+#
+# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given
+# by |aad| and |aad_len_16|. |aad_len_16| must be exactly 16.
+#
+# This has the same signature `gcm_ghash_vpclmulqdq_avx2` but uses the
+# implementation from `gcm_gmult_vpclmulqdq_avx2`, with the XOR of `aad` added.
+$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2_16", 1;
 {
-    my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ];
+    my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AAD_LEN_16 ) = @argregs[ 0 .. 3 ];
     my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
       map( "%xmm$_", ( 0 .. 6 ) );

@@ -448,6 +455,10 @@ $code .= _begin_func "gcm_gmult_vpclmulqdq_avx2", 1;
     .seh_endprologue

     vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC
+
+    # XOR the AAD into the accumulator.
+    vpxor ($AAD), $GHASH_ACC, $GHASH_ACC
+
     vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK
     vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1
     vmovdqu .Lgfpoly(%rip), $GFPOLY
@@ -463,108 +474,6 @@
 ___
 }
 $code .= _end_func;
```

See the full diff:

```
git difftool 14d05a3 \
    crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl
```
1 parent ec4f5be commit bcf68dd

File tree

3 files changed

+24
-42
lines changed

3 files changed

+24
-42
lines changed

build.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -949,7 +949,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
949949
"gcm_ghash_avx",
950950
"gcm_ghash_clmul",
951951
"gcm_ghash_neon",
952-
"gcm_ghash_vpclmulqdq_avx2_1",
952+
"gcm_ghash_vpclmulqdq_avx2_16",
953953
"gcm_gmult_clmul",
954954
"gcm_gmult_neon",
955955
"gcm_init_avx",

crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl

Lines changed: 22 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -436,58 +436,40 @@ sub _ghash_4x {
436436
return $code;
437437
}
438438

439-
# void gcm_ghash_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16],
440-
# const uint8_t *in, size_t len);
439+
# void gcm_ghash_vpclmulqdq_avx2_16(uint8_t Xi[16], const u128 Htable[16],
440+
# const uint8_t aad[16], size_t aad_len_16);
441441
#
442442
# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given
443-
# by |in| and |len|. |len| must be exactly 16.
444-
$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2_1", 1;
443+
# by |aad| and |aad_len_16|. |aad_len_16| must be exactly 16.
444+
#
445+
# This has the same signature as `gcm_ghash_vpclmulqdq_avx2` but uses the
446+
# implementation from `gcm_gmult_vpclmulqdq_avx2`, with the XOR of `aad` added.
447+
$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2_16", 1;
445448
{
446-
# Function arguments
447-
my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];
448-
449-
# Additional local variables
450-
my ( $TMP0, $TMP0_XMM ) = ( "%ymm0", "%xmm0" );
451-
my ( $TMP1, $TMP1_XMM ) = ( "%ymm1", "%xmm1" );
452-
my ( $TMP2, $TMP2_XMM ) = ( "%ymm2", "%xmm2" );
453-
my ( $LO, $LO_XMM ) = ( "%ymm3", "%xmm3" );
454-
my ( $MI, $MI_XMM ) = ( "%ymm4", "%xmm4" );
455-
my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%ymm5", "%xmm5" );
456-
my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm6", "%xmm6" );
457-
my ( $GFPOLY, $GFPOLY_XMM ) = ( "%ymm7", "%xmm7" );
458-
my $H_POW2_XORED = "%ymm8";
459-
my $H_POW1_XORED = "%ymm9";
449+
my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AAD_LEN_16 ) = @argregs[ 0 .. 3 ];
450+
my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
451+
map( "%xmm$_", ( 0 .. 6 ) );
460452

461453
$code .= <<___;
462-
@{[ _save_xmmregs (6 .. 9) ]}
454+
@{[ _save_xmmregs (6) ]}
463455
.seh_endprologue
464456
465-
# Load the bswap_mask and gfpoly constants. Since AADLEN is usually small,
466-
# usually only 128-bit vectors will be used. So as an optimization, don't
467-
# broadcast these constants to both 128-bit lanes quite yet.
468-
vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK_XMM
469-
vmovdqu .Lgfpoly(%rip), $GFPOLY_XMM
457+
vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC
470458
471-
# Load the GHASH accumulator.
472-
vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM
473-
vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
459+
# XOR the AAD into the accumulator.
460+
vpxor ($AAD), $GHASH_ACC, $GHASH_ACC
474461
462+
vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK
463+
vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1
464+
vmovdqu .Lgfpoly(%rip), $GFPOLY
465+
vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
475466
476-
# Update GHASH with the remaining 16-byte block if any.
477-
.Lghash_lastblock:
478-
vmovdqu ($AAD), $TMP0_XMM
479-
vpshufb $BSWAP_MASK_XMM, $TMP0_XMM, $TMP0_XMM
480-
vpxor $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
481-
vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $TMP0_XMM
482-
@{[ _ghash_mul $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
483-
$TMP1_XMM, $TMP2_XMM, $LO_XMM ]}
467+
@{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]}
484468
485-
.Lghash_done:
486-
# Store the updated GHASH accumulator back to memory.
487-
vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
488-
vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
469+
vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
470+
vmovdqu $GHASH_ACC, ($GHASH_ACC_PTR)
489471
490-
vzeroupper
472+
# No need for vzeroupper, since only xmm registers were used.
491473
___
492474
}
493475
$code .= _end_func;

src/aead/gcm/vclmulavx2.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,6 @@ impl Key {
4141
impl UpdateBlock for Key {
4242
fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) {
4343
let input: AsChunks<u8, BLOCK_LEN> = (&a).into();
44-
unsafe { ghash!(gcm_ghash_vpclmulqdq_avx2_1, xi, &self.h_table, input) }
44+
unsafe { ghash!(gcm_ghash_vpclmulqdq_avx2_16, xi, &self.h_table, input) }
4545
}
4646
}

0 commit comments

Comments
 (0)