aes_gcm/x86_64: Tweak gcm_ghash_vpclmulqdq_avx2_16.

briansmith · briansmith · commit bcf68dd27a07 · 2025-03-19T17:39:40.000-07:00
Instead of starting with the body of the original `gcm_ghash_vpclmulqdq_avx2` and removing the multi-block support, start with `gcm_gmult_vpclmulqdq_avx2` and add the XOR of `aad`. The instruction scheduling seems a bit better. Also, this computes `bswap(Xi ^ aad)` instead of `bswap(Xi) ^ bswap(aad)`, saving one pshufb. Rename the function to `gcm_ghash_vpclmulqdq_avx2_16` to better reflect its constraint on `aad_len_16`. This is the diff between this function and BoringSSL's `gcm_gmult_vpclmulqdq_avx2`, as of 14d05a3. ```diff --- a/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl +++ b/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl @@ -436,10 +436,17 @@ sub _ghash_4x { return $code; } -# void gcm_gmult_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16]); -$code .= _begin_func "gcm_gmult_vpclmulqdq_avx2", 1; +# void gcm_ghash_vpclmulqdq_avx2_16(uint8_t Xi[16], const u128 Htable[16], +# const uint8_t aad[16], size_t aad_len_16); +# +# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given +# by |aad| and |aad_len_16|. |aad_len_16| must be exactly 16. +# +# This has the same signature `gcm_ghash_vpclmulqdq_avx2` but uses the +# implementation from `gcm_gmult_vpclmulqdq_avx2`, with the XOR of `aad` added. +$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2_16", 1; { - my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ]; + my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AAD_LEN_16 ) = @argregs[ 0 .. 3 ]; my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) = map( "%xmm$_", ( 0 .. 6 ) ); @@ -448,6 +455,10 @@ $code .= _begin_func "gcm_gmult_vpclmulqdq_avx2", 1; .seh_endprologue vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC + + # XOR the AAD into the accumulator. 
+ vpxor ($AAD), $GHASH_ACC, $GHASH_ACC + vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1 vmovdqu .Lgfpoly(%rip), $GFPOLY @@ -463,108 +474,6 @@ ___ } $code .= _end_func; ``` See the full diff: ``` git difftool 14d05a3 \ crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl ```
diff --git a/build.rs b/build.rs
@@ -949,7 +949,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "gcm_ghash_avx",
         "gcm_ghash_clmul",
         "gcm_ghash_neon",
-        "gcm_ghash_vpclmulqdq_avx2_1",
+        "gcm_ghash_vpclmulqdq_avx2_16",
         "gcm_gmult_clmul",
         "gcm_gmult_neon",
         "gcm_init_avx",
diff --git a/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl b/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl
@@ -436,58 +436,40 @@ sub _ghash_4x {
     return $code;
 }
 
-# void gcm_ghash_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16],
-#                                const uint8_t *in, size_t len);
+# void gcm_ghash_vpclmulqdq_avx2_16(uint8_t Xi[16], const u128 Htable[16],
+#                                   const uint8_t aad[16], size_t aad_len_16);
 #
 # Using the key |Htable|, update the GHASH accumulator |Xi| with the data given
-# by |in| and |len|.  |len| must be exactly 16.
-$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2_1", 1;
+# by |aad| and |aad_len_16|. |aad_len_16| must be exactly 16.
+#
+# This has the same signature as `gcm_ghash_vpclmulqdq_avx2` but uses the
+# implementation from `gcm_gmult_vpclmulqdq_avx2`, with the XOR of `aad` added.
+$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2_16", 1;
 {
-    # Function arguments
-    my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];
-
-    # Additional local variables
-    my ( $TMP0,       $TMP0_XMM )       = ( "%ymm0", "%xmm0" );
-    my ( $TMP1,       $TMP1_XMM )       = ( "%ymm1", "%xmm1" );
-    my ( $TMP2,       $TMP2_XMM )       = ( "%ymm2", "%xmm2" );
-    my ( $LO,         $LO_XMM )         = ( "%ymm3", "%xmm3" );
-    my ( $MI,         $MI_XMM )         = ( "%ymm4", "%xmm4" );
-    my ( $GHASH_ACC,  $GHASH_ACC_XMM )  = ( "%ymm5", "%xmm5" );
-    my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm6", "%xmm6" );
-    my ( $GFPOLY,     $GFPOLY_XMM )     = ( "%ymm7", "%xmm7" );
-    my $H_POW2_XORED = "%ymm8";
-    my $H_POW1_XORED = "%ymm9";
+    my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AAD_LEN_16 ) = @argregs[ 0 .. 3 ];
+    my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
+      map( "%xmm$_", ( 0 .. 6 ) );
 
     $code .= <<___;
-    @{[ _save_xmmregs (6 .. 9) ]}
+    @{[ _save_xmmregs (6) ]}
     .seh_endprologue
 
-    # Load the bswap_mask and gfpoly constants.  Since AADLEN is usually small,
-    # usually only 128-bit vectors will be used.  So as an optimization, don't
-    # broadcast these constants to both 128-bit lanes quite yet.
-    vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK_XMM
-    vmovdqu         .Lgfpoly(%rip), $GFPOLY_XMM
+    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC
 
-    # Load the GHASH accumulator.
-    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
-    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+    # XOR the AAD into the accumulator.
+    vpxor           ($AAD), $GHASH_ACC, $GHASH_ACC
 
+    vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK
+    vmovdqu         $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1
+    vmovdqu         .Lgfpoly(%rip), $GFPOLY
+    vpshufb         $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
 
-    # Update GHASH with the remaining 16-byte block if any.
-.Lghash_lastblock:
-    vmovdqu         ($AAD), $TMP0_XMM
-    vpshufb         $BSWAP_MASK_XMM, $TMP0_XMM, $TMP0_XMM
-    vpxor           $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-    vmovdqu         $OFFSETOFEND_H_POWERS-16($HTABLE), $TMP0_XMM
-    @{[ _ghash_mul  $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
-                    $TMP1_XMM, $TMP2_XMM, $LO_XMM ]}
+    @{[ _ghash_mul  $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]}
 
-.Lghash_done:
-    # Store the updated GHASH accumulator back to memory.
-    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-    vmovdqu         $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
+    vpshufb         $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
+    vmovdqu         $GHASH_ACC, ($GHASH_ACC_PTR)
 
-    vzeroupper
+    # No need for vzeroupper, since only xmm registers were used.
 ___
 }
 $code .= _end_func;
diff --git a/src/aead/gcm/vclmulavx2.rs b/src/aead/gcm/vclmulavx2.rs
@@ -41,6 +41,6 @@ impl Key {
 impl UpdateBlock for Key {
     fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) {
         let input: AsChunks<u8, BLOCK_LEN> = (&a).into();
-        unsafe { ghash!(gcm_ghash_vpclmulqdq_avx2_1, xi, &self.h_table, input) }
+        unsafe { ghash!(gcm_ghash_vpclmulqdq_avx2_16, xi, &self.h_table, input) }
     }
 }

Original file line number	Diff line number	Diff line change
`@@ -41,6 +41,6 @@ impl Key {`
`41`	`41`	`impl UpdateBlock for Key {`
`42`	`42`	`fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) {`
`43`	`43`	`let input: AsChunks<u8, BLOCK_LEN> = (&a).into();`
`44`		`- unsafe { ghash!(gcm_ghash_vpclmulqdq_avx2_1, xi, &self.h_table, input) }`
	`44`	`+ unsafe { ghash!(gcm_ghash_vpclmulqdq_avx2_16, xi, &self.h_table, input) }`
`45`	`45`	`}`
`46`	`46`	`}`