@@ -588,18 +588,24 @@ sub _ghash_4x {
     return $code;
 }
 
-# void gcm_gmult_vpclmulqdq_avx512(uint8_t Xi[16], const u128 Htable[16]);
-$code .= _begin_func "gcm_gmult_vpclmulqdq_avx512", 1;
+# void gcm_ghash_vpclmulqdq_avx512_16(uint8_t Xi[16], const u128 Htable[16],
+#                                     const uint8_t aad[16], size_t aad_len_16);
+$code .= _begin_func "gcm_ghash_vpclmulqdq_avx512_16", 1;
 {
-    my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ];
+    my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AAD_LEN_16 ) = @argregs[ 0 .. 3 ];
     my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
       map( "%xmm$_", ( 0 .. 6 ) );
 
     $code .= <<___;
     @{[ _save_xmmregs (6) ]}
     .seh_endprologue
 
+    # Load the GHASH accumulator.
     vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC
+
+    # XOR the AAD into the accumulator.
+    vpxor           ($AAD), $GHASH_ACC, $GHASH_ACC
+
     vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK
     vmovdqu         $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1
     vmovdqu         .Lgfpoly(%rip), $GFPOLY
@@ -615,127 +621,6 @@ sub _ghash_4x {
 }
 $code .= _end_func;
 
-# void gcm_ghash_vpclmulqdq_avx512(uint8_t Xi[16], const u128 Htable[16],
-#                                  const uint8_t *in, size_t len);
-#
-# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given
-# by |in| and |len|.  |len| must be a multiple of 16.
-#
-# This function handles large amounts of AAD efficiently, while also keeping the
-# overhead low for small amounts of AAD which is the common case.  TLS uses less
-# than one block of AAD, but (uncommonly) other use cases may use much more.
-$code .= _begin_func "gcm_ghash_vpclmulqdq_avx512", 1;
-{
-    # Function arguments
-    my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];
-
-    # Additional local variables
-    my ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( "%zmm0", "%xmm0" );
-    my ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( "%zmm1", "%xmm1" );
-    my ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( "%zmm2", "%xmm2" );
-    my ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( "%zmm3", "%xmm3" );
-    my @GHASHDATA = ( $GHASHDATA0, $GHASHDATA1, $GHASHDATA2, $GHASHDATA3 );
-    my @GHASHDATA_XMM =
-      ( $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM, $GHASHDATA3_XMM );
-    my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%zmm4", "%xmm4" );
-    my ( $GHASH_ACC, $GHASH_ACC_XMM )   = ( "%zmm5", "%xmm5" );
-    my ( $H_POW4, $H_POW3, $H_POW2 )    = ( "%zmm6", "%zmm7", "%zmm8" );
-    my ( $H_POW1, $H_POW1_XMM )         = ( "%zmm9", "%xmm9" );
-    my ( $GFPOLY, $GFPOLY_XMM )         = ( "%zmm10", "%xmm10" );
-    my ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) =
-      ( "%zmm11", "%zmm12", "%zmm13" );
-
-    $code .= <<___;
-    @{[ _save_xmmregs (6 .. 13) ]}
-    .seh_endprologue
-
-    # Load the bswap_mask and gfpoly constants.  Since AADLEN is usually small,
-    # usually only 128-bit vectors will be used.  So as an optimization, don't
-    # broadcast these constants to all 128-bit lanes quite yet.
-    vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK_XMM
-    vmovdqu         .Lgfpoly(%rip), $GFPOLY_XMM
-
-    # Load the GHASH accumulator.
-    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
-    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-
-    # Optimize for AADLEN < 64 by checking for AADLEN < 64 before AADLEN < 256.
-    cmp             \$64, $AADLEN
-    jb              .Laad_blockbyblock
-
-    # AADLEN >= 64, so we'll operate on full vectors.  Broadcast bswap_mask and
-    # gfpoly to all 128-bit lanes.
-    vshufi64x2      \$0, $BSWAP_MASK, $BSWAP_MASK, $BSWAP_MASK
-    vshufi64x2      \$0, $GFPOLY, $GFPOLY, $GFPOLY
-
-    # Load the lowest set of key powers.
-    vmovdqu8        $OFFSETOFEND_H_POWERS-1*64($HTABLE), $H_POW1
-
-    cmp             \$256, $AADLEN
-    jb              .Laad_loop_1x
-
-    # AADLEN >= 256.  Load the higher key powers.
-    vmovdqu8        $OFFSETOFEND_H_POWERS-4*64($HTABLE), $H_POW4
-    vmovdqu8        $OFFSETOFEND_H_POWERS-3*64($HTABLE), $H_POW3
-    vmovdqu8        $OFFSETOFEND_H_POWERS-2*64($HTABLE), $H_POW2
-
-    # Update GHASH with 256 bytes of AAD at a time.
-.Laad_loop_4x:
-    vmovdqu8        0*64($AAD), $GHASHDATA0
-    vmovdqu8        1*64($AAD), $GHASHDATA1
-    vmovdqu8        2*64($AAD), $GHASHDATA2
-    vmovdqu8        3*64($AAD), $GHASHDATA3
-    @{[ _ghash_4x   $BSWAP_MASK, @GHASHDATA, @GHASHDATA_XMM, $H_POW4, $H_POW3,
-                    $H_POW2, $H_POW1, $GFPOLY, $GHASHTMP0, $GHASHTMP1,
-                    $GHASHTMP2, $GHASH_ACC, $GHASH_ACC_XMM ]}
-    add             \$256, $AAD
-    sub             \$256, $AADLEN
-    cmp             \$256, $AADLEN
-    jae             .Laad_loop_4x
-
-    # Update GHASH with 64 bytes of AAD at a time.
-    cmp             \$64, $AADLEN
-    jb              .Laad_large_done
-.Laad_loop_1x:
-    vmovdqu8        ($AAD), $GHASHDATA0
-    vpshufb         $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
-    vpxord          $GHASHDATA0, $GHASH_ACC, $GHASH_ACC
-    @{[ _ghash_mul  $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY,
-                    $GHASHDATA0, $GHASHDATA1, $GHASHDATA2 ]}
-    @{[ _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
-                        $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-    add             \$64, $AAD
-    sub             \$64, $AADLEN
-    cmp             \$64, $AADLEN
-    jae             .Laad_loop_1x
-
-.Laad_large_done:
-
-    # GHASH the remaining data 16 bytes at a time, using xmm registers only.
-.Laad_blockbyblock:
-    test            $AADLEN, $AADLEN
-    jz              .Laad_done
-    vmovdqu         $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1_XMM
-.Laad_loop_blockbyblock:
-    vmovdqu         ($AAD), $GHASHDATA0_XMM
-    vpshufb         $BSWAP_MASK_XMM, $GHASHDATA0_XMM, $GHASHDATA0_XMM
-    vpxor           $GHASHDATA0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-    @{[ _ghash_mul  $H_POW1_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
-                    $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-    add             \$16, $AAD
-    sub             \$16, $AADLEN
-    jnz             .Laad_loop_blockbyblock
-
-.Laad_done:
-    # Store the updated GHASH accumulator back to memory.
-    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-    vmovdqu         $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
-
-    vzeroupper      # This is needed after using ymm or zmm registers.
-___
-}
-$code .= _end_func;
-
 # Do one non-last round of AES encryption on the counter blocks in aesdata[0-3]
 # using the round key that has been broadcast to all 128-bit lanes of round_key.
 sub _vaesenc_4x {
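The deleted gcm_ghash_vpclmulqdq_avx512 tiered its work by AAD length: 256 bytes per iteration through _ghash_4x, then 64-byte vectors, then 16-byte blocks, storing the accumulator once at the end. A caller that still needs to hash a multiple-of-16 AAD length could in principle loop over the retained single-block entry point. A hypothetical sketch (the helper name and the aad_len_16 contract are assumptions, not part of this change):

    /* Hypothetical helper: hash len bytes of AAD (len a multiple of 16)
     * one block at a time via the retained 16-byte entry point. */
    static void ghash_aad_blocks(uint8_t Xi[16], const u128 Htable[16],
                                 const uint8_t *aad, size_t len) {
        for (size_t i = 0; i < len; i += 16) {
            gcm_ghash_vpclmulqdq_avx512_16(Xi, Htable, aad + i, 16);
        }
    }

This trades the removed code's wide-vector batching for simplicity: the 4x loop existed precisely to amortize GHASH work across 256-byte chunks, which a block-at-a-time loop gives up.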
@@ -1292,11 +1177,6 @@ sub filter_and_print {
             my $postspace = $+{postspace};
             if (exists $asmMap{$trimmed}) {
                 $line = ${prespace} . $asmMap{$trimmed} . ${postspace};
-            } else {
-                if ($trimmed =~ /(vpclmulqdq|vaes).*%[yz]mm/) {
-                    die("found instruction not supported under old binutils, please update asmMap with the results of running\n" .
-                        'find target -name "*aes-gcm-avx512*.o" -exec python3 crypto/fipsmodule/aes/asm/make-avx-map-for-old-binutils.py \{\} \; | LC_ALL=C sort | uniq');
-                }
             }
         }
         print $line, "\n";