@@ -848,6 +848,10 @@ fn KTHash(
848848        final_state : ? StateType , // Running TurboSHAKE state for final node 
849849        num_leaves : usize , // Count of leaves processed (after first chunk) 
850850
851+         // SIMD chunk batching 
852+         pending_chunks : [8  *  chunk_size ]u8  align (cache_line_size ), // Buffer for up to 8 chunks 
853+         pending_count : usize , // Number of complete chunks in pending_chunks 
854+ 
851855        /// Initialize a KangarooTwelve hashing context. 
852856        /// The customization string is optional and used for domain separation. 
853857        pub  fn  init (options : Options ) Self  {
@@ -861,9 +865,48 @@ fn KTHash(
861865                .first_chunk  =  null ,
862866                .final_state  =  null ,
863867                .num_leaves  =  0 ,
868+                 .pending_chunks  =  undefined ,
869+                 .pending_count  =  0 ,
864870            };
865871        }
866872
/// Drain every chunk queued in `pending_chunks` into the final node.
///
/// Each queued chunk is hashed as a leaf and the resulting chaining values
/// (CVs) are absorbed into `final_state` in the order the chunks were queued.
/// Chunks are consumed in batches of 8, 4 or 2 through `processLeaves` when
/// both `optimal_vector_len` and the number of queued chunks allow it; any
/// chunk that cannot be part of such a batch is hashed on its own with
/// `Variant.turboSHAKEToBuffer`.
///
/// On return `pending_count` is 0 and `num_leaves` has grown by the number of
/// chunks that were queued. Does nothing when no chunk is queued.
///
/// `final_state` must be non-null whenever `pending_count > 0`; it is unwrapped
/// with `.?`.
fn flushPendingChunks(self: *Self) void {
    const cv_size = Variant.cv_size;

    const queued = self.pending_count;
    if (queued == 0) return;

    // Walk the queue with a cursor instead of shifting the unprocessed tail to
    // the front after every batch: this avoids `@memcpy` on ranges that may
    // overlap and removes the extra copies altogether.
    var done: usize = 0;
    while (done < queued) {
        const left = queued - done;
        const chunks = self.pending_chunks[done * chunk_size ..];

        // Number of chunks consumed in this iteration (0 = no SIMD batch fit).
        var advanced: usize = 0;

        // Try SIMD batches in decreasing size order and take the first that fits.
        inline for ([_]usize{ 8, 4, 2 }) |batch_size| {
            if (optimal_vector_len >= batch_size and left >= batch_size) {
                var leaf_cvs: [batch_size * cv_size]u8 align(cache_line_size) = undefined;
                processLeaves(Variant, batch_size, chunks[0 .. batch_size * chunk_size], &leaf_cvs);
                self.final_state.?.update(&leaf_cvs);
                advanced = batch_size;
                break;
            }
        }

        // Fall back to hashing a single leaf whenever no SIMD batch was taken.
        // Keying this on "no batch taken" rather than on "exactly one chunk
        // left" guarantees that every iteration makes progress, so the loop
        // terminates even when no SIMD width is usable on the target.
        if (advanced == 0) {
            var cv_buffer: [64]u8 = undefined;
            const cv_slice = MultiSliceView.init(chunks[0..chunk_size], &[_]u8{}, &[_]u8{});
            Variant.turboSHAKEToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
            self.final_state.?.update(cv_buffer[0..cv_size]);
            advanced = 1;
        }

        done += advanced;
    }

    self.num_leaves += queued;
    self.pending_count = 0;
}
909+ 
867910        /// Absorb data into the hash state. 
868911        /// Can be called multiple times to incrementally add data. 
869912        pub  fn  update (self : * Self , data : []const  u8 ) void  {
@@ -895,15 +938,21 @@ fn KTHash(
895938                        const  padding  =  [_ ]u8 { 0x03 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00  };
896939                        self .final_state .? .update (& padding );
897940                    } else  {
898-                         // Subsequent chunks - process as leaf and absorb CV 
899-                         const  cv_size  =  Variant .cv_size ;
900-                         var  cv_buffer : [64 ]u8  =  undefined ; // Max CV size 
901-                         const  cv_slice  =  MultiSliceView .init (& self .buffer , &[_ ]u8 {}, &[_ ]u8 {});
902-                         Variant .turboSHAKEToBuffer (& cv_slice , 0x0B , cv_buffer [0.. cv_size ]);
903- 
904-                         // Absorb CV into final state immediately 
905-                         self .final_state .? .update (cv_buffer [0.. cv_size ]);
906-                         self .num_leaves  +=  1 ;
941+                         // Add chunk to pending buffer for SIMD batch processing 
942+                         @memcpy (self .pending_chunks [self .pending_count  *  chunk_size  .. ][0.. chunk_size ], & self .buffer );
943+                         self .pending_count  +=  1 ;
944+ 
945+                         // Flush when we have enough chunks for optimal SIMD batch 
946+                         // Determine best batch size for this architecture 
947+                         const  optimal_batch_size  =  comptime  blk : {
948+                             if  (optimal_vector_len  >=  8 ) break  :blk  8 ;
949+                             if  (optimal_vector_len  >=  4 ) break  :blk  4 ;
950+                             if  (optimal_vector_len  >=  2 ) break  :blk  2 ;
951+                             break  :blk  1 ;
952+                         };
953+                         if  (self .pending_count  >=  optimal_batch_size ) {
954+                             self .flushPendingChunks ();
955+                         }
907956                    }
908957                    self .buffer_len  =  0 ;
909958                }
@@ -931,24 +980,65 @@ fn KTHash(
931980                return ;
932981            }
933982
934-             // Tree mode: we've already absorbed first_chunk + padding + intermediate CVs 
935-             // Now handle remaining buffer data 
936-             const  remaining_with_custom_len  =  self .buffer_len  +  self .customization .len  +  self .custom_len_enc .len ;
983+             // Flush any pending chunks with SIMD 
984+             self .flushPendingChunks ();
985+ 
986+             // Build view over remaining data (buffer + customization + encoding) 
987+             const  remaining_view  =  MultiSliceView .init (
988+                 self .buffer [0.. self .buffer_len ],
989+                 self .customization ,
990+                 self .custom_len_enc .slice (),
991+             );
992+             const  remaining_len  =  remaining_view .totalLen ();
993+ 
937994            var  final_leaves  =  self .num_leaves ;
995+             var  leaf_start : usize  =  0 ;
996+ 
997+             // Tree mode: initialize if not already done (lazy initialization) 
998+             if  (self .final_state  ==  null  and  remaining_len  >  0 ) {
999+                 self .final_state  =  StateType .init (.{});
1000+ 
1001+                 // Absorb first chunk (up to chunk_size bytes from remaining data) 
1002+                 const  first_chunk_len  =  @min (chunk_size , remaining_len );
1003+                 if  (remaining_view .tryGetSlice (0 , first_chunk_len )) | first_chunk |  {
1004+                     // Data is contiguous, use it directly 
1005+                     self .final_state .? .update (first_chunk );
1006+                 } else  {
1007+                     // Data spans boundaries, copy to buffer 
1008+                     var  first_chunk_buf : [chunk_size ]u8  =  undefined ;
1009+                     remaining_view .copyRange (0 , first_chunk_len , first_chunk_buf [0.. first_chunk_len ]);
1010+                     self .final_state .? .update (first_chunk_buf [0.. first_chunk_len ]);
1011+                 }
9381012
939-             if  (remaining_with_custom_len  >  0 ) {
940-                 // Build final leaf data with customization 
941-                 var  final_leaf_buffer : [chunk_size  +  256 ]u8  =  undefined ; // Extra space for customization 
942-                 @memcpy (final_leaf_buffer [0.. self .buffer_len ], self .buffer [0.. self .buffer_len ]);
943-                 @memcpy (final_leaf_buffer [self .buffer_len .. ][0.. self .customization .len ], self .customization );
944-                 @memcpy (final_leaf_buffer [self .buffer_len  +  self .customization .len  .. ][0.. self .custom_len_enc .len ], self .custom_len_enc .slice ());
945- 
946-                 // Generate CV for final leaf and absorb it 
947-                 var  cv_buffer : [64 ]u8  =  undefined ; // Max CV size 
948-                 const  cv_slice  =  MultiSliceView .init (final_leaf_buffer [0.. remaining_with_custom_len ], &[_ ]u8 {}, &[_ ]u8 {});
949-                 Variant .turboSHAKEToBuffer (& cv_slice , 0x0B , cv_buffer [0.. cv_size ]);
1013+                 // Absorb padding (8 bytes: 0x03 followed by 7 zeros) 
1014+                 const  padding  =  [_ ]u8 { 0x03 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00  };
1015+                 self .final_state .? .update (& padding );
1016+ 
1017+                 // Process remaining data as leaves 
1018+                 leaf_start  =  first_chunk_len ;
1019+             }
1020+ 
1021+             // Process all remaining data as leaves (starting from leaf_start) 
1022+             var  offset  =  leaf_start ;
1023+             while  (offset  <  remaining_len ) {
1024+                 const  leaf_end  =  @min (offset  +  chunk_size , remaining_len );
1025+                 const  leaf_size  =  leaf_end  -  offset ;
1026+ 
1027+                 var  cv_buffer : [64 ]u8  =  undefined ;
1028+                 if  (remaining_view .tryGetSlice (offset , leaf_end )) | leaf_data |  {
1029+                     // Data is contiguous, use it directly 
1030+                     const  cv_slice  =  MultiSliceView .init (leaf_data , &[_ ]u8 {}, &[_ ]u8 {});
1031+                     Variant .turboSHAKEToBuffer (& cv_slice , 0x0B , cv_buffer [0.. cv_size ]);
1032+                 } else  {
1033+                     // Data spans boundaries, copy to buffer 
1034+                     var  leaf_buf : [chunk_size ]u8  =  undefined ;
1035+                     remaining_view .copyRange (offset , leaf_end , leaf_buf [0.. leaf_size ]);
1036+                     const  cv_slice  =  MultiSliceView .init (leaf_buf [0.. leaf_size ], &[_ ]u8 {}, &[_ ]u8 {});
1037+                     Variant .turboSHAKEToBuffer (& cv_slice , 0x0B , cv_buffer [0.. cv_size ]);
1038+                 }
9501039                self .final_state .? .update (cv_buffer [0.. cv_size ]);
9511040                final_leaves  +=  1 ;
1041+                 offset  =  leaf_end ;
9521042            }
9531043
9541044            // Absorb right_encode(num_leaves) and terminator 
0 commit comments