@@ -703,53 +703,54 @@ pub(crate) fn scan_flat_mem_ranges(
703703 }
704704}
705705
706+ /// Files with at least this many rows can contribute to the estimation; files with fewer rows are skipped.
706707const SPLIT_ROW_THRESHOLD : u64 = DEFAULT_ROW_GROUP_SIZE as u64 ;
708+ /// Number of series threshold for splitting batches.
707709const NUM_SERIES_THRESHOLD : u64 = 10240 ;
710+ /// Minimum batch size after splitting. The batch size is less than 60 because a series may only have
711+ /// 60 samples per hour.
708712const BATCH_SIZE_THRESHOLD : u64 = 50 ;
709713
710714/// Returns true if splitting flat record batches may improve merge performance.
711715pub ( crate ) fn should_split_flat_batches_for_merge (
712716 stream_ctx : & Arc < StreamContext > ,
713717 range_meta : & RangeMeta ,
714718) -> bool {
715- // Number of files to scan.
716- let mut num_scan_files = 0 ;
719+ // Number of files to split and scan.
720+ let mut num_files_to_split = 0 ;
717721 let mut num_mem_rows = 0 ;
718722 let mut num_mem_series = 0 ;
723+ // Checks each file range, returns early if any range is not splittable.
724+ // For mem ranges, we collect the total number of rows and series because the number of rows in a
725+ // mem range may be too small.
719726 for index in & range_meta. row_group_indices {
720727 if stream_ctx. is_mem_range_index ( * index) {
721728 let memtable = & stream_ctx. input . memtables [ index. index ] ;
722729 // Is mem range
723730 let stats = memtable. stats ( ) ;
724731 num_mem_rows += stats. num_rows ( ) ;
725732 num_mem_series += stats. series_count ( ) ;
726- } else if !stream_ctx. is_file_range_index ( * index) {
727- // Skips non-file and non-mem ranges.
728- } else {
729- assert ! ( stream_ctx. is_file_range_index( * index) ) ;
730-
733+ } else if stream_ctx. is_file_range_index ( * index) {
731734 // This is a file range.
732735 let file_index = index. index - stream_ctx. input . num_memtables ( ) ;
733736 let file = & stream_ctx. input . files [ file_index] ;
734- if file. meta_ref ( ) . num_rows < SPLIT_ROW_THRESHOLD {
735- // If the file doesn't have enough rows, skips it.
736- continue ;
737- }
738- if file. meta_ref ( ) . num_series == 0 {
739- // Number of series is unavailable.
737+ if file. meta_ref ( ) . num_rows < SPLIT_ROW_THRESHOLD || file. meta_ref ( ) . num_series == 0 {
738+ // If the file doesn't have enough rows, or the number of series is unavailable, skips it.
740739 continue ;
741740 }
742741 debug_assert ! ( file. meta_ref( ) . num_rows > 0 ) ;
743742 if !can_split_series ( file. meta_ref ( ) . num_rows , file. meta_ref ( ) . num_series ) {
744- // We don't skip if we find that we can't split batches in a file.
743+ // We can't split batches in a file.
745744 return false ;
746745 } else {
747- num_scan_files += 1 ;
746+ num_files_to_split += 1 ;
748747 }
749748 }
749+ // Skips non-file and non-mem ranges.
750750 }
751751
752- if num_scan_files > 0 {
752+ if num_files_to_split > 0 {
753+ // We mainly consider file ranges because they have enough data for sampling.
753754 true
754755 } else if num_mem_series > 0 && num_mem_rows > 0 {
755756 // If we don't have files to scan, we check whether to split by the memtable.
0 commit comments