Skip to content

Commit c7130cd

Browse files
committed
chore: update comments and control flow
Signed-off-by: evenyag <[email protected]>
1 parent f1491d8 commit c7130cd

File tree

1 file changed

+17
-16
lines changed

1 file changed

+17
-16
lines changed

src/mito2/src/read/scan_util.rs

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -703,53 +703,54 @@ pub(crate) fn scan_flat_mem_ranges(
703703
}
704704
}
705705

706+
/// Files with a row count of at least this threshold can contribute to the estimation.
706707
const SPLIT_ROW_THRESHOLD: u64 = DEFAULT_ROW_GROUP_SIZE as u64;
708+
/// Threshold on the number of series for splitting batches.
707709
const NUM_SERIES_THRESHOLD: u64 = 10240;
710+
/// Minimum batch size after splitting. The batch size is less than 60 because a series may only have
711+
/// 60 samples per hour.
708712
const BATCH_SIZE_THRESHOLD: u64 = 50;
709713

710714
/// Returns true if splitting flat record batches may improve merge performance.
711715
pub(crate) fn should_split_flat_batches_for_merge(
712716
stream_ctx: &Arc<StreamContext>,
713717
range_meta: &RangeMeta,
714718
) -> bool {
715-
// Number of files to scan.
716-
let mut num_scan_files = 0;
719+
// Number of files to split and scan.
720+
let mut num_files_to_split = 0;
717721
let mut num_mem_rows = 0;
718722
let mut num_mem_series = 0;
723+
// Checks each file range, returns early if any range is not splittable.
724+
// For mem ranges, we collect the total number of rows and series because the number of rows in a
725+
// mem range may be too small.
719726
for index in &range_meta.row_group_indices {
720727
if stream_ctx.is_mem_range_index(*index) {
721728
let memtable = &stream_ctx.input.memtables[index.index];
722729
// Is mem range
723730
let stats = memtable.stats();
724731
num_mem_rows += stats.num_rows();
725732
num_mem_series += stats.series_count();
726-
} else if !stream_ctx.is_file_range_index(*index) {
727-
// Skips non-file and non-mem ranges.
728-
} else {
729-
assert!(stream_ctx.is_file_range_index(*index));
730-
733+
} else if stream_ctx.is_file_range_index(*index) {
731734
// This is a file range.
732735
let file_index = index.index - stream_ctx.input.num_memtables();
733736
let file = &stream_ctx.input.files[file_index];
734-
if file.meta_ref().num_rows < SPLIT_ROW_THRESHOLD {
735-
// If the file doesn't have enough rows, skips it.
736-
continue;
737-
}
738-
if file.meta_ref().num_series == 0 {
739-
// Number of series is unavailable.
737+
if file.meta_ref().num_rows < SPLIT_ROW_THRESHOLD || file.meta_ref().num_series == 0 {
738+
// Skips the file if it doesn't have enough rows or its number of series is unavailable.
740739
continue;
741740
}
742741
debug_assert!(file.meta_ref().num_rows > 0);
743742
if !can_split_series(file.meta_ref().num_rows, file.meta_ref().num_series) {
744-
// We don't skip if we find that we can't split batches in a file.
743+
// We can't split batches in a file.
745744
return false;
746745
} else {
747-
num_scan_files += 1;
746+
num_files_to_split += 1;
748747
}
749748
}
749+
// Skips non-file and non-mem ranges.
750750
}
751751

752-
if num_scan_files > 0 {
752+
if num_files_to_split > 0 {
753+
// We mainly consider file ranges because they have enough data for sampling.
753754
true
754755
} else if num_mem_series > 0 && num_mem_rows > 0 {
755756
// If no file range is eligible for splitting, we decide based on the memtable statistics.

0 commit comments

Comments
 (0)