Skip to content

Commit 71d3d29

Browse files
adriangb and claude authored
fix: set distinct_count to Absent when merging statistics (#17385)
When merging statistics, the distinct count cannot be accurately determined from the merged data, so it should be set to Absent rather than attempting to combine the values. Added test to verify distinct_count becomes Absent after merge. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-authored-by: Claude <noreply@anthropic.com>
1 parent 12e6ae4 commit 71d3d29

File tree

1 file changed

+47
-0
lines changed

1 file changed

+47
-0
lines changed

datafusion/common/src/stats.rs

Lines changed: 47 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -538,6 +538,7 @@ impl Statistics {
538538
col_stats.max_value = col_stats.max_value.max(&item_col_stats.max_value);
539539
col_stats.min_value = col_stats.min_value.min(&item_col_stats.min_value);
540540
col_stats.sum_value = col_stats.sum_value.add(&item_col_stats.sum_value);
541+
col_stats.distinct_count = Precision::Absent;
541542
}
542543

543544
Ok(Statistics {
@@ -1152,4 +1153,50 @@ mod tests {
11521153
let e = Statistics::try_merge_iter(&items, &schema).unwrap_err();
11531154
assert_contains!(e.to_string(), "Error during planning: Cannot merge statistics with different number of columns: 0 vs 1");
11541155
}
1156+
1157+
/// Checks that `Statistics::try_merge` resets a column's `distinct_count` to
/// `Precision::Absent`, while `num_rows`, `total_byte_size`, `null_count`,
/// `min_value` and `max_value` are still combined into exact values.
///
/// Both inputs carry an exact distinct count (5 and 7). The merged data's
/// distinct count cannot be determined accurately from those two numbers, so
/// the merge is expected to drop the value rather than combine them.
#[test]
1158+
fn test_try_merge_distinct_count_absent() {
1159+
// First input: 10 rows, 100 bytes, one Int32 column spanning 1..=10 with an
// exact distinct count of 5.
1160+
let stats1 = Statistics::default()
1161+
.with_num_rows(Precision::Exact(10))
1162+
.with_total_byte_size(Precision::Exact(100))
1163+
.add_column_statistics(
1164+
ColumnStatistics::new_unknown()
1165+
.with_null_count(Precision::Exact(0))
1166+
.with_min_value(Precision::Exact(ScalarValue::Int32(Some(1))))
1167+
.with_max_value(Precision::Exact(ScalarValue::Int32(Some(10))))
1168+
.with_distinct_count(Precision::Exact(5)),
1169+
);
1170+
1171+
// Second input: 15 rows, 150 bytes, one Int32 column spanning 5..=20 with an
// exact distinct count of 7. Its [min, max] range overlaps the first input's,
// so the two inputs may share values.
let stats2 = Statistics::default()
1172+
.with_num_rows(Precision::Exact(15))
1173+
.with_total_byte_size(Precision::Exact(150))
1174+
.add_column_statistics(
1175+
ColumnStatistics::new_unknown()
1176+
.with_null_count(Precision::Exact(0))
1177+
.with_min_value(Precision::Exact(ScalarValue::Int32(Some(5))))
1178+
.with_max_value(Precision::Exact(ScalarValue::Int32(Some(20))))
1179+
.with_distinct_count(Precision::Exact(7)),
1180+
);
1181+
1182+
// Merge the second set of statistics into the first
1183+
let merged_stats = stats1.try_merge(&stats2).unwrap();
1184+
1185+
// Table-level totals are summed: 10 + 15 rows, 100 + 150 bytes
1186+
assert_eq!(merged_stats.num_rows, Precision::Exact(25));
1187+
assert_eq!(merged_stats.total_byte_size, Precision::Exact(250));
1188+
1189+
// Column-level checks: null counts add up (0 + 0), min is the smaller of
// the two minimums, max is the larger of the two maximums
let col_stats = &merged_stats.column_statistics[0];
1190+
assert_eq!(col_stats.null_count, Precision::Exact(0));
1191+
assert_eq!(
1192+
col_stats.min_value,
1193+
Precision::Exact(ScalarValue::Int32(Some(1)))
1194+
);
1195+
assert_eq!(
1196+
col_stats.max_value,
1197+
Precision::Exact(ScalarValue::Int32(Some(20)))
1198+
);
1199+
// Distinct count should be Absent after merge, even though both inputs
// supplied an exact value
1200+
assert_eq!(col_stats.distinct_count, Precision::Absent);
1201+
}
11551202
}

0 commit comments

Comments
 (0)