Skip to content

Commit 2d7ae09

Browse files
authored
Set the default value of datafusion.execution.collect_statistics to true (#16447)
* fix sqllogicaltests * Add upgrade note
1 parent 43ba6f2 commit 2d7ae09

File tree

12 files changed

+211
-270
lines changed

12 files changed

+211
-270
lines changed

datafusion/common/src/config.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -294,8 +294,8 @@ config_namespace! {
294294

295295
/// Should DataFusion collect statistics when first creating a table.
296296
/// Has no effect after the table is created. Applies to the default
297-
/// `ListingTableProvider` in DataFusion. Defaults to false.
298-
pub collect_statistics: bool, default = false
297+
/// `ListingTableProvider` in DataFusion. Defaults to true.
298+
pub collect_statistics: bool, default = true
299299

300300
/// Number of partitions for query execution. Increasing partitions can increase
301301
/// concurrency.

datafusion/core/src/execution/context/parquet.rs

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,12 @@ impl SessionContext {
3434
///
3535
/// # Note: Statistics
3636
///
37-
/// NOTE: by default, statistics are not collected when reading the Parquet
38-
/// files as this can slow down the initial DataFrame creation. However,
39-
/// collecting statistics can greatly accelerate queries with certain
40-
/// filters.
37+
/// NOTE: by default, statistics are collected when reading the Parquet
38+
/// files. This can slow down the initial DataFrame creation while
39+
/// greatly accelerating queries with certain filters.
4140
///
42-
/// To enable collect statistics, set the [config option]
43-
/// `datafusion.execution.collect_statistics` to `true`. See
41+
/// To disable statistics collection, set the [config option]
42+
/// `datafusion.execution.collect_statistics` to `false`. See
4443
/// [`ConfigOptions`] and [`ExecutionOptions::collect_statistics`] for more
4544
/// details.
4645
///
@@ -171,28 +170,28 @@ mod tests {
171170

172171
#[tokio::test]
173172
async fn register_parquet_respects_collect_statistics_config() -> Result<()> {
174-
// The default is false
173+
// The default is true
175174
let mut config = SessionConfig::new();
176175
config.options_mut().explain.physical_plan_only = true;
177176
config.options_mut().explain.show_statistics = true;
178177
let content = explain_query_all_with_config(config).await?;
179-
assert_contains!(content, "statistics=[Rows=Absent,");
178+
assert_contains!(content, "statistics=[Rows=Exact(");
180179

181-
// Explicitly set to false
180+
// Explicitly set to true
182181
let mut config = SessionConfig::new();
183182
config.options_mut().explain.physical_plan_only = true;
184183
config.options_mut().explain.show_statistics = true;
185-
config.options_mut().execution.collect_statistics = false;
184+
config.options_mut().execution.collect_statistics = true;
186185
let content = explain_query_all_with_config(config).await?;
187-
assert_contains!(content, "statistics=[Rows=Absent,");
186+
assert_contains!(content, "statistics=[Rows=Exact(");
188187

189-
// Explicitly set to true
188+
// Explicitly set to false
190189
let mut config = SessionConfig::new();
191190
config.options_mut().explain.physical_plan_only = true;
192191
config.options_mut().explain.show_statistics = true;
193-
config.options_mut().execution.collect_statistics = true;
192+
config.options_mut().execution.collect_statistics = false;
194193
let content = explain_query_all_with_config(config).await?;
195-
assert_contains!(content, "statistics=[Rows=Exact(10),");
194+
assert_contains!(content, "statistics=[Rows=Absent,");
196195

197196
Ok(())
198197
}

datafusion/core/tests/parquet/row_group_pruning.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,7 @@ macro_rules! int_tests {
421421
.with_query(&format!("SELECT * FROM t where i{} in (100)", $bits))
422422
.with_expected_errors(Some(0))
423423
.with_matched_by_stats(Some(0))
424-
.with_pruned_by_stats(Some(4))
424+
.with_pruned_by_stats(Some(0))
425425
.with_matched_by_bloom_filter(Some(0))
426426
.with_pruned_by_bloom_filter(Some(0))
427427
.with_expected_rows(0)
@@ -1316,7 +1316,7 @@ async fn test_row_group_with_null_values() {
13161316
.with_query("SELECT * FROM t WHERE \"i32\" > 7")
13171317
.with_expected_errors(Some(0))
13181318
.with_matched_by_stats(Some(0))
1319-
.with_pruned_by_stats(Some(3))
1319+
.with_pruned_by_stats(Some(0))
13201320
.with_expected_rows(0)
13211321
.with_matched_by_bloom_filter(Some(0))
13221322
.with_pruned_by_bloom_filter(Some(0))

datafusion/sqllogictest/test_files/explain_tree.slt

Lines changed: 156 additions & 227 deletions
Large diffs are not rendered by default.

datafusion/sqllogictest/test_files/information_schema.slt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ datafusion.catalog.location NULL
216216
datafusion.catalog.newlines_in_values false
217217
datafusion.execution.batch_size 8192
218218
datafusion.execution.coalesce_batches true
219-
datafusion.execution.collect_statistics false
219+
datafusion.execution.collect_statistics true
220220
datafusion.execution.enable_recursive_ctes true
221221
datafusion.execution.enforce_batch_size_in_joins false
222222
datafusion.execution.keep_partition_by_columns false
@@ -328,7 +328,7 @@ datafusion.catalog.location NULL Location scanned to load tables for `default` s
328328
datafusion.catalog.newlines_in_values false Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance.
329329
datafusion.execution.batch_size 8192 Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption
330330
datafusion.execution.coalesce_batches true When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting
331-
datafusion.execution.collect_statistics false Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to false.
331+
datafusion.execution.collect_statistics true Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true.
332332
datafusion.execution.enable_recursive_ctes true Should DataFusion support recursive CTEs
333333
datafusion.execution.enforce_batch_size_in_joins false Should DataFusion enforce batch size in joins or not. By default, DataFusion will not enforce batch size in joins. Enforcing batch size in joins can reduce memory usage when joining large tables with a highly-selective join filter, but is also slightly slower.
334334
datafusion.execution.keep_partition_by_columns false Should DataFusion keep the columns used for partition_by in the output RecordBatches

datafusion/sqllogictest/test_files/limit.slt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -854,7 +854,7 @@ physical_plan
854854
01)ProjectionExec: expr=[1 as foo]
855855
02)--SortPreservingMergeExec: [part_key@0 ASC NULLS LAST], fetch=1
856856
03)----SortExec: TopK(fetch=1), expr=[part_key@0 ASC NULLS LAST], preserve_partitioning=[true]
857-
04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-0.parquet:0..794], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-1.parquet:0..794], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-2.parquet:0..794]]}, projection=[part_key], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ]
857+
04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-2.parquet]]}, projection=[part_key], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ]
858858

859859
query I
860860
with selection as (

datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -212,10 +212,9 @@ logical_plan
212212
physical_plan
213213
01)CoalesceBatchesExec: target_batch_size=8192
214214
02)--FilterExec: val@0 != part@1
215-
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=3
216-
04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet
215+
03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet
217216

218-
# If we reference only a partition column it gets evaluted during the listing phase
217+
# If we reference only a partition column it gets evaluated during the listing phase
219218
query TT
220219
EXPLAIN select * from t_pushdown where part != 'a';
221220
----
@@ -257,8 +256,7 @@ logical_plan
257256
physical_plan
258257
01)CoalesceBatchesExec: target_batch_size=8192
259258
02)--FilterExec: val@0 = part@1
260-
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
261-
04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet
259+
03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet
262260

263261
query TT
264262
select val, part from t_pushdown where part = 'a' AND part = val;
@@ -274,8 +272,7 @@ logical_plan
274272
physical_plan
275273
01)CoalesceBatchesExec: target_batch_size=8192
276274
02)--FilterExec: val@0 = part@1
277-
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
278-
04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet
275+
03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet
279276

280277
query TT
281278
select val, part from t_pushdown where part = val AND part = 'a';

datafusion/sqllogictest/test_files/parquet_statistics.slt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ statement ok
4646
set datafusion.explain.show_statistics = true;
4747

4848
######
49-
# By default, the statistics are not gathered
49+
# By default, the statistics are gathered
5050
######
5151

5252
# Recreate the table to pick up the current setting
@@ -59,18 +59,18 @@ query TT
5959
EXPLAIN SELECT * FROM test_table WHERE column1 = 1;
6060
----
6161
physical_plan
62-
01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
63-
02)--FilterExec: column1@0 = 1, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)))]]
64-
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
62+
01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]]
63+
02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]]
64+
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]]
6565
04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)]
66-
05), statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
66+
05), statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]]
6767

6868
# cleanup
6969
statement ok
7070
DROP TABLE test_table;
7171

7272
######
73-
# When the setting is true, the statistics are gathered
73+
# When the setting is true, statistics are gathered
7474
######
7575

7676
statement ok

datafusion/sqllogictest/test_files/repartition.slt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ physical_plan
4646
01)AggregateExec: mode=FinalPartitioned, gby=[column1@0 as column1], aggr=[sum(parquet_table.column2)]
4747
02)--CoalesceBatchesExec: target_batch_size=8192
4848
03)----RepartitionExec: partitioning=Hash([column1@0], 4), input_partitions=4
49-
04)------AggregateExec: mode=Partial, gby=[column1@0 as column1], aggr=[sum(parquet_table.column2)]
50-
05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
49+
04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
50+
05)--------AggregateExec: mode=Partial, gby=[column1@0 as column1], aggr=[sum(parquet_table.column2)]
5151
06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition/parquet_table/2.parquet]]}, projection=[column1, column2], file_type=parquet
5252

5353
# disable round robin repartitioning

docs/source/library-user-guide/upgrading.md

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,23 @@
2121

2222
## DataFusion `49.0.0`
2323

24+
### `datafusion.execution.collect_statistics` now defaults to `true`
25+
26+
The default value of the `datafusion.execution.collect_statistics` configuration
27+
setting is now `true`. This change impacts users who use that value directly and rely
28+
on its default value being `false`.
29+
30+
This change also restores the default behavior of `ListingTable` to its previous value. If you use it directly
31+
you can keep statistics collection disabled by overriding the default value in your code.
32+
33+
```rust
34+
# /* comment to avoid running
35+
ListingOptions::new(Arc::new(ParquetFormat::default()))
36+
.with_collect_stat(false)
37+
// other options
38+
# */
39+
```
40+
2441
### Metadata is now represented by `FieldMetadata`
2542

2643
Metadata from the Arrow `Field` is now stored using the `FieldMetadata`
@@ -139,7 +156,7 @@ match expr {
139156

140157
[details on #16207]: https://github.com/apache/datafusion/pull/16207#issuecomment-2922659103
141158

142-
### The `VARCHAR` SQL type is now represented as `Utf8View` in Arrow.
159+
### The `VARCHAR` SQL type is now represented as `Utf8View` in Arrow
143160

144161
The mapping of the SQL `VARCHAR` type has been changed from `Utf8` to `Utf8View`
145162
which improves performance for many string operations. You can read more about
@@ -291,7 +308,6 @@ Additionally `ObjectStore::list` and `ObjectStore::list_with_offset` have been c
291308

292309
[#6619]: https://github.com/apache/arrow-rs/pull/6619
293310
[#7371]: https://github.com/apache/arrow-rs/pull/7371
294-
[#7328]: https://github.com/apache/arrow-rs/pull/6961
295311

296312
This requires converting from `usize` to `u64` occasionally as well as changes to `ObjectStore` implementations such as
297313

0 commit comments

Comments
 (0)