Commit a13fd45

Change default pushdown_filters and reorder_filters to true

1 parent 6b40175 commit a13fd45

10 files changed: +38 -88 lines changed

datafusion/common/src/config.rs

Lines changed: 2 additions & 2 deletions

@@ -519,12 +519,12 @@ config_namespace! {
 
         /// (reading) If true, filter expressions are be applied during the parquet decoding operation to
         /// reduce the number of rows decoded. This optimization is sometimes called "late materialization".
-        pub pushdown_filters: bool, default = false
+        pub pushdown_filters: bool, default = true
 
         /// (reading) If true, filter expressions evaluated during the parquet decoding operation
         /// will be reordered heuristically to minimize the cost of evaluation. If false,
         /// the filters are applied in the same order as written in the query
-        pub reorder_filters: bool, default = false
+        pub reorder_filters: bool, default = true
 
         /// (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`,
         /// and `Binary/BinaryLarge` with `BinaryView`.
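
With this commit both options default to true; sessions that depended on the old behavior can still opt out. A minimal sketch (not part of this commit) of overriding the new defaults programmatically, assuming the usual SessionConfig / SessionContext APIs:

use datafusion::prelude::*;

fn main() {
    // Start from the default config and restore the pre-commit values.
    let mut config = SessionConfig::new();
    config.options_mut().execution.parquet.pushdown_filters = false;
    config.options_mut().execution.parquet.reorder_filters = false;
    // All queries run through this context use the overridden options.
    let ctx = SessionContext::new_with_config(config);
    let _ = ctx;
}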

datafusion/core/src/dataframe/parquet.rs

Lines changed: 4 additions & 1 deletion

@@ -146,7 +146,10 @@ mod tests {
         let plan = df.explain(false, false)?.collect().await?;
         // Filters all the way to Parquet
         let formatted = pretty::pretty_format_batches(&plan)?.to_string();
-        assert!(formatted.contains("FilterExec: id@0 = 1"));
+        assert!(
+            formatted.contains("projection=[bool_col, int_col], file_type=parquet"),
+            "formated:\n {formatted}"
+        );
 
         Ok(())
     }

datafusion/core/src/datasource/view_test.rs

Lines changed: 4 additions & 1 deletion

@@ -326,7 +326,10 @@ mod tests {
         let formatted = arrow::util::pretty::pretty_format_batches(&plan)
             .unwrap()
             .to_string();
-        assert!(formatted.contains("FilterExec: id@0 = 1"));
+        assert!(
+            formatted.contains("file_type=parquet, predicate=id@0 = 1"),
+            "formatted:\n{formatted}",
+        );
         Ok(())
     }

datafusion/core/tests/sql/explain_analyze.rs

Lines changed: 1 addition & 1 deletion

@@ -720,7 +720,7 @@ async fn parquet_explain_analyze() {
         .to_string();
 
     // should contain aggregated stats
-    assert_contains!(&formatted, "output_rows=8");
+    assert_contains!(&formatted, "output_rows=5");
     assert_contains!(&formatted, "row_groups_matched_bloom_filter=0");
     assert_contains!(&formatted, "row_groups_pruned_bloom_filter=0");
     assert_contains!(&formatted, "row_groups_matched_statistics=1");

datafusion/sqllogictest/test_files/explain_tree.slt

Lines changed: 7 additions & 28 deletions

@@ -605,35 +605,14 @@ explain SELECT int_col FROM table2 WHERE string_col != 'foo';
 ----
 physical_plan
 01)┌───────────────────────────┐
-02)│    CoalesceBatchesExec    │
+02)│       DataSourceExec      │
 03)│    --------------------   │
-04)│     target_batch_size:    │
-05)│           8192            │
-06)└─────────────┬─────────────┘
-07)┌─────────────┴─────────────┐
-08)│         FilterExec        │
-09)│    --------------------   │
-10)│         predicate:        │
-11)│     string_col != foo     │
-12)└─────────────┬─────────────┘
-13)┌─────────────┴─────────────┐
-14)│      RepartitionExec      │
-15)│    --------------------   │
-16)│ partition_count(in->out): │
-17)│          1 -> 4           │
-18)│                           │
-19)│    partitioning_scheme:   │
-20)│     RoundRobinBatch(4)    │
-21)└─────────────┬─────────────┘
-22)┌─────────────┴─────────────┐
-23)│       DataSourceExec      │
-24)│    --------------------   │
-25)│          files: 1         │
-26)│      format: parquet      │
-27)│                           │
-28)│         predicate:        │
-29)│     string_col != foo     │
-30)└───────────────────────────┘
+04)│          files: 1         │
+05)│      format: parquet      │
+06)│                           │
+07)│         predicate:        │
+08)│     string_col != foo     │
+09)└───────────────────────────┘
 
 # Query with filter on memory
 query TT

datafusion/sqllogictest/test_files/information_schema.slt

Lines changed: 4 additions & 4 deletions

@@ -246,8 +246,8 @@ datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2
 datafusion.execution.parquet.maximum_parallel_row_group_writers 1
 datafusion.execution.parquet.metadata_size_hint NULL
 datafusion.execution.parquet.pruning true
-datafusion.execution.parquet.pushdown_filters false
-datafusion.execution.parquet.reorder_filters false
+datafusion.execution.parquet.pushdown_filters true
+datafusion.execution.parquet.reorder_filters true
 datafusion.execution.parquet.schema_force_view_types true
 datafusion.execution.parquet.skip_arrow_metadata false
 datafusion.execution.parquet.skip_metadata true
@@ -357,8 +357,8 @@ datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writi
 datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.
 datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer
 datafusion.execution.parquet.pruning true (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file
-datafusion.execution.parquet.pushdown_filters false (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization".
-datafusion.execution.parquet.reorder_filters false (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query
+datafusion.execution.parquet.pushdown_filters true (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization".
+datafusion.execution.parquet.reorder_filters true (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query
 datafusion.execution.parquet.schema_force_view_types true (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`.
 datafusion.execution.parquet.skip_arrow_metadata false (writing) Skip encoding the embedded arrow metadata in the KV_meta This is analogous to the `ArrowWriterOptions::with_skip_arrow_metadata`. Refer to <https://docs.rs/parquet/53.3.0/parquet/arrow/arrow_writer/struct.ArrowWriterOptions.html#method.with_skip_arrow_metadata>
 datafusion.execution.parquet.skip_metadata true (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata
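
These settings are also adjustable at runtime through SQL, which is what this test file observes. A minimal sketch (not from the commit) that flips the option back for one session and inspects it, assuming the information schema is enabled:

use datafusion::prelude::*;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    // SHOW requires the information schema to be enabled.
    let config = SessionConfig::new().with_information_schema(true);
    let ctx = SessionContext::new_with_config(config);
    // Override the new default for this session only.
    ctx.sql("SET datafusion.execution.parquet.pushdown_filters = false")
        .await?;
    // Confirm the current value, as this .slt test does.
    ctx.sql("SHOW datafusion.execution.parquet.pushdown_filters")
        .await?
        .show()
        .await?;
    Ok(())
}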

datafusion/sqllogictest/test_files/parquet.slt

Lines changed: 4 additions & 20 deletions

@@ -455,11 +455,7 @@ EXPLAIN
 logical_plan
 01)Filter: CAST(binary_as_string_default.binary_col AS Utf8View) LIKE Utf8View("%a%") AND CAST(binary_as_string_default.largebinary_col AS Utf8View) LIKE Utf8View("%a%") AND CAST(binary_as_string_default.binaryview_col AS Utf8View) LIKE Utf8View("%a%")
 02)--TableScan: binary_as_string_default projection=[binary_col, largebinary_col, binaryview_col], partial_filters=[CAST(binary_as_string_default.binary_col AS Utf8View) LIKE Utf8View("%a%"), CAST(binary_as_string_default.largebinary_col AS Utf8View) LIKE Utf8View("%a%"), CAST(binary_as_string_default.binaryview_col AS Utf8View) LIKE Utf8View("%a%")]
-physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: CAST(binary_col@0 AS Utf8View) LIKE %a% AND CAST(largebinary_col@1 AS Utf8View) LIKE %a% AND CAST(binaryview_col@2 AS Utf8View) LIKE %a%
-03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/binary_as_string.parquet]]}, projection=[binary_col, largebinary_col, binaryview_col], file_type=parquet, predicate=CAST(binary_col@0 AS Utf8View) LIKE %a% AND CAST(largebinary_col@1 AS Utf8View) LIKE %a% AND CAST(binaryview_col@2 AS Utf8View) LIKE %a%
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/binary_as_string.parquet]]}, projection=[binary_col, largebinary_col, binaryview_col], file_type=parquet, predicate=CAST(binary_col@0 AS Utf8View) LIKE %a% AND CAST(largebinary_col@1 AS Utf8View) LIKE %a% AND CAST(binaryview_col@2 AS Utf8View) LIKE %a%
 
 
 statement ok
@@ -503,11 +499,7 @@ EXPLAIN
 logical_plan
 01)Filter: binary_as_string_option.binary_col LIKE Utf8View("%a%") AND binary_as_string_option.largebinary_col LIKE Utf8View("%a%") AND binary_as_string_option.binaryview_col LIKE Utf8View("%a%")
 02)--TableScan: binary_as_string_option projection=[binary_col, largebinary_col, binaryview_col], partial_filters=[binary_as_string_option.binary_col LIKE Utf8View("%a%"), binary_as_string_option.largebinary_col LIKE Utf8View("%a%"), binary_as_string_option.binaryview_col LIKE Utf8View("%a%")]
-physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: binary_col@0 LIKE %a% AND largebinary_col@1 LIKE %a% AND binaryview_col@2 LIKE %a%
-03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/binary_as_string.parquet]]}, projection=[binary_col, largebinary_col, binaryview_col], file_type=parquet, predicate=binary_col@0 LIKE %a% AND largebinary_col@1 LIKE %a% AND binaryview_col@2 LIKE %a%
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/binary_as_string.parquet]]}, projection=[binary_col, largebinary_col, binaryview_col], file_type=parquet, predicate=binary_col@0 LIKE %a% AND largebinary_col@1 LIKE %a% AND binaryview_col@2 LIKE %a%
 
 
 statement ok
@@ -554,11 +546,7 @@ EXPLAIN
 logical_plan
 01)Filter: binary_as_string_both.binary_col LIKE Utf8View("%a%") AND binary_as_string_both.largebinary_col LIKE Utf8View("%a%") AND binary_as_string_both.binaryview_col LIKE Utf8View("%a%")
 02)--TableScan: binary_as_string_both projection=[binary_col, largebinary_col, binaryview_col], partial_filters=[binary_as_string_both.binary_col LIKE Utf8View("%a%"), binary_as_string_both.largebinary_col LIKE Utf8View("%a%"), binary_as_string_both.binaryview_col LIKE Utf8View("%a%")]
-physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: binary_col@0 LIKE %a% AND largebinary_col@1 LIKE %a% AND binaryview_col@2 LIKE %a%
-03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/binary_as_string.parquet]]}, projection=[binary_col, largebinary_col, binaryview_col], file_type=parquet, predicate=binary_col@0 LIKE %a% AND largebinary_col@1 LIKE %a% AND binaryview_col@2 LIKE %a%
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/binary_as_string.parquet]]}, projection=[binary_col, largebinary_col, binaryview_col], file_type=parquet, predicate=binary_col@0 LIKE %a% AND largebinary_col@1 LIKE %a% AND binaryview_col@2 LIKE %a%
 
 
 statement ok
@@ -669,11 +657,7 @@ explain select * from foo where starts_with(column1, 'f');
 logical_plan
 01)Filter: foo.column1 LIKE Utf8View("f%")
 02)--TableScan: foo projection=[column1], partial_filters=[foo.column1 LIKE Utf8View("f%")]
-physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: column1@0 LIKE f%
-03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/foo.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 LIKE f%, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= g AND f <= column1_max@1, required_guarantees=[]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/foo.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 LIKE f%, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= g AND f <= column1_max@1, required_guarantees=[]
 
 statement ok
 drop table foo
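
The plan changes above are easy to reproduce interactively: with pushdown enabled, the filter appears only as a predicate= on DataSourceExec rather than in a separate FilterExec. A minimal sketch (the file path and column name are hypothetical):

use datafusion::prelude::*;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    let ctx = SessionContext::new();
    ctx.register_parquet("foo", "foo.parquet", ParquetReadOptions::default())
        .await?;
    // With pushdown_filters = true (the new default), EXPLAIN should show the
    // filter folded into the DataSourceExec predicate, with no FilterExec above.
    ctx.sql("EXPLAIN SELECT * FROM foo WHERE starts_with(column1, 'f')")
        .await?
        .show()
        .await?;
    Ok(())
}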

datafusion/sqllogictest/test_files/parquet_statistics.slt

Lines changed: 6 additions & 15 deletions

@@ -59,11 +59,8 @@ query TT
 EXPLAIN SELECT * FROM test_table WHERE column1 = 1;
 ----
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]]
-02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]]
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]]
-04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)]
-05), statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]]
+01)DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)]
+02), statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]]
 
 # cleanup
 statement ok
@@ -86,11 +83,8 @@ query TT
 EXPLAIN SELECT * FROM test_table WHERE column1 = 1;
 ----
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]]
-02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]]
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]]
-04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)]
-05), statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]]
+01)DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)]
+02), statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]]
 
 # cleanup
 statement ok
@@ -114,11 +108,8 @@ query TT
 EXPLAIN SELECT * FROM test_table WHERE column1 = 1;
 ----
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
-02)--FilterExec: column1@0 = 1, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)))]]
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
-04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)]
-05), statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
+01)DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)]
+02), statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
 
 # cleanup
 statement ok
