Skip to content

Commit b3acc9f

Browse files
authored
fix: Make input_sorted_by_group_key tolerate unrelated single value columns outside the group key prefix (#171)
* fix: Make input_sorted_by_group_key tolerate unrelated single value columns outside the group key prefix
* Adds basic output_hints to SortExec, solely for the sake of the test code
* Adds hash_agg_aggregation_strategy_with_nongrouped_single_value_columns_in_sort_key test
1 parent c29478e commit b3acc9f

File tree

2 files changed

+109
-18
lines changed

2 files changed

+109
-18
lines changed

datafusion/src/physical_plan/planner.rs

Lines changed: 79 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1716,16 +1716,21 @@ fn input_sorted_by_group_key(
17161716
}
17171717
sort_to_group[sort_key_pos] = group_i;
17181718
}
1719-
for i in 0..sort_key.len() {
1720-
if hints.single_value_columns.contains(&sort_key[i]) {
1721-
sort_key_hit[i] = true;
1719+
1720+
// At this point all elements of the group key mapped into some column of the sort key. This
1721+
// checks the group key is mapped into a prefix of the sort key, except that it's okay if it
1722+
// skips over single value columns.
1723+
let mut pref_len: usize = 0;
1724+
for (i, hit) in sort_key_hit.iter().enumerate() {
1725+
if !hit && !hints.single_value_columns.contains(&sort_key[i]) {
1726+
break;
17221727
}
1728+
pref_len += 1;
17231729
}
17241730

1725-
// At this point all elements of the group key mapped into some column of the sort key.
1726-
// This checks the group key is mapped into a prefix of the sort key.
1727-
let pref_len = sort_key_hit.iter().take_while(|present| **present).count();
17281731
if sort_key_hit[pref_len..].iter().any(|present| *present) {
1732+
// The group key did not hit a contiguous prefix of the sort key (ignoring single value
1733+
// columns); return false.
17291734
return false;
17301735
}
17311736

@@ -1753,7 +1758,8 @@ fn tuple_err<T, R>(value: (Result<T>, Result<R>)) -> Result<(T, R)> {
17531758
#[cfg(test)]
17541759
mod tests {
17551760
use super::*;
1756-
use crate::logical_plan::{DFField, DFSchema, DFSchemaRef};
1761+
use crate::logical_plan::{and, DFField, DFSchema, DFSchemaRef};
1762+
use crate::physical_plan::OptimizerHints;
17571763
use crate::physical_plan::{csv::CsvReadOptions, expressions, Partitioning};
17581764
use crate::scalar::ScalarValue;
17591765
use crate::{
@@ -2036,6 +2042,72 @@ mod tests {
20362042
Ok(())
20372043
}
20382044

2045+
#[test]
2046+
fn hash_agg_aggregation_strategy_with_nongrouped_single_value_columns_in_sort_key(
2047+
) -> Result<()> {
2048+
let testdata = crate::test_util::arrow_test_data();
2049+
let path = format!("{}/csv/aggregate_test_100.csv", testdata);
2050+
2051+
let options = CsvReadOptions::new().schema_infer_max_records(100);
2052+
2053+
fn sort(column_name: &str) -> Expr {
2054+
col(column_name).sort(true, true)
2055+
}
2056+
2057+
// Instead of creating a mock ExecutionPlan, we have some input plan which produces the desired output_hints().
2058+
let logical_plan = LogicalPlanBuilder::scan_csv(path, options, None)?
2059+
.filter(and(
2060+
col("c4").eq(lit("value_a")),
2061+
col("c8").eq(lit("value_b")),
2062+
))?
2063+
.sort(vec![
2064+
sort("c1"),
2065+
sort("c2"),
2066+
sort("c3"),
2067+
sort("c4"),
2068+
sort("c5"),
2069+
sort("c6"),
2070+
sort("c7"),
2071+
sort("c8"),
2072+
])?
2073+
.build()?;
2074+
2075+
let execution_plan = plan(&logical_plan)?;
2076+
2077+
// Note that both single_value_columns are part of the sort key... but one will not be part of the group key.
2078+
let hints: OptimizerHints = execution_plan.output_hints();
2079+
assert_eq!(hints.sort_order, Some(vec![0, 1, 2, 3, 4, 5, 6, 7]));
2080+
assert_eq!(hints.single_value_columns, vec![3, 7]);
2081+
2082+
// Now make a group_key that covers one single value column (3, "c4") but not the other (7, "c8");
2083+
// columns 5 and 6 ("c6" and "c7" respectively) lie between the end of the group key and column 7.
2084+
let group_key = vec![col("c1"), col("c2"), col("c3"), col("c4"), col("c5")];
2085+
let mut ctx_state = make_ctx_state();
2086+
ctx_state.config.concurrency = 4;
2087+
let planner = DefaultPhysicalPlanner::default();
2088+
let mut physical_group_key = Vec::new();
2089+
for expr in group_key {
2090+
let phys_expr = planner.create_physical_expr(
2091+
&expr,
2092+
&logical_plan.schema(),
2093+
&execution_plan.schema(),
2094+
&ctx_state,
2095+
)?;
2096+
physical_group_key.push((phys_expr, "".to_owned()));
2097+
}
2098+
2099+
let mut sort_order = Vec::<usize>::new();
2100+
let is_sorted: bool = input_sorted_by_group_key(
2101+
execution_plan.as_ref(),
2102+
&physical_group_key,
2103+
&mut sort_order,
2104+
);
2105+
assert!(is_sorted);
2106+
assert_eq!(sort_order, vec![0, 1, 2, 3, 4]);
2107+
2108+
Ok(())
2109+
}
2110+
20392111
#[test]
20402112
fn test_explain() {
20412113
let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);

datafusion/src/physical_plan/sort.rs

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,15 @@
1717

1818
//! Defines the SORT plan
1919
20-
use super::{RecordBatchStream, SendableRecordBatchStream};
2120
use crate::cube_ext;
2221
use crate::error::{DataFusionError, Result};
23-
use crate::physical_plan::expressions::PhysicalSortExpr;
22+
use crate::physical_plan::expressions::{Column, PhysicalSortExpr};
2423
use crate::physical_plan::{
2524
common, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, SQLMetric,
2625
};
26+
use crate::physical_plan::{
27+
OptimizerHints, RecordBatchStream, SendableRecordBatchStream,
28+
};
2729
pub use arrow::compute::SortOptions;
2830
use arrow::compute::{lexsort_to_indices, take, SortColumn, TakeOptions};
2931
use arrow::datatypes::SchemaRef;
@@ -186,15 +188,32 @@ impl ExecutionPlan for SortExec {
186188
metrics
187189
}
188190

189-
// TODO
190-
// fn output_sort_order(&self) -> Result<Option<Vec<usize>>> {
191-
// let mut order = Vec::with_capacity(self.expr.len());
192-
// for s in &self.expr {
193-
// let col = s.expr.as_any().downcast_ref::<Column>()?;
194-
// order.push(self.schema().index_of(col.name())?);
195-
// }
196-
// Ok(Some(order))
197-
// }
191+
fn output_hints(&self) -> OptimizerHints {
192+
let mut order = Vec::with_capacity(self.expr.len());
193+
// let mut sort_order_truncated = false;
194+
for s in &self.expr {
195+
let column = s.expr.as_any().downcast_ref::<Column>();
196+
if column.is_none() {
197+
// sort_order_truncated = true;
198+
break;
199+
}
200+
let column = column.unwrap();
201+
202+
let index: usize = match self.schema().index_of(column.name()) {
203+
Ok(ix) => ix,
204+
Err(_) => return OptimizerHints::default(),
205+
};
206+
order.push(index);
207+
}
208+
209+
let input_hints = self.input.output_hints();
210+
// TODO: If sort_order_truncated is false, we can combine input_hints.sort_order. Do this.
211+
212+
OptimizerHints {
213+
sort_order: Some(order),
214+
single_value_columns: input_hints.single_value_columns.clone(),
215+
}
216+
}
198217
}
199218

200219
#[tracing::instrument(level = "trace", skip(batch, schema, expr))]

0 commit comments

Comments
 (0)