diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 5796edc283e0..b75a5df8bec8 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -277,6 +277,16 @@ config_namespace! { /// Specifies the recursion depth limit when parsing complex SQL Queries pub recursion_limit: usize, default = 50 + + /// Specifies the default null ordering for query results. There are 4 options: + /// - `nulls_max`: Nulls appear last in ascending order. + /// - `nulls_min`: Nulls appear first in ascending order. + /// - `nulls_first`: Nulls always be first in any order. + /// - `nulls_last`: Nulls always be last in any order. + /// + /// By default, `nulls_max` is used to follow Postgres's behavior. + /// postgres rule: + pub default_null_ordering: String, default = "nulls_max".to_string() } } diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 364ad75b0869..a28cc38919dd 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -496,6 +496,10 @@ impl SessionState { support_varchar_with_length: sql_parser_options.support_varchar_with_length, map_string_types_to_utf8view: sql_parser_options.map_string_types_to_utf8view, collect_spans: sql_parser_options.collect_spans, + default_null_ordering: sql_parser_options + .default_null_ordering + .as_str() + .into(), } } diff --git a/datafusion/sql/src/expr/order_by.rs b/datafusion/sql/src/expr/order_by.rs index d357c3753e13..79ebc5943ffb 100644 --- a/datafusion/sql/src/expr/order_by.rs +++ b/datafusion/sql/src/expr/order_by.rs @@ -63,14 +63,14 @@ impl SqlToRel<'_, S> { let mut sort_expr_vec = Vec::with_capacity(order_by_exprs.len()); - let make_sort_expr = - |expr: Expr, asc: Option, nulls_first: Option| { - let asc = asc.unwrap_or(true); - // When asc is true, by default nulls last to be consistent with postgres - // postgres rule: 
https://www.postgresql.org/docs/current/queries-order.html - let nulls_first = nulls_first.unwrap_or(!asc); - Sort::new(expr, asc, nulls_first) - }; + let make_sort_expr = |expr: Expr, + asc: Option, + nulls_first: Option| { + let asc = asc.unwrap_or(true); + let nulls_first = nulls_first + .unwrap_or_else(|| self.options.default_null_ordering.nulls_first(asc)); + Sort::new(expr, asc, nulls_first) + }; for order_by_expr in order_by_exprs { let OrderByExpr { diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 26c982690115..2cb1dbdcb4ac 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -17,6 +17,7 @@ //! [`SqlToRel`]: SQL Query Planner (produces [`LogicalPlan`] from SQL AST) use std::collections::HashMap; +use std::str::FromStr; use std::sync::Arc; use std::vec; @@ -54,6 +55,8 @@ pub struct ParserOptions { pub collect_spans: bool, /// Whether string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. pub map_string_types_to_utf8view: bool, + /// Default null ordering for sorting expressions. + pub default_null_ordering: NullOrdering, } impl ParserOptions { @@ -75,6 +78,9 @@ impl ParserOptions { map_string_types_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, + // By default, `nulls_max` is used to follow Postgres's behavior. + // postgres rule: https://www.postgresql.org/docs/current/queries-order.html + default_null_ordering: NullOrdering::NullsMax, } } @@ -129,6 +135,12 @@ impl ParserOptions { self.collect_spans = value; self } + + /// Sets the `default_null_ordering` option. 
+ pub fn with_default_null_ordering(mut self, value: NullOrdering) -> Self { + self.default_null_ordering = value; + self + } } impl Default for ParserOptions { @@ -147,10 +159,60 @@ impl From<&SqlParserOptions> for ParserOptions { enable_options_value_normalization: options .enable_options_value_normalization, collect_spans: options.collect_spans, + default_null_ordering: options.default_null_ordering.as_str().into(), } } } +/// Represents the null ordering for sorting expressions. +#[derive(Debug, Clone, Copy)] +pub enum NullOrdering { + /// Nulls appear last in ascending order. + NullsMax, + /// Nulls appear first in ascending order. + NullsMin, + /// Nulls appear first. + NullsFirst, + /// Nulls appear last. + NullsLast, +} + +impl NullOrdering { + /// Evaluates the null ordering based on the given ascending flag. + /// + /// # Returns + /// * `true` if nulls should appear first. + /// * `false` if nulls should appear last. + pub fn nulls_first(&self, asc: bool) -> bool { + match self { + Self::NullsMax => !asc, + Self::NullsMin => asc, + Self::NullsFirst => true, + Self::NullsLast => false, + } + } +} + +impl FromStr for NullOrdering { + type Err = DataFusionError; + + fn from_str(s: &str) -> Result { + match s { + "nulls_max" => Ok(Self::NullsMax), + "nulls_min" => Ok(Self::NullsMin), + "nulls_first" => Ok(Self::NullsFirst), + "nulls_last" => Ok(Self::NullsLast), + _ => plan_err!("Unknown null ordering: Expected one of 'nulls_first', 'nulls_last', 'nulls_min' or 'nulls_max'. 
Got {s}"), + } + } +} + +impl From<&str> for NullOrdering { + fn from(s: &str) -> Self { + Self::from_str(s).unwrap_or(Self::NullsMax) + } +} + /// Ident Normalizer #[derive(Debug)] pub struct IdentNormalizer { diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index b2bea86f5524..0fef18ac55f8 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -1421,7 +1421,9 @@ impl SqlToRel<'_, S> { .unwrap(); let asc = order_by_expr.options.asc.unwrap_or(true); let nulls_first = - order_by_expr.options.nulls_first.unwrap_or(!asc); + order_by_expr.options.nulls_first.unwrap_or_else(|| { + self.options.default_null_ordering.nulls_first(asc) + }); SortExpr::new(ordered_expr, asc, nulls_first) }) diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index dd5ec4a20118..25144042504f 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -32,7 +32,7 @@ use datafusion_expr::{ use datafusion_functions::{string, unicode}; use datafusion_sql::{ parser::DFParser, - planner::{ParserOptions, SqlToRel}, + planner::{NullOrdering, ParserOptions, SqlToRel}, }; use crate::common::{CustomExprPlanner, CustomTypePlanner, MockSessionState}; @@ -3361,6 +3361,7 @@ fn parse_decimals_parser_options() -> ParserOptions { map_string_types_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, + default_null_ordering: NullOrdering::NullsMax, } } @@ -3372,6 +3373,7 @@ fn ident_normalization_parser_options_no_ident_normalization() -> ParserOptions map_string_types_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, + default_null_ordering: NullOrdering::NullsMax, } } @@ -3383,6 +3385,7 @@ fn ident_normalization_parser_options_ident_normalization() -> ParserOptions { map_string_types_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, + default_null_ordering: 
NullOrdering::NullsMax, } } diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 86dfbd7c8496..2d778bc9d654 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -307,6 +307,7 @@ datafusion.optimizer.repartition_windows true datafusion.optimizer.skip_failed_rules false datafusion.optimizer.top_down_join_key_reordering true datafusion.sql_parser.collect_spans false +datafusion.sql_parser.default_null_ordering nulls_max datafusion.sql_parser.dialect generic datafusion.sql_parser.enable_ident_normalization true datafusion.sql_parser.enable_options_value_normalization false @@ -420,6 +421,7 @@ datafusion.optimizer.repartition_windows true Should DataFusion repartition data datafusion.optimizer.skip_failed_rules false When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail datafusion.optimizer.top_down_join_key_reordering true When set to true, the physical plan optimizer will run a top down process to reorder the join keys datafusion.sql_parser.collect_spans false When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes. +datafusion.sql_parser.default_null_ordering nulls_max Specifies the default null ordering for query results. There are 4 options: - `nulls_max`: Nulls appear last in ascending order. - `nulls_min`: Nulls appear first in ascending order. - `nulls_first`: Nulls always be first in any order. - `nulls_last`: Nulls always be last in any order. By default, `nulls_max` is used to follow Postgres's behavior. 
postgres rule: datafusion.sql_parser.dialect generic Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. datafusion.sql_parser.enable_ident_normalization true When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) datafusion.sql_parser.enable_options_value_normalization false When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. diff --git a/datafusion/sqllogictest/test_files/order.slt b/datafusion/sqllogictest/test_files/order.slt index e3bcfcdbda1d..1ceea1a7ea0e 100644 --- a/datafusion/sqllogictest/test_files/order.slt +++ b/datafusion/sqllogictest/test_files/order.slt @@ -94,6 +94,98 @@ NULL three 1 one 2 two +statement ok +set datafusion.sql_parser.default_null_ordering = 'nulls_min'; + +# test asc with `nulls_min` null ordering + +query IT +SELECT * FROM (VALUES (1, 'one'), (2, 'two'), (null, 'three')) AS t (num,letter) ORDER BY num +---- +NULL three +1 one +2 two + +# test desc with `nulls_min` null ordering + +query IT +SELECT * FROM (VALUES (1, 'one'), (2, 'two'), (null, 'three')) AS t (num,letter) ORDER BY num DESC +---- +2 two +1 one +NULL three + +statement ok +set datafusion.sql_parser.default_null_ordering = 'nulls_first'; + +# test asc with `nulls_first` null ordering + +query IT +SELECT * FROM (VALUES (1, 'one'), (2, 'two'), (null, 'three')) AS t (num,letter) ORDER BY num +---- +NULL three +1 one +2 two + +# test desc with `nulls_first` null ordering + +query IT +SELECT * FROM (VALUES (1, 'one'), (2, 'two'), (null, 'three')) AS t (num,letter) ORDER BY num DESC +---- +NULL three +2 two +1 one + + +statement ok +set datafusion.sql_parser.default_null_ordering = 'nulls_last'; + +# test asc with `nulls_last` 
null ordering + +query IT +SELECT * FROM (VALUES (1, 'one'), (2, 'two'), (null, 'three')) AS t (num,letter) ORDER BY num +---- +1 one +2 two +NULL three + +# test desc with `nulls_last` null ordering + +query IT +SELECT * FROM (VALUES (1, 'one'), (2, 'two'), (null, 'three')) AS t (num,letter) ORDER BY num DESC +---- +2 two +1 one +NULL three + +statement ok +set datafusion.sql_parser.default_null_ordering = ''; + +# test asc with an empty `default_null_ordering`. Expected to use the default null ordering which is `nulls_max` + +query IT +SELECT * FROM (VALUES (1, 'one'), (2, 'two'), (null, 'three')) AS t (num,letter) ORDER BY num +---- +1 one +2 two +NULL three + +# test desc with an empty `default_null_ordering`. Expected to use the default null ordering which is `nulls_max` + +query IT +SELECT * FROM (VALUES (1, 'one'), (2, 'two'), (null, 'three')) AS t (num,letter) ORDER BY num DESC +---- +NULL three +2 two +1 one + +statement error DataFusion error: Error during planning: Unsupported Value NULL +set datafusion.sql_parser.default_null_ordering = null; + +# reset to default null ordering +statement ok +set datafusion.sql_parser.default_null_ordering = 'nulls_max'; + # sort statement ok diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 96b7ee672bdb..ec51e8fc9f40 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -136,6 +136,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.sql_parser.map_string_types_to_utf8view | true | If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. If false, they are mapped to `Utf8`. Default is true. | | datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes. 
| | datafusion.sql_parser.recursion_limit | 50 | Specifies the recursion depth limit when parsing complex SQL Queries | +| datafusion.sql_parser.default_null_ordering | nulls_max | Specifies the default null ordering for query results. There are 4 options: - `nulls_max`: Nulls appear last in ascending order. - `nulls_min`: Nulls appear first in ascending order. - `nulls_first`: Nulls always be first in any order. - `nulls_last`: Nulls always be last in any order. By default, `nulls_max` is used to follow Postgres's behavior. postgres rule: | | datafusion.format.safe | true | If set to `true` any formatting errors will be written to the output instead of being converted into a [`std::fmt::Error`] | | datafusion.format.null | | Format string for nulls | | datafusion.format.date_format | %Y-%m-%d | Date format for date arrays |