From dc53f30ae7079a987d4f2dded6d06818486fd17f Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 30 Apr 2025 14:06:52 -0400 Subject: [PATCH 01/15] Intermediate work on setting up sql feature flags throughout repo --- Cargo.lock | 11 ------- Cargo.toml | 4 +-- benchmarks/Cargo.toml | 2 +- datafusion/catalog/Cargo.toml | 1 - datafusion/common/Cargo.toml | 1 + datafusion/core/Cargo.toml | 2 +- datafusion/core/src/dataframe/mod.rs | 8 ++--- datafusion/core/src/execution/context/mod.rs | 3 ++ .../core/src/execution/session_state.rs | 23 ++++++++++++--- datafusion/core/src/lib.rs | 3 ++ datafusion/core/src/physical_planner.rs | 5 +--- datafusion/core/src/test_util/mod.rs | 1 + datafusion/core/tests/expr_api/mod.rs | 2 +- datafusion/execution/Cargo.toml | 7 +++-- datafusion/expr/Cargo.toml | 6 ++-- datafusion/expr/src/expr.rs | 29 ++++++++++++++++++- datafusion/expr/src/expr_fn.rs | 5 ++-- datafusion/expr/src/lib.rs | 2 ++ datafusion/expr/src/planner.rs | 3 +- datafusion/optimizer/Cargo.toml | 2 +- datafusion/physical-expr-common/Cargo.toml | 2 +- datafusion/physical-expr/Cargo.toml | 2 +- datafusion/physical-optimizer/Cargo.toml | 2 +- datafusion/physical-plan/Cargo.toml | 2 +- datafusion/proto/Cargo.toml | 2 +- datafusion/session/Cargo.toml | 10 ------- datafusion/sql/Cargo.toml | 2 +- datafusion/sql/src/expr/function.rs | 8 ++--- datafusion/wasmtest/Cargo.toml | 2 +- 29 files changed, 91 insertions(+), 61 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 933a0ee44a76..10310734029e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1957,7 +1957,6 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-plan", "datafusion-session", - "datafusion-sql", "futures", "itertools 0.14.0", "log", @@ -2639,22 +2638,12 @@ dependencies = [ name = "datafusion-session" version = "49.0.2" dependencies = [ - "arrow", "async-trait", - "dashmap", "datafusion-common", - "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", - "datafusion-physical-expr", 
"datafusion-physical-plan", - "datafusion-sql", - "futures", - "itertools 0.14.0", - "log", - "object_store", "parking_lot", - "tokio", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 988f382515ce..375c5d6a8839 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -121,8 +121,8 @@ datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "49. datafusion-datasource-json = { path = "datafusion/datasource-json", version = "49.0.0", default-features = false } datafusion-datasource-parquet = { path = "datafusion/datasource-parquet", version = "49.0.0", default-features = false } datafusion-doc = { path = "datafusion/doc", version = "49.0.0" } -datafusion-execution = { path = "datafusion/execution", version = "49.0.0" } -datafusion-expr = { path = "datafusion/expr", version = "49.0.0" } +datafusion-execution = { path = "datafusion/execution", version = "49.0.0", default-features = false } +datafusion-expr = { path = "datafusion/expr", version = "49.0.0", default-features = false } datafusion-expr-common = { path = "datafusion/expr-common", version = "49.0.0" } datafusion-ffi = { path = "datafusion/ffi", version = "49.0.0" } datafusion-functions = { path = "datafusion/functions", version = "49.0.0" } diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 5a846cb49e0c..01c657c1f71b 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -38,7 +38,7 @@ mimalloc_extended = ["libmimalloc-sys/extended"] [dependencies] arrow = { workspace = true } datafusion = { workspace = true, default-features = true } -datafusion-common = { workspace = true, default-features = true } +datafusion-common = { workspace = true } env_logger = { workspace = true } futures = { workspace = true } libmimalloc-sys = { version = "0.1", optional = true } diff --git a/datafusion/catalog/Cargo.toml b/datafusion/catalog/Cargo.toml index 7307c4de87a8..5edb1c4a6878 100644 --- a/datafusion/catalog/Cargo.toml +++ b/datafusion/catalog/Cargo.toml @@ -42,7 +42,6 @@ 
datafusion-expr = { workspace = true } datafusion-physical-expr = { workspace = true } datafusion-physical-plan = { workspace = true } datafusion-session = { workspace = true } -datafusion-sql = { workspace = true } futures = { workspace = true } itertools = { workspace = true } log = { workspace = true } diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index aea5e51befb0..052c858b4a8b 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -49,6 +49,7 @@ pyarrow = ["pyo3", "arrow/pyarrow", "parquet"] force_hash_collisions = [] recursive_protection = ["dep:recursive"] parquet = ["dep:parquet"] +sql = ["sqlparser"] [dependencies] ahash = { workspace = true } diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 21c6e38945ee..2083428139d9 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -133,7 +133,7 @@ datafusion-physical-expr-common = { workspace = true } datafusion-physical-optimizer = { workspace = true } datafusion-physical-plan = { workspace = true } datafusion-session = { workspace = true } -datafusion-sql = { workspace = true } +datafusion-sql = { workspace = true, optional = true } flate2 = { version = "1.1.2", optional = true } futures = { workspace = true } hex = { workspace = true, optional = true } diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 7a3739d36cb2..b0bdc5b7bed1 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -50,10 +50,7 @@ use arrow::array::{Array, ArrayRef, Int64Array, StringArray}; use arrow::compute::{cast, concat}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_common::config::{CsvOptions, JsonOptions}; -use datafusion_common::{ - exec_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema, - DataFusionError, ParamValues, ScalarValue, SchemaError, UnnestOptions, -}; +use datafusion_common::{exec_err, 
not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema, DataFusionError, ParamValues, ScalarValue, SchemaError, TableReference, UnnestOptions}; use datafusion_expr::select_expr::SelectExpr; use datafusion_expr::{ case, @@ -70,7 +67,6 @@ use datafusion_functions_aggregate::expr_fn::{ use async_trait::async_trait; use datafusion_catalog::Session; -use datafusion_sql::TableReference; /// Contains options that control how data is /// written out from a DataFrame @@ -267,6 +263,7 @@ impl DataFrame { /// # Ok(()) /// # } /// ``` + #[cfg(feature = "sql")] pub fn parse_sql_expr(&self, sql: &str) -> Result { let df_schema = self.schema(); @@ -333,6 +330,7 @@ impl DataFrame { /// # Ok(()) /// # } /// ``` + #[cfg(feature = "sql")] pub fn select_exprs(self, exprs: &[&str]) -> Result { let expr_list = exprs .iter() diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index de1d40dda349..e7a66c4f9e8e 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -585,6 +585,7 @@ impl SessionContext { /// # Ok(()) /// # } /// ``` + #[cfg(feature = "sql")] pub async fn sql(&self, sql: &str) -> Result { self.sql_with_options(sql, SQLOptions::new()).await } @@ -615,6 +616,7 @@ impl SessionContext { /// # Ok(()) /// # } /// ``` + #[cfg(feature = "sql")] pub async fn sql_with_options( &self, sql: &str, @@ -648,6 +650,7 @@ impl SessionContext { /// # Ok(()) /// # } /// ``` + #[cfg(feature = "sql")] pub fn parse_sql_expr(&self, sql: &str, df_schema: &DFSchema) -> Result { self.state.read().create_logical_expr(sql, df_schema) } diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index a7b3bdeeace8..cde92fe526a9 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -69,16 +69,22 @@ use datafusion_physical_optimizer::optimizer::PhysicalOptimizer; use 
datafusion_physical_optimizer::PhysicalOptimizerRule; use datafusion_physical_plan::ExecutionPlan; use datafusion_session::Session; -use datafusion_sql::parser::{DFParserBuilder, Statement}; -use datafusion_sql::planner::{ContextProvider, ParserOptions, PlannerContext, SqlToRel}; +#[cfg(feature = "sql")] +use datafusion_sql::{ + parser::{DFParserBuilder, Statement}, + planner::{ContextProvider, ParserOptions, PlannerContext, SqlToRel}, +}; use async_trait::async_trait; use chrono::{DateTime, Utc}; use itertools::Itertools; use log::{debug, info}; use object_store::ObjectStore; -use sqlparser::ast::{Expr as SQLExpr, ExprWithAlias as SQLExprWithAlias}; -use sqlparser::dialect::dialect_from_str; +#[cfg(feature = "sql")] +use sqlparser::{ + ast::{Expr as SQLExpr, ExprWithAlias as SQLExprWithAlias}, + dialect::dialect_from_str, +}; use url::Url; use uuid::Uuid; @@ -358,6 +364,7 @@ impl SessionState { /// [`Statement`]. See [`SessionContext::sql`] for running queries. /// /// [`SessionContext::sql`]: crate::execution::context::SessionContext::sql + #[cfg(feature = "sql")] pub fn sql_to_statement( &self, sql: &str, @@ -394,6 +401,7 @@ impl SessionState { /// parse a sql string into a sqlparser-rs AST [`SQLExpr`]. /// /// See [`Self::create_logical_expr`] for parsing sql to [`Expr`]. + #[cfg(feature = "sql")] pub fn sql_to_expr( &self, sql: &str, @@ -405,6 +413,7 @@ impl SessionState { /// parse a sql string into a sqlparser-rs AST [`SQLExprWithAlias`]. /// /// See [`Self::create_logical_expr`] for parsing sql to [`Expr`]. + #[cfg(feature = "sql")] pub fn sql_to_expr_with_alias( &self, sql: &str, @@ -433,6 +442,7 @@ impl SessionState { /// See [`datafusion_sql::resolve::resolve_table_references`] for more information. 
/// /// [`datafusion_sql::resolve::resolve_table_references`]: datafusion_sql::resolve::resolve_table_references + #[cfg(feature = "sql")] pub fn resolve_table_references( &self, statement: &Statement, @@ -447,6 +457,7 @@ impl SessionState { } /// Convert an AST Statement into a LogicalPlan + #[cfg(feature = "sql")] pub async fn statement_to_plan( &self, statement: Statement, @@ -474,6 +485,7 @@ impl SessionState { query.statement_to_plan(statement) } + #[cfg(feature = "sql")] fn get_parser_options(&self) -> ParserOptions { let sql_parser_options = &self.config.options().sql_parser; @@ -504,6 +516,7 @@ impl SessionState { /// /// [`SessionContext::sql`]: crate::execution::context::SessionContext::sql /// [`SessionContext::sql_with_options`]: crate::execution::context::SessionContext::sql_with_options + #[cfg(feature = "sql")] pub async fn create_logical_plan( &self, sql: &str, @@ -517,6 +530,7 @@ impl SessionState { /// Creates a datafusion style AST [`Expr`] from a SQL string. /// /// See example on [SessionContext::parse_sql_expr](crate::execution::context::SessionContext::parse_sql_expr) + #[cfg(feature = "sql")] pub fn create_logical_expr( &self, sql: &str, @@ -1639,6 +1653,7 @@ struct SessionContextProvider<'a> { tables: HashMap>, } +#[cfg(feature = "sql")] impl ContextProvider for SessionContextProvider<'_> { fn get_expr_planners(&self) -> &[Arc] { self.state.expr_planners() diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index bb69d0c85a5c..6640d4158c6b 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -734,6 +734,8 @@ pub const DATAFUSION_VERSION: &str = env!("CARGO_PKG_VERSION"); extern crate core; + +#[cfg(feature = "sql")] extern crate sqlparser; pub mod dataframe; @@ -820,6 +822,7 @@ pub use datafusion_common::assert_batches_eq; pub use datafusion_common::assert_batches_sorted_eq; /// re-export of [`datafusion_sql`] crate +#[cfg(feature = "sql")] pub mod sql { pub use datafusion_sql::*; } diff --git 
a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 6618d9495d78..0fae3aacd5b1 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -71,10 +71,7 @@ use datafusion_common::{ use datafusion_datasource::file_groups::FileGroup; use datafusion_datasource::memory::MemorySourceConfig; use datafusion_expr::dml::{CopyTo, InsertOp}; -use datafusion_expr::expr::{ - physical_name, AggregateFunction, AggregateFunctionParams, Alias, GroupingSet, - WindowFunction, WindowFunctionParams, -}; +use datafusion_expr::expr::{physical_name, AggregateFunction, AggregateFunctionParams, Alias, GroupingSet, NullTreatment, WindowFunction, WindowFunctionParams}; use datafusion_expr::expr_rewriter::unnormalize_cols; use datafusion_expr::logical_plan::builder::wrap_projection_for_join_if_necessary; use datafusion_expr::{ diff --git a/datafusion/core/src/test_util/mod.rs b/datafusion/core/src/test_util/mod.rs index 2f8e66a2bbfb..299b73ccbe40 100644 --- a/datafusion/core/src/test_util/mod.rs +++ b/datafusion/core/src/test_util/mod.rs @@ -135,6 +135,7 @@ pub async fn test_table() -> Result { } /// Execute SQL and return results +#[cfg(feature = "sql")] pub async fn plan_and_collect( ctx: &SessionContext, sql: &str, diff --git a/datafusion/core/tests/expr_api/mod.rs b/datafusion/core/tests/expr_api/mod.rs index a9cf7f04bb3a..cc48c4312e59 100644 --- a/datafusion/core/tests/expr_api/mod.rs +++ b/datafusion/core/tests/expr_api/mod.rs @@ -31,9 +31,9 @@ use datafusion_functions_aggregate::first_last::first_value_udaf; use datafusion_functions_aggregate::sum::sum_udaf; use datafusion_functions_nested::expr_ext::{IndexAccessor, SliceAccessor}; use datafusion_optimizer::simplify_expressions::ExprSimplifier; -use sqlparser::ast::NullTreatment; /// Tests of using and evaluating `Expr`s outside the context of a LogicalPlan use std::sync::{Arc, LazyLock}; +use datafusion_expr::expr::NullTreatment; mod parse_sql_expr; mod 
simplification; diff --git a/datafusion/execution/Cargo.toml b/datafusion/execution/Cargo.toml index afe9039f8bae..67a37a86c706 100644 --- a/datafusion/execution/Cargo.toml +++ b/datafusion/execution/Cargo.toml @@ -38,16 +38,19 @@ workspace = true name = "datafusion_execution" [features] +default = ["sql"] + parquet_encryption = [ "parquet/encryption", ] +sql = [] [dependencies] arrow = { workspace = true } async-trait = { workspace = true } dashmap = { workspace = true } -datafusion-common = { workspace = true, default-features = true } -datafusion-expr = { workspace = true } +datafusion-common = { workspace = true, default-features = false } +datafusion-expr = { workspace = true, default-features = false } futures = { workspace = true } log = { workspace = true } object_store = { workspace = true, features = ["fs"] } diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index 812544587bf9..108e1ed30fa8 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml @@ -38,13 +38,15 @@ workspace = true name = "datafusion_expr" [features] +default = ["sql"] recursive_protection = ["dep:recursive"] +sql = ["sqlparser"] [dependencies] arrow = { workspace = true } async-trait = { workspace = true } chrono = { workspace = true } -datafusion-common = { workspace = true } +datafusion-common = { workspace = true, default-features = false } datafusion-doc = { workspace = true } datafusion-expr-common = { workspace = true } datafusion-functions-aggregate-common = { workspace = true } @@ -54,7 +56,7 @@ indexmap = { workspace = true } paste = "^1.0" recursive = { workspace = true, optional = true } serde_json = { workspace = true } -sqlparser = { workspace = true } +sqlparser = { workspace = true, optional = true } [dev-dependencies] ctor = { workspace = true } diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index f2493abbdea4..4d88b3721704 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -41,9 +41,36 
@@ use datafusion_common::{ use datafusion_functions_window_common::field::WindowUDFFieldArgs; use sqlparser::ast::{ display_comma_separated, ExceptSelectItem, ExcludeSelectItem, IlikeSelectItem, - NullTreatment, RenameSelectItem, ReplaceSelectElement, + RenameSelectItem, ReplaceSelectElement, }; +// This mirrors sqlparser::ast::NullTreatment but we need our own variant +// for when the sql feature is disabled. +#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Ord, PartialOrd)] +pub enum NullTreatment { + IgnoreNulls, + RespectNulls, +} + +impl Display for NullTreatment { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.write_str(match self { + NullTreatment::IgnoreNulls => "IGNORE NULLS", + NullTreatment::RespectNulls => "RESPECT NULLS", + }) + } +} + +impl From for NullTreatment { + fn from(value: sqlparser::ast::NullTreatment) -> Self { + match value { + sqlparser::ast::NullTreatment::IgnoreNulls => Self::IgnoreNulls, + sqlparser::ast::NullTreatment::RespectNulls => Self::RespectNulls, + } + } +} + + /// Represents logical expressions such as `A + 1`, or `CAST(c1 AS int)`. 
/// /// For example the expression `A + 1` will be represented as diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 63b4333f218a..9da97720235d 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -19,7 +19,7 @@ use crate::expr::{ AggregateFunction, BinaryExpr, Cast, Exists, GroupingSet, InList, InSubquery, - Placeholder, TryCast, Unnest, WildcardOptions, WindowFunction, + NullTreatment, Placeholder, TryCast, Unnest, WildcardOptions, WindowFunction, }; use crate::function::{ AccumulatorArgs, AccumulatorFactoryFunction, PartitionEvaluatorFactory, @@ -42,7 +42,6 @@ use arrow::datatypes::{DataType, Field, FieldRef}; use datafusion_common::{plan_err, Column, Result, ScalarValue, Spans, TableReference}; use datafusion_functions_window_common::field::WindowUDFFieldArgs; use datafusion_functions_window_common::partition::PartitionEvaluatorArgs; -use sqlparser::ast::NullTreatment; use std::any::Any; use std::fmt::Debug; use std::hash::Hash; @@ -716,8 +715,8 @@ pub fn interval_month_day_nano_lit(value: &str) -> Expr { /// # Example /// ```no_run /// # use datafusion_common::Result; +/// # use datafusion_expr::expr::NullTreatment; /// # use datafusion_expr::test::function_stub::count; -/// # use sqlparser::ast::NullTreatment; /// # use datafusion_expr::{ExprFunctionExt, lit, Expr, col}; /// # // first_value is an aggregate function in another crate /// # fn first_value(_arg: Expr) -> Expr { diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs index b4ad8387215e..c35654345e9f 100644 --- a/datafusion/expr/src/lib.rs +++ b/datafusion/expr/src/lib.rs @@ -34,6 +34,8 @@ //! //! The [expr_fn] module contains functions for creating expressions. 
+extern crate core; + mod literal; mod operation; mod partition_evaluator; diff --git a/datafusion/expr/src/planner.rs b/datafusion/expr/src/planner.rs index 669931d7bae7..d2aa7ab43d8c 100644 --- a/datafusion/expr/src/planner.rs +++ b/datafusion/expr/src/planner.rs @@ -25,12 +25,13 @@ use datafusion_common::{ config::ConfigOptions, file_options::file_type::FileType, not_impl_err, DFSchema, Result, TableReference, }; -use sqlparser::ast::{self, NullTreatment}; +use sqlparser::ast; use crate::{ AggregateUDF, Expr, GetFieldAccess, ScalarUDF, SortExpr, TableSource, WindowFrame, WindowFunctionDefinition, WindowUDF, }; +use crate::expr::NullTreatment; /// Provides the `SQL` query planner meta-data about tables and /// functions referenced in SQL statements, without a direct dependency on the diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index f10510e0973c..2b799f437bda 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -43,7 +43,7 @@ recursive_protection = ["dep:recursive"] [dependencies] arrow = { workspace = true } chrono = { workspace = true } -datafusion-common = { workspace = true, default-features = true } +datafusion-common = { workspace = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true } datafusion-physical-expr = { workspace = true } diff --git a/datafusion/physical-expr-common/Cargo.toml b/datafusion/physical-expr-common/Cargo.toml index a5a12b5527b7..58dc767dbad2 100644 --- a/datafusion/physical-expr-common/Cargo.toml +++ b/datafusion/physical-expr-common/Cargo.toml @@ -40,7 +40,7 @@ name = "datafusion_physical_expr_common" [dependencies] ahash = { workspace = true } arrow = { workspace = true } -datafusion-common = { workspace = true, default-features = true } +datafusion-common = { workspace = true } datafusion-expr-common = { workspace = true } hashbrown = { workspace = true } itertools = { workspace = true } diff --git 
a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index c0eaa3e20528..194822a26921 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -40,7 +40,7 @@ name = "datafusion_physical_expr" [dependencies] ahash = { workspace = true } arrow = { workspace = true } -datafusion-common = { workspace = true, default-features = true } +datafusion-common = { workspace = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true } datafusion-functions-aggregate-common = { workspace = true } diff --git a/datafusion/physical-optimizer/Cargo.toml b/datafusion/physical-optimizer/Cargo.toml index c7795418bf10..0b024fe920c8 100644 --- a/datafusion/physical-optimizer/Cargo.toml +++ b/datafusion/physical-optimizer/Cargo.toml @@ -39,7 +39,7 @@ recursive_protection = ["dep:recursive"] [dependencies] arrow = { workspace = true } -datafusion-common = { workspace = true, default-features = true } +datafusion-common = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true, default-features = true } diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index 830a159d5eb1..fbb0a08cff8f 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -49,7 +49,7 @@ arrow-ord = { workspace = true } arrow-schema = { workspace = true } async-trait = { workspace = true } chrono = { workspace = true } -datafusion-common = { workspace = true, default-features = true } +datafusion-common = { workspace = true } datafusion-common-runtime = { workspace = true, default-features = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml index c95f392a051a..b64c17589108 100644 --- a/datafusion/proto/Cargo.toml +++ b/datafusion/proto/Cargo.toml @@ -47,7 +47,7 
@@ avro = ["datafusion/avro", "datafusion-common/avro"] arrow = { workspace = true } chrono = { workspace = true } datafusion = { workspace = true, default-features = true } -datafusion-common = { workspace = true, default-features = true } +datafusion-common = { workspace = true } datafusion-expr = { workspace = true } datafusion-proto-common = { workspace = true } object_store = { workspace = true } diff --git a/datafusion/session/Cargo.toml b/datafusion/session/Cargo.toml index c6e268735a7b..09bfbe1c4842 100644 --- a/datafusion/session/Cargo.toml +++ b/datafusion/session/Cargo.toml @@ -31,22 +31,12 @@ version.workspace = true all-features = true [dependencies] -arrow = { workspace = true } async-trait = { workspace = true } -dashmap = { workspace = true } datafusion-common = { workspace = true } -datafusion-common-runtime = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } -datafusion-physical-expr = { workspace = true } datafusion-physical-plan = { workspace = true } -datafusion-sql = { workspace = true } -futures = { workspace = true } -itertools = { workspace = true } -log = { workspace = true } -object_store = { workspace = true } parking_lot = { workspace = true } -tokio = { workspace = true } [lints] workspace = true diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml index eca40c553280..78603a1b2fd1 100644 --- a/datafusion/sql/Cargo.toml +++ b/datafusion/sql/Cargo.toml @@ -50,7 +50,7 @@ recursive_protection = ["dep:recursive"] [dependencies] arrow = { workspace = true } bigdecimal = { workspace = true } -datafusion-common = { workspace = true, default-features = true } +datafusion-common = { workspace = true } datafusion-expr = { workspace = true } indexmap = { workspace = true } log = { workspace = true } diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index 5a47c56f3488..5c4a55e46ed5 100644 --- a/datafusion/sql/src/expr/function.rs +++ 
b/datafusion/sql/src/expr/function.rs @@ -22,13 +22,13 @@ use datafusion_common::{ internal_datafusion_err, internal_err, not_impl_err, plan_datafusion_err, plan_err, DFSchema, Dependency, Diagnostic, Result, Span, }; -use datafusion_expr::expr::{ScalarFunction, Unnest, WildcardOptions, WindowFunction}; +use datafusion_expr::expr::{NullTreatment, ScalarFunction, Unnest, WildcardOptions, WindowFunction}; use datafusion_expr::planner::{PlannerResult, RawAggregateExpr, RawWindowExpr}; use datafusion_expr::{expr, Expr, ExprSchemable, WindowFrame, WindowFunctionDefinition}; use sqlparser::ast::{ DuplicateTreatment, Expr as SQLExpr, Function as SQLFunction, FunctionArg, FunctionArgExpr, FunctionArgumentClause, FunctionArgumentList, FunctionArguments, - NullTreatment, ObjectName, OrderByExpr, Spanned, WindowType, + ObjectName, OrderByExpr, Spanned, WindowType, }; /// Suggest a valid function based on an invalid input function name @@ -115,7 +115,7 @@ impl FunctionArgs { order_by: vec![], over, filter, - null_treatment, + null_treatment: null_treatment.map(|v| v.into()), distinct: false, within_group, function_without_parentheses: matches!(args, FunctionArguments::None), @@ -197,7 +197,7 @@ impl FunctionArgs { order_by, over, filter, - null_treatment, + null_treatment: null_treatment.map(|v| v.into()), distinct, within_group, function_without_parentheses: false, diff --git a/datafusion/wasmtest/Cargo.toml b/datafusion/wasmtest/Cargo.toml index b43c34f19760..2fc9f0df3084 100644 --- a/datafusion/wasmtest/Cargo.toml +++ b/datafusion/wasmtest/Cargo.toml @@ -46,7 +46,7 @@ chrono = { version = "0.4", features = ["wasmbind"] } # code size when deploying. 
console_error_panic_hook = { version = "0.1.1", optional = true } datafusion = { workspace = true, features = ["parquet"] } -datafusion-common = { workspace = true, default-features = true } +datafusion-common = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-optimizer = { workspace = true, default-features = true } From 61654305443fbd4d942be5bba4443cd4a392f167 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 30 Apr 2025 14:53:44 -0400 Subject: [PATCH 02/15] More intermediate work, but does not yet compile --- datafusion/common/src/column.rs | 10 ++++++- datafusion/common/src/error.rs | 6 ++++ .../common/src/file_options/json_writer.rs | 2 +- datafusion/common/src/parsers.rs | 8 ++--- datafusion/common/src/spans.rs | 2 ++ datafusion/common/src/table_reference.rs | 29 ++++++++++++++++++- datafusion/common/src/utils/mod.rs | 7 +++-- datafusion/expr/src/expr.rs | 2 ++ datafusion/expr/src/lib.rs | 1 + datafusion/expr/src/planner.rs | 7 +++-- datafusion/expr/src/window_frame.rs | 23 ++++++++++----- 11 files changed, 77 insertions(+), 20 deletions(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index 78b45a130616..f64ab76d5a12 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -18,7 +18,9 @@ //! Column use crate::error::{_schema_err, add_possible_columns_to_diag}; -use crate::utils::{parse_identifiers_normalized, quote_identifier}; +#[cfg(feature = "sql")] +use crate::utils::parse_identifiers_normalized; +use crate::utils::quote_identifier; use crate::{DFSchema, Diagnostic, Result, SchemaError, Spans, TableReference}; use arrow::datatypes::{Field, FieldRef}; use std::collections::HashSet; @@ -128,6 +130,7 @@ impl Column { /// Treats the name as a SQL identifier. 
For example /// `foo.BAR` would be parsed to a reference to relation `foo`, column name `bar` (lower case) /// where `"foo.BAR"` would be parsed to a reference to column named `foo.BAR` + #[cfg(feature = "sql")] pub fn from_qualified_name(flat_name: impl Into) -> Self { let flat_name = flat_name.into(); Self::from_idents(parse_identifiers_normalized(&flat_name, false)).unwrap_or_else( @@ -140,6 +143,7 @@ impl Column { } /// Deserialize a fully qualified name string into a column preserving column text case + #[cfg(feature = "sql")] pub fn from_qualified_name_ignore_case(flat_name: impl Into) -> Self { let flat_name = flat_name.into(); Self::from_idents(parse_identifiers_normalized(&flat_name, true)).unwrap_or_else( @@ -322,6 +326,7 @@ impl Column { } } +#[cfg(feature = "sql")] impl From<&str> for Column { fn from(c: &str) -> Self { Self::from_qualified_name(c) @@ -329,6 +334,7 @@ impl From<&str> for Column { } /// Create a column, cloning the string +#[cfg(feature = "sql")] impl From<&String> for Column { fn from(c: &String) -> Self { Self::from_qualified_name(c) @@ -336,6 +342,7 @@ impl From<&String> for Column { } /// Create a column, reusing the existing string +#[cfg(feature = "sql")] impl From for Column { fn from(c: String) -> Self { Self::from_qualified_name(c) @@ -356,6 +363,7 @@ impl From<(Option<&TableReference>, &FieldRef)> for Column { } } +#[cfg(feature = "sql")] impl FromStr for Column { type Err = Infallible; diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 8b64c0a6a443..8d0f89b7bd10 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -35,6 +35,7 @@ use apache_avro::Error as AvroError; use arrow::error::ArrowError; #[cfg(feature = "parquet")] use parquet::errors::ParquetError; +#[cfg(feature = "sql")] use sqlparser::parser::ParserError; use tokio::task::JoinError; @@ -68,6 +69,7 @@ pub enum DataFusionError { /// Error when SQL is syntactically incorrect. 
/// /// 2nd argument is for optional backtrace + #[cfg(feature = "sql")] SQL(Box, Option), /// Error when a feature is not yet implemented. /// @@ -329,6 +331,7 @@ impl From for DataFusionError { } } +#[cfg(feature = "sql")] impl From for DataFusionError { fn from(e: ParserError) -> Self { DataFusionError::SQL(Box::new(e), None) @@ -369,6 +372,7 @@ impl Error for DataFusionError { #[cfg(feature = "object_store")] DataFusionError::ObjectStore(e) => Some(e.as_ref()), DataFusionError::IoError(e) => Some(e), + #[cfg(feature = "sql")] DataFusionError::SQL(e, _) => Some(e.as_ref()), DataFusionError::NotImplemented(_) => None, DataFusionError::Internal(_) => None, @@ -497,6 +501,7 @@ impl DataFusionError { #[cfg(feature = "object_store")] DataFusionError::ObjectStore(_) => "Object Store error: ", DataFusionError::IoError(_) => "IO error: ", + #[cfg(feature = "sql")] DataFusionError::SQL(_, _) => "SQL error: ", DataFusionError::NotImplemented(_) => { "This feature is not implemented: " @@ -534,6 +539,7 @@ impl DataFusionError { #[cfg(feature = "avro")] DataFusionError::AvroError(ref desc) => Cow::Owned(desc.to_string()), DataFusionError::IoError(ref desc) => Cow::Owned(desc.to_string()), + #[cfg(feature = "sql")] DataFusionError::SQL(ref desc, ref backtrace) => { let backtrace: String = backtrace.clone().unwrap_or_else(|| "".to_owned()); diff --git a/datafusion/common/src/file_options/json_writer.rs b/datafusion/common/src/file_options/json_writer.rs index 750d2972329b..889aec236af4 100644 --- a/datafusion/common/src/file_options/json_writer.rs +++ b/datafusion/common/src/file_options/json_writer.rs @@ -20,7 +20,7 @@ use crate::{ config::JsonOptions, error::{DataFusionError, Result}, - parsers::CompressionTypeVariant, + parsers::CompressionTypeVariant }; /// Options for writing JSON files diff --git a/datafusion/common/src/parsers.rs b/datafusion/common/src/parsers.rs index 41571ebb8576..cd3d607dacd8 100644 --- a/datafusion/common/src/parsers.rs +++ 
b/datafusion/common/src/parsers.rs @@ -20,7 +20,7 @@ use std::fmt::Display; use std::str::FromStr; -use sqlparser::parser::ParserError; +use crate::DataFusionError; /// Readable file compression type #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -38,9 +38,9 @@ pub enum CompressionTypeVariant { } impl FromStr for CompressionTypeVariant { - type Err = ParserError; + type Err = DataFusionError; - fn from_str(s: &str) -> Result { + fn from_str(s: &str) -> Result { let s = s.to_uppercase(); match s.as_str() { "GZIP" | "GZ" => Ok(Self::GZIP), @@ -48,7 +48,7 @@ impl FromStr for CompressionTypeVariant { "XZ" => Ok(Self::XZ), "ZST" | "ZSTD" => Ok(Self::ZSTD), "" | "UNCOMPRESSED" => Ok(Self::UNCOMPRESSED), - _ => Err(ParserError::ParserError(format!( + _ => Err(DataFusionError::NotImplemented(format!( "Unsupported file compression type {s}" ))), } diff --git a/datafusion/common/src/spans.rs b/datafusion/common/src/spans.rs index 5111e264123c..c0b52977e14a 100644 --- a/datafusion/common/src/spans.rs +++ b/datafusion/common/src/spans.rs @@ -39,6 +39,7 @@ impl fmt::Debug for Location { } } +#[cfg(feature = "sql")] impl From for Location { fn from(value: sqlparser::tokenizer::Location) -> Self { Self { @@ -70,6 +71,7 @@ impl Span { /// Convert a [`Span`](sqlparser::tokenizer::Span) from the parser, into a /// DataFusion [`Span`]. If the input span is empty (line 0 column 0, to /// line 0 column 0), then [`None`] is returned. + #[cfg(feature = "sql")] pub fn try_from_sqlparser_span(span: sqlparser::tokenizer::Span) -> Option { if span == sqlparser::tokenizer::Span::empty() { None diff --git a/datafusion/common/src/table_reference.rs b/datafusion/common/src/table_reference.rs index 9b6f9696c00b..91cd7a0862e7 100644 --- a/datafusion/common/src/table_reference.rs +++ b/datafusion/common/src/table_reference.rs @@ -15,7 +15,9 @@ // specific language governing permissions and limitations // under the License. 
-use crate::utils::{parse_identifiers_normalized, quote_identifier}; +#[cfg(feature = "sql")] +use crate::utils::parse_identifiers_normalized; +use crate::utils::quote_identifier; use std::sync::Arc; /// A fully resolved path to a table of the form "catalog.schema.table" @@ -269,6 +271,7 @@ impl TableReference { /// Forms a [`TableReference`] by parsing `s` as a multipart SQL /// identifier. See docs on [`TableReference`] for more details. + #[cfg(feature = "sql")] pub fn parse_str(s: &str) -> Self { let mut parts = parse_identifiers_normalized(s, false); @@ -312,24 +315,48 @@ impl TableReference { /// Parse a string into a TableReference, normalizing where appropriate /// /// See full details on [`TableReference::parse_str`] +#[cfg(feature = "sql")] impl<'a> From<&'a str> for TableReference { fn from(s: &'a str) -> Self { Self::parse_str(s) } } +#[cfg(feature = "sql")] impl<'a> From<&'a String> for TableReference { fn from(s: &'a String) -> Self { Self::parse_str(s) } } +#[cfg(feature = "sql")] impl From for TableReference { fn from(s: String) -> Self { Self::parse_str(&s) } } +#[cfg(not(feature = "sql"))] +impl<'a> From<&'a str> for TableReference { + fn from(s: &'a str) -> Self { + Self::Bare { table: s.into() } + } +} + +#[cfg(not(feature = "sql"))] +impl<'a> From<&'a String> for TableReference { + fn from(s: &'a String) -> Self { + Self::Bare { table: s.as_str().into() } + } +} + +#[cfg(not(feature = "sql"))] +impl From for TableReference { + fn from(s: String) -> Self { + Self::Bare { table: s.into() } + } +} + impl From for TableReference { fn from(resolved: ResolvedTableReference) -> Self { Self::Full { diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index 3f776a44bc59..9a733bbb1e2b 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -31,9 +31,8 @@ use arrow::array::{ use arrow::buffer::OffsetBuffer; use arrow::compute::{partition, SortColumn, SortOptions}; use 
arrow::datatypes::{DataType, Field, SchemaRef}; -use sqlparser::ast::Ident; -use sqlparser::dialect::GenericDialect; -use sqlparser::parser::Parser; +#[cfg(feature = "sql")] +use sqlparser::{ast::Ident, dialect::GenericDialect, parser::Parser}; use std::borrow::{Borrow, Cow}; use std::cmp::{min, Ordering}; use std::collections::HashSet; @@ -282,6 +281,7 @@ fn needs_quotes(s: &str) -> bool { !chars.all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_') } +#[cfg(feature = "sql")] pub(crate) fn parse_identifiers(s: &str) -> Result> { let dialect = GenericDialect; let mut parser = Parser::new(&dialect).try_with_sql(s)?; @@ -289,6 +289,7 @@ pub(crate) fn parse_identifiers(s: &str) -> Result> { Ok(idents) } +#[cfg(feature = "sql")] pub(crate) fn parse_identifiers_normalized(s: &str, ignore_case: bool) -> Vec { parse_identifiers(s) .unwrap_or_default() diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 4d88b3721704..a0e9e7d33030 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -39,6 +39,7 @@ use datafusion_common::{ Column, DFSchema, HashMap, Result, ScalarValue, Spans, TableReference, }; use datafusion_functions_window_common::field::WindowUDFFieldArgs; +#[cfg(feature = "sql")] use sqlparser::ast::{ display_comma_separated, ExceptSelectItem, ExcludeSelectItem, IlikeSelectItem, RenameSelectItem, ReplaceSelectElement, @@ -61,6 +62,7 @@ impl Display for NullTreatment { } } +#[cfg(feature = "sql")] impl From for NullTreatment { fn from(value: sqlparser::ast::NullTreatment) -> Self { match value { diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs index c35654345e9f..f5c0501911df 100644 --- a/datafusion/expr/src/lib.rs +++ b/datafusion/expr/src/lib.rs @@ -104,6 +104,7 @@ pub use literal::{ }; pub use logical_plan::*; pub use partition_evaluator::PartitionEvaluator; +#[cfg(feature = "sql")] pub use sqlparser; pub use table_source::{TableProviderFilterPushDown, TableSource, TableType}; pub use 
udaf::{ diff --git a/datafusion/expr/src/planner.rs b/datafusion/expr/src/planner.rs index d2aa7ab43d8c..c8fe6fd9eace 100644 --- a/datafusion/expr/src/planner.rs +++ b/datafusion/expr/src/planner.rs @@ -25,8 +25,7 @@ use datafusion_common::{ config::ConfigOptions, file_options::file_type::FileType, not_impl_err, DFSchema, Result, TableReference, }; -use sqlparser::ast; - +use datafusion_expr_common::operator::Operator; use crate::{ AggregateUDF, Expr, GetFieldAccess, ScalarUDF, SortExpr, TableSource, WindowFrame, WindowFunctionDefinition, WindowUDF, @@ -86,6 +85,7 @@ pub trait ContextProvider { } /// Return [`TypePlanner`] extensions for planning data types + #[cfg(feature = "sql")] fn get_type_planner(&self) -> Option> { None } @@ -262,7 +262,7 @@ pub trait ExprPlanner: Debug + Send + Sync { /// custom expressions. #[derive(Debug, Clone)] pub struct RawBinaryExpr { - pub op: ast::BinaryOperator, + pub op: Operator, pub left: Expr, pub right: Expr, } @@ -323,6 +323,7 @@ pub enum PlannerResult { } /// Customize planning SQL types to DataFusion (Arrow) types. 
+#[cfg(feature = "sql")] pub trait TypePlanner: Debug + Send + Sync { /// Plan SQL [`ast::DataType`] to DataFusion [`DataType`] /// diff --git a/datafusion/expr/src/window_frame.rs b/datafusion/expr/src/window_frame.rs index b91bbddd8bac..44d0a1b04323 100644 --- a/datafusion/expr/src/window_frame.rs +++ b/datafusion/expr/src/window_frame.rs @@ -28,9 +28,14 @@ use arrow::datatypes::DataType; use std::fmt::{self, Formatter}; use std::hash::Hash; -use datafusion_common::{plan_err, sql_err, DataFusionError, Result, ScalarValue}; -use sqlparser::ast::{self, ValueWithSpan}; -use sqlparser::parser::ParserError::ParserError; +use datafusion_common::{ + exec_err, plan_err, DataFusionError, Result, ScalarValue, +}; +#[cfg(feature = "sql")] +use sqlparser::{ + ast::{self, ValueWithSpan}, + parser::ParserError::ParserError, +}; /// The frame specification determines which output rows are read by an aggregate /// window function. The ending frame boundary can be omitted if the `BETWEEN` @@ -115,6 +120,7 @@ impl fmt::Debug for WindowFrame { } } +#[cfg(feature = "sql")] impl TryFrom for WindowFrame { type Error = DataFusionError; @@ -343,6 +349,7 @@ impl WindowFrameBound { } impl WindowFrameBound { + #[cfg(feature = "sql")] fn try_parse( value: ast::WindowFrameBound, units: &ast::WindowFrameUnits, @@ -365,6 +372,7 @@ impl WindowFrameBound { } } +#[cfg(feature = "sql")] fn convert_frame_bound_to_scalar_value( v: ast::Expr, units: &ast::WindowFrameUnits, @@ -385,9 +393,9 @@ fn convert_frame_bound_to_scalar_value( let value = match *value { ast::Expr::Value(ValueWithSpan{value: ast::Value::SingleQuotedString(item), span: _}) => item, e => { - return sql_err!(ParserError(format!( + return exec_err!( "INTERVAL expression cannot be {e:?}" - ))); + ); } }; Ok(ScalarValue::try_from_string(value, &DataType::UInt64)?) 
@@ -408,9 +416,9 @@ fn convert_frame_bound_to_scalar_value( let result = match *value { ast::Expr::Value(ValueWithSpan{value: ast::Value::SingleQuotedString(item), span: _}) => item, e => { - return sql_err!(ParserError(format!( + return exec_err!( "INTERVAL expression cannot be {e:?}" - ))); + ); } }; if let Some(leading_field) = leading_field { @@ -477,6 +485,7 @@ impl fmt::Display for WindowFrameUnits { } } +#[cfg(feature = "sql")] impl From for WindowFrameUnits { fn from(value: ast::WindowFrameUnits) -> Self { match value { From 7c466de3fc861b1732be1aae2cac81d8adb53e91 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 1 May 2025 14:59:06 -0400 Subject: [PATCH 03/15] Working through more points of friction to remove sql as feature --- datafusion/common/src/column.rs | 17 ++- .../core/src/execution/session_state.rs | 37 +++-- datafusion/expr/Cargo.toml | 1 + datafusion/expr/src/expr.rs | 129 +++++++++++++++++- datafusion/expr/src/logical_plan/ddl.rs | 3 + datafusion/expr/src/planner.rs | 5 +- datafusion/expr/src/utils.rs | 3 + datafusion/proto/Cargo.toml | 2 +- datafusion/sql/Cargo.toml | 4 +- 9 files changed, 182 insertions(+), 19 deletions(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index f64ab76d5a12..4d4fbef24832 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -141,7 +141,14 @@ impl Column { }, ) } - + #[cfg(not(feature = "sql"))] + pub fn from_qualified_name(flat_name: impl Into) -> Self { + Self { + relation: None, + name: flat_name.into(), + spans: Spans::new(), + } + } /// Deserialize a fully qualified name string into a column preserving column text case #[cfg(feature = "sql")] pub fn from_qualified_name_ignore_case(flat_name: impl Into) -> Self { @@ -154,7 +161,10 @@ impl Column { }, ) } - + #[cfg(not(feature = "sql"))] + pub fn from_qualified_name_ignore_case(flat_name: impl Into) -> Self { + Self::from_qualified_name(flat_name) + } /// return the column's name. 
/// /// Note: This ignores the relation and returns the column name only. @@ -326,7 +336,6 @@ impl Column { } } -#[cfg(feature = "sql")] impl From<&str> for Column { fn from(c: &str) -> Self { Self::from_qualified_name(c) @@ -334,7 +343,6 @@ impl From<&str> for Column { } /// Create a column, cloning the string -#[cfg(feature = "sql")] impl From<&String> for Column { fn from(c: &String) -> Self { Self::from_qualified_name(c) @@ -342,7 +350,6 @@ impl From<&String> for Column { } /// Create a column, reusing the existing string -#[cfg(feature = "sql")] impl From for Column { fn from(c: String) -> Self { Self::from_qualified_name(c) diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index cde92fe526a9..5a27c769f1eb 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -51,7 +51,9 @@ use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_execution::TaskContext; use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::expr_rewriter::FunctionRewrite; -use datafusion_expr::planner::{ExprPlanner, TypePlanner}; +use datafusion_expr::planner::ExprPlanner; +#[cfg(feature = "sql")] +use datafusion_expr::planner::TypePlanner; use datafusion_expr::registry::{FunctionRegistry, SerializerRegistry}; use datafusion_expr::simplify::SimplifyInfo; use datafusion_expr::var_provider::{is_system_variables, VarType}; @@ -138,6 +140,7 @@ pub struct SessionState { /// Provides support for customizing the SQL planner, e.g. 
to add support for custom operators like `->>` or `?` expr_planners: Vec>, /// Provides support for customizing the SQL type planning + #[cfg(feature = "sql")] type_planner: Option>, /// Responsible for optimizing a logical plan optimizer: Optimizer, @@ -191,7 +194,8 @@ impl Debug for SessionState { /// Prefer having short fields at the top and long vector fields near the end /// Group fields by fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("SessionState") + let mut debug_struct = f.debug_struct("SessionState"); + let ret = debug_struct .field("session_id", &self.session_id) .field("config", &self.config) .field("runtime_env", &self.runtime_env) @@ -202,9 +206,12 @@ impl Debug for SessionState { .field("table_options", &self.table_options) .field("table_factories", &self.table_factories) .field("function_factory", &self.function_factory) - .field("expr_planners", &self.expr_planners) - .field("type_planner", &self.type_planner) - .field("query_planners", &self.query_planner) + .field("expr_planners", &self.expr_planners); + + #[cfg(feature = "sql")] + let ret = ret.field("type_planner", &self.type_planner); + + ret.field("query_planners", &self.query_planner) .field("analyzer", &self.analyzer) .field("optimizer", &self.optimizer) .field("physical_optimizers", &self.physical_optimizers) @@ -906,6 +913,7 @@ pub struct SessionStateBuilder { session_id: Option, analyzer: Option, expr_planners: Option>>, + #[cfg(feature = "sql")] type_planner: Option>, optimizer: Option, physical_optimizers: Option, @@ -942,6 +950,7 @@ impl SessionStateBuilder { session_id: None, analyzer: None, expr_planners: None, + #[cfg(feature = "sql")] type_planner: None, optimizer: None, physical_optimizers: None, @@ -991,6 +1000,7 @@ impl SessionStateBuilder { session_id: None, analyzer: Some(existing.analyzer), expr_planners: Some(existing.expr_planners), + #[cfg(feature = "sql")] type_planner: existing.type_planner, optimizer: Some(existing.optimizer), 
physical_optimizers: Some(existing.physical_optimizers), @@ -1132,6 +1142,7 @@ impl SessionStateBuilder { } /// Set the [`TypePlanner`] used to customize the behavior of the SQL planner. + #[cfg(feature = "sql")] pub fn with_type_planner(mut self, type_planner: Arc) -> Self { self.type_planner = Some(type_planner); self @@ -1343,6 +1354,7 @@ impl SessionStateBuilder { session_id, analyzer, expr_planners, + #[cfg(feature = "sql")] type_planner, optimizer, physical_optimizers, @@ -1372,6 +1384,7 @@ impl SessionStateBuilder { session_id: session_id.unwrap_or_else(|| Uuid::new_v4().to_string()), analyzer: analyzer.unwrap_or_default(), expr_planners: expr_planners.unwrap_or_default(), + #[cfg(feature = "sql")] type_planner, optimizer: optimizer.unwrap_or_default(), physical_optimizers: physical_optimizers.unwrap_or_default(), @@ -1490,6 +1503,7 @@ impl SessionStateBuilder { } /// Returns the current type_planner value + #[cfg(feature = "sql")] pub fn type_planner(&mut self) -> &mut Option> { &mut self.type_planner } @@ -1604,7 +1618,9 @@ impl Debug for SessionStateBuilder { /// Prefer having short fields at the top and long vector fields near the end /// Group fields by fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("SessionStateBuilder") + let mut debug_struct = f + .debug_struct("SessionStateBuilder"); + let ret = debug_struct .field("session_id", &self.session_id) .field("config", &self.config) .field("runtime_env", &self.runtime_env) @@ -1615,9 +1631,10 @@ impl Debug for SessionStateBuilder { .field("table_options", &self.table_options) .field("table_factories", &self.table_factories) .field("function_factory", &self.function_factory) - .field("expr_planners", &self.expr_planners) - .field("type_planner", &self.type_planner) - .field("query_planners", &self.query_planner) + .field("expr_planners", &self.expr_planners); + #[cfg(feature = "sql")] + let ret = ret.field("type_planner", &self.type_planner); + 
ret.field("query_planners", &self.query_planner) .field("analyzer_rules", &self.analyzer_rules) .field("analyzer", &self.analyzer) .field("optimizer_rules", &self.optimizer_rules) @@ -2011,8 +2028,10 @@ mod tests { use std::sync::Arc; #[test] + #[cfg(feature = "sql")] fn test_session_state_with_default_features() { // test array planners with and without builtin planners + #[cfg(feature = "sql")] fn sql_to_expr(state: &SessionState) -> Result { let provider = SessionContextProvider { state, diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index 108e1ed30fa8..b80cf23e104f 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml @@ -57,6 +57,7 @@ paste = "^1.0" recursive = { workspace = true, optional = true } serde_json = { workspace = true } sqlparser = { workspace = true, optional = true } +itertools = "0.14.0" [dev-dependencies] ctor = { workspace = true } diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index a0e9e7d33030..df05bee5d366 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -41,7 +41,7 @@ use datafusion_common::{ use datafusion_functions_window_common::field::WindowUDFFieldArgs; #[cfg(feature = "sql")] use sqlparser::ast::{ - display_comma_separated, ExceptSelectItem, ExcludeSelectItem, IlikeSelectItem, + display_comma_separated, ExceptSelectItem, ExcludeSelectItem, Ident, IdentWithAlias, IlikeSelectItem, RenameSelectItem, ReplaceSelectElement, }; @@ -1410,6 +1410,133 @@ impl GroupingSet { } } +#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)] +#[cfg(not(feature = "sql"))] +pub struct IlikeSelectItem{ + pub pattern: String, +} +#[cfg(not(feature = "sql"))] +impl Display for IlikeSelectItem { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!( + f, + "ILIKE '{}'", + &self.pattern + )?; + Ok(()) + } +} +#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)] +#[cfg(not(feature = "sql"))] +pub enum ExcludeSelectItem { + Single(Ident), + Multiple(Vec), 
+} +#[cfg(not(feature = "sql"))] +impl Display for ExcludeSelectItem { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "EXCLUDE")?; + match self { + Self::Single(column) => { + write!(f, " {column}")?; + } + Self::Multiple(columns) => { + write!(f, " ({})", display_comma_separated(columns))?; + } + } + Ok(()) + } +} +#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)] +#[cfg(not(feature = "sql"))] +pub struct ExceptSelectItem { + pub first_element: Ident, + pub additional_elements: Vec, +} +#[cfg(not(feature = "sql"))] +impl Display for ExceptSelectItem { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "EXCEPT ")?; + if self.additional_elements.is_empty() { + write!(f, "({})", self.first_element)?; + } else { + write!( + f, + "({}, {})", + self.first_element, + display_comma_separated(&self.additional_elements) + )?; + } + Ok(()) + } +} + +#[cfg(not(feature = "sql"))] +pub fn display_comma_separated(slice: &[T]) -> String +where + T: Display { + use itertools::Itertools; + slice.iter().map(|v| format!("{v}")).join(", ") +} + +#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)] +#[cfg(not(feature = "sql"))] +pub enum RenameSelectItem { + Single(String), + Multiple(Vec), +} +#[cfg(not(feature = "sql"))] +impl Display for RenameSelectItem { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "RENAME")?; + match self { + Self::Single(column) => { + write!(f, " {column}")?; + } + Self::Multiple(columns) => { + write!(f, " ({})", display_comma_separated(columns))?; + } + } + Ok(()) + } +} + +#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)] +#[cfg(not(feature = "sql"))] +pub struct Ident { + /// The value of the identifier without quotes. + pub value: String, + /// The starting quote if any. Valid quote characters are the single quote, + /// double quote, backtick, and opening square bracket. + pub quote_style: Option, + /// The span of the identifier in the original SQL string. 
+ pub span: String, +} +#[cfg(not(feature = "sql"))] +impl Display for Ident { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "[{}]", self.value) + } +} + +#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)] +#[cfg(not(feature = "sql"))] +pub struct ReplaceSelectElement { + pub expr: String, + pub column_name: Ident, + pub as_keyword: bool, +} +#[cfg(not(feature = "sql"))] +impl Display for ReplaceSelectElement { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + if self.as_keyword { + write!(f, "{} AS {}", self.expr, self.column_name) + } else { + write!(f, "{} {}", self.expr, self.column_name) + } + } +} + /// Additional options for wildcards, e.g. Snowflake `EXCLUDE`/`RENAME` and Bigquery `EXCEPT`. #[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug, Default)] pub struct WildcardOptions { diff --git a/datafusion/expr/src/logical_plan/ddl.rs b/datafusion/expr/src/logical_plan/ddl.rs index 827e2812ecae..0c2c7f1dc279 100644 --- a/datafusion/expr/src/logical_plan/ddl.rs +++ b/datafusion/expr/src/logical_plan/ddl.rs @@ -30,7 +30,10 @@ use datafusion_common::tree_node::{Transformed, TreeNodeContainer, TreeNodeRecur use datafusion_common::{ Constraints, DFSchemaRef, Result, SchemaReference, TableReference, }; +#[cfg(feature = "sql")] use sqlparser::ast::Ident; +#[cfg(not(feature = "sql"))] +use crate::expr::Ident; /// Various types of DDL (CREATE / DROP) catalog manipulation #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] diff --git a/datafusion/expr/src/planner.rs b/datafusion/expr/src/planner.rs index c8fe6fd9eace..d69bbbdddf2e 100644 --- a/datafusion/expr/src/planner.rs +++ b/datafusion/expr/src/planner.rs @@ -262,7 +262,10 @@ pub trait ExprPlanner: Debug + Send + Sync { /// custom expressions. 
#[derive(Debug, Clone)] pub struct RawBinaryExpr { + #[cfg(not(feature = "sql"))] pub op: Operator, + #[cfg(feature = "sql")] + pub op: sqlparser::ast::BinaryOperator, pub left: Expr, pub right: Expr, } @@ -328,7 +331,7 @@ pub trait TypePlanner: Debug + Send + Sync { /// Plan SQL [`ast::DataType`] to DataFusion [`DataType`] /// /// Returns None if not possible - fn plan_type(&self, _sql_type: &ast::DataType) -> Result> { + fn plan_type(&self, _sql_type: &sqlparser::ast::DataType) -> Result> { Ok(None) } } diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 2e364d0d2b80..7361c77cb5b5 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -39,7 +39,10 @@ use datafusion_common::{ }; use indexmap::IndexSet; +#[cfg(feature = "sql")] use sqlparser::ast::{ExceptSelectItem, ExcludeSelectItem}; +#[cfg(not(feature = "sql"))] +use crate::expr::{ExceptSelectItem, ExcludeSelectItem}; pub use datafusion_functions_aggregate_common::order::AggregateOrderSensitivity; diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml index b64c17589108..956887e1bb17 100644 --- a/datafusion/proto/Cargo.toml +++ b/datafusion/proto/Cargo.toml @@ -46,7 +46,7 @@ avro = ["datafusion/avro", "datafusion-common/avro"] [dependencies] arrow = { workspace = true } chrono = { workspace = true } -datafusion = { workspace = true, default-features = true } +datafusion = { workspace = true, default-features = false } datafusion-common = { workspace = true } datafusion-expr = { workspace = true } datafusion-proto-common = { workspace = true } diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml index 78603a1b2fd1..7bd775fb5393 100644 --- a/datafusion/sql/Cargo.toml +++ b/datafusion/sql/Cargo.toml @@ -50,8 +50,8 @@ recursive_protection = ["dep:recursive"] [dependencies] arrow = { workspace = true } bigdecimal = { workspace = true } -datafusion-common = { workspace = true } -datafusion-expr = { workspace = true } +datafusion-common = 
{ workspace = true, features = ["sql"] } +datafusion-expr = { workspace = true, features = ["sql"] } indexmap = { workspace = true } log = { workspace = true } recursive = { workspace = true, optional = true } From 4a6ca20bed5cd13ef5b0ced70655aebf236baf53 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 1 May 2025 19:52:40 -0400 Subject: [PATCH 04/15] Switch optimizer to compare full logical plan instead of hash of the plan in an effort to remove the hash functions from the final wasm blob. Probably need to revert or put under a feature flag. --- datafusion/optimizer/src/optimizer.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index 49806d6db344..6a697289bd90 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -325,8 +325,8 @@ impl Optimizer { let options = config.options(); let mut new_plan = plan; - let mut previous_plans = HashSet::with_capacity(16); - previous_plans.insert(LogicalPlanSignature::new(&new_plan)); + let mut previous_plans = Vec::with_capacity(16); + previous_plans.push(new_plan.clone()); let starting_schema = Arc::clone(new_plan.schema()); @@ -412,10 +412,10 @@ impl Optimizer { } log_plan(&format!("Optimized plan (pass {i})"), &new_plan); - // HashSet::insert returns, whether the value was newly inserted. 
- let plan_is_fresh = - previous_plans.insert(LogicalPlanSignature::new(&new_plan)); - if !plan_is_fresh { + let plan_is_fresh = previous_plans.contains(&new_plan); + if plan_is_fresh { + previous_plans.push(new_plan.clone()); + } else { // plan did not change, so no need to continue trying to optimize debug!("optimizer pass {i} did not make changes"); break; From f2a988939e2acea3ae667b4454e13cdf5f90f604 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 26 Aug 2025 08:17:56 -0400 Subject: [PATCH 05/15] Corrections after rebase --- Cargo.lock | 1 + datafusion/common/Cargo.toml | 2 +- datafusion/core/Cargo.toml | 8 +++++++- datafusion/core/src/physical_planner.rs | 1 - 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 10310734029e..c6d594ca9087 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2271,6 +2271,7 @@ dependencies = [ "env_logger", "indexmap 2.11.0", "insta", + "itertools 0.14.0", "paste", "recursive", "serde_json", diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 052c858b4a8b..22916925c116 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -74,7 +74,7 @@ parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" pyo3 = { version = "0.25", optional = true } recursive = { workspace = true, optional = true } -sqlparser = { workspace = true } +sqlparser = { workspace = true, optional = true } tokio = { workspace = true } [target.'cfg(target_family = "wasm")'.dependencies] diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 2083428139d9..80542efb3d4b 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -63,6 +63,7 @@ default = [ "compression", "parquet", "recursive_protection", + "sql", ] encoding_expressions = ["datafusion-functions/encoding_expressions"] # Used for testing ONLY: causes all values to hash to the same value (test for collisions) @@ -94,6 +95,11 @@ serde = [ # statements 
in `arrow-schema` crate "arrow-schema/serde", ] +sql = [ + "datafusion-common/sql", + "datafusion-sql", + "sqlparser", +] string_expressions = ["datafusion-functions/string_expressions"] unicode_expressions = [ "datafusion-sql/unicode_expressions", @@ -145,7 +151,7 @@ parquet = { workspace = true, optional = true, default-features = true } rand = { workspace = true } regex = { workspace = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } -sqlparser = { workspace = true } +sqlparser = { workspace = true, optional = true } tempfile = { workspace = true } tokio = { workspace = true } url = { workspace = true } diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 0fae3aacd5b1..1570d959efae 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -91,7 +91,6 @@ use datafusion_physical_plan::placeholder_row::PlaceholderRowExec; use datafusion_physical_plan::recursive_query::RecursiveQueryExec; use datafusion_physical_plan::unnest::ListUnnest; use datafusion_sql::TableReference; -use sqlparser::ast::NullTreatment; use async_trait::async_trait; use datafusion_physical_plan::async_func::{AsyncFuncExec, AsyncMapper}; From e3ad2f11510abeb9182bb82f5ec4b812f8f2ac70 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 26 Aug 2025 13:10:30 -0400 Subject: [PATCH 06/15] Resolve errors and clippy warnings after rebase --- benchmarks/Cargo.toml | 2 +- datafusion/common/src/column.rs | 7 +++---- .../common/src/file_options/json_writer.rs | 2 +- datafusion/common/src/table_reference.rs | 4 +++- datafusion/core/src/dataframe/mod.rs | 6 +++++- datafusion/core/src/execution/session_state.rs | 3 +-- datafusion/core/src/physical_planner.rs | 5 ++++- datafusion/core/tests/dataframe/mod.rs | 5 +++-- datafusion/core/tests/expr_api/mod.rs | 2 +- datafusion/expr/src/expr.rs | 14 +++++--------- datafusion/expr/src/logical_plan/ddl.rs | 4 ++-- 
datafusion/expr/src/planner.rs | 18 ++++++++++-------- datafusion/expr/src/utils.rs | 4 ++-- datafusion/expr/src/window_frame.rs | 14 +++++--------- datafusion/optimizer/Cargo.toml | 2 +- datafusion/optimizer/src/optimizer.rs | 12 ++++++------ datafusion/sql/src/expr/function.rs | 4 +++- 17 files changed, 56 insertions(+), 52 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 01c657c1f71b..5a846cb49e0c 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -38,7 +38,7 @@ mimalloc_extended = ["libmimalloc-sys/extended"] [dependencies] arrow = { workspace = true } datafusion = { workspace = true, default-features = true } -datafusion-common = { workspace = true } +datafusion-common = { workspace = true, default-features = true } env_logger = { workspace = true } futures = { workspace = true } libmimalloc-sys = { version = "0.1", optional = true } diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index 4d4fbef24832..83b68865d283 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -24,9 +24,7 @@ use crate::utils::quote_identifier; use crate::{DFSchema, Diagnostic, Result, SchemaError, Spans, TableReference}; use arrow::datatypes::{Field, FieldRef}; use std::collections::HashSet; -use std::convert::Infallible; use std::fmt; -use std::str::FromStr; /// A named reference to a qualified field in a schema. 
#[derive(Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] @@ -90,6 +88,7 @@ impl Column { /// /// For example, `foo.bar` would be represented as a two element vector /// `["foo", "bar"]` + #[cfg(feature = "sql")] fn from_idents(mut idents: Vec) -> Option { let (relation, name) = match idents.len() { 1 => (None, idents.remove(0)), @@ -371,8 +370,8 @@ impl From<(Option<&TableReference>, &FieldRef)> for Column { } #[cfg(feature = "sql")] -impl FromStr for Column { - type Err = Infallible; +impl std::str::FromStr for Column { + type Err = std::convert::Infallible; fn from_str(s: &str) -> Result { Ok(s.into()) diff --git a/datafusion/common/src/file_options/json_writer.rs b/datafusion/common/src/file_options/json_writer.rs index 889aec236af4..750d2972329b 100644 --- a/datafusion/common/src/file_options/json_writer.rs +++ b/datafusion/common/src/file_options/json_writer.rs @@ -20,7 +20,7 @@ use crate::{ config::JsonOptions, error::{DataFusionError, Result}, - parsers::CompressionTypeVariant + parsers::CompressionTypeVariant, }; /// Options for writing JSON files diff --git a/datafusion/common/src/table_reference.rs b/datafusion/common/src/table_reference.rs index 91cd7a0862e7..b05e6e0967a2 100644 --- a/datafusion/common/src/table_reference.rs +++ b/datafusion/common/src/table_reference.rs @@ -346,7 +346,9 @@ impl<'a> From<&'a str> for TableReference { #[cfg(not(feature = "sql"))] impl<'a> From<&'a String> for TableReference { fn from(s: &'a String) -> Self { - Self::Bare { table: s.as_str().into() } + Self::Bare { + table: s.as_str().into(), + } } } diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index b0bdc5b7bed1..9832c0e9db1e 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -50,7 +50,11 @@ use arrow::array::{Array, ArrayRef, Int64Array, StringArray}; use arrow::compute::{cast, concat}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use 
datafusion_common::config::{CsvOptions, JsonOptions}; -use datafusion_common::{exec_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema, DataFusionError, ParamValues, ScalarValue, SchemaError, TableReference, UnnestOptions}; +use datafusion_common::{ + exec_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema, + DataFusionError, ParamValues, ScalarValue, SchemaError, TableReference, + UnnestOptions, +}; use datafusion_expr::select_expr::SelectExpr; use datafusion_expr::{ case, diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 5a27c769f1eb..5f2e8f391e63 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -1618,8 +1618,7 @@ impl Debug for SessionStateBuilder { /// Prefer having short fields at the top and long vector fields near the end /// Group fields by fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let mut debug_struct = f - .debug_struct("SessionStateBuilder"); + let mut debug_struct = f.debug_struct("SessionStateBuilder"); let ret = debug_struct .field("session_id", &self.session_id) .field("config", &self.config) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 1570d959efae..32df06b9195c 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -71,7 +71,10 @@ use datafusion_common::{ use datafusion_datasource::file_groups::FileGroup; use datafusion_datasource::memory::MemorySourceConfig; use datafusion_expr::dml::{CopyTo, InsertOp}; -use datafusion_expr::expr::{physical_name, AggregateFunction, AggregateFunctionParams, Alias, GroupingSet, NullTreatment, WindowFunction, WindowFunctionParams}; +use datafusion_expr::expr::{ + physical_name, AggregateFunction, AggregateFunctionParams, Alias, GroupingSet, + NullTreatment, WindowFunction, WindowFunctionParams, +}; use 
datafusion_expr::expr_rewriter::unnormalize_cols; use datafusion_expr::logical_plan::builder::wrap_projection_for_join_if_necessary; use datafusion_expr::{ diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index a563459f42a1..aa984775e457 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -41,7 +41,6 @@ use datafusion_functions_nested::make_array::make_array_udf; use datafusion_functions_window::expr_fn::{first_value, row_number}; use insta::assert_snapshot; use object_store::local::LocalFileSystem; -use sqlparser::ast::NullTreatment; use std::collections::HashMap; use std::fs; use std::sync::Arc; @@ -71,7 +70,9 @@ use datafusion_common_runtime::SpawnedTask; use datafusion_datasource::file_format::format_as_file_type; use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::RuntimeEnv; -use datafusion_expr::expr::{FieldMetadata, GroupingSet, Sort, WindowFunction}; +use datafusion_expr::expr::{ + FieldMetadata, GroupingSet, NullTreatment, Sort, WindowFunction, +}; use datafusion_expr::var_provider::{VarProvider, VarType}; use datafusion_expr::{ cast, col, create_udf, exists, in_subquery, lit, out_ref_col, placeholder, diff --git a/datafusion/core/tests/expr_api/mod.rs b/datafusion/core/tests/expr_api/mod.rs index cc48c4312e59..4aee274de908 100644 --- a/datafusion/core/tests/expr_api/mod.rs +++ b/datafusion/core/tests/expr_api/mod.rs @@ -24,6 +24,7 @@ use arrow::util::pretty::{pretty_format_batches, pretty_format_columns}; use datafusion::prelude::*; use datafusion_common::{DFSchema, ScalarValue}; use datafusion_expr::execution_props::ExecutionProps; +use datafusion_expr::expr::NullTreatment; use datafusion_expr::simplify::SimplifyContext; use datafusion_expr::ExprFunctionExt; use datafusion_functions::core::expr_ext::FieldAccessor; @@ -33,7 +34,6 @@ use datafusion_functions_nested::expr_ext::{IndexAccessor, SliceAccessor}; use 
datafusion_optimizer::simplify_expressions::ExprSimplifier; /// Tests of using and evaluating `Expr`s outside the context of a LogicalPlan use std::sync::{Arc, LazyLock}; -use datafusion_expr::expr::NullTreatment; mod parse_sql_expr; mod simplification; diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index df05bee5d366..7a8ddba7be6d 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -41,7 +41,7 @@ use datafusion_common::{ use datafusion_functions_window_common::field::WindowUDFFieldArgs; #[cfg(feature = "sql")] use sqlparser::ast::{ - display_comma_separated, ExceptSelectItem, ExcludeSelectItem, Ident, IdentWithAlias, IlikeSelectItem, + display_comma_separated, ExceptSelectItem, ExcludeSelectItem, IlikeSelectItem, RenameSelectItem, ReplaceSelectElement, }; @@ -72,7 +72,6 @@ impl From for NullTreatment { } } - /// Represents logical expressions such as `A + 1`, or `CAST(c1 AS int)`. /// /// For example the expression `A + 1` will be represented as @@ -1412,17 +1411,13 @@ impl GroupingSet { #[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)] #[cfg(not(feature = "sql"))] -pub struct IlikeSelectItem{ +pub struct IlikeSelectItem { pub pattern: String, } #[cfg(not(feature = "sql"))] impl Display for IlikeSelectItem { fn fmt(&self, f: &mut Formatter) -> fmt::Result { - write!( - f, - "ILIKE '{}'", - &self.pattern - )?; + write!(f, "ILIKE '{}'", &self.pattern)?; Ok(()) } } @@ -1474,7 +1469,8 @@ impl Display for ExceptSelectItem { #[cfg(not(feature = "sql"))] pub fn display_comma_separated(slice: &[T]) -> String where - T: Display { + T: Display, +{ use itertools::Itertools; slice.iter().map(|v| format!("{v}")).join(", ") } diff --git a/datafusion/expr/src/logical_plan/ddl.rs b/datafusion/expr/src/logical_plan/ddl.rs index 0c2c7f1dc279..4ee394731e5b 100644 --- a/datafusion/expr/src/logical_plan/ddl.rs +++ b/datafusion/expr/src/logical_plan/ddl.rs @@ -24,6 +24,8 @@ use std::{ hash::{Hash, Hasher}, }; +#[cfg(not(feature = 
"sql"))] +use crate::expr::Ident; use crate::expr::Sort; use arrow::datatypes::DataType; use datafusion_common::tree_node::{Transformed, TreeNodeContainer, TreeNodeRecursion}; @@ -32,8 +34,6 @@ use datafusion_common::{ }; #[cfg(feature = "sql")] use sqlparser::ast::Ident; -#[cfg(not(feature = "sql"))] -use crate::expr::Ident; /// Various types of DDL (CREATE / DROP) catalog manipulation #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] diff --git a/datafusion/expr/src/planner.rs b/datafusion/expr/src/planner.rs index d69bbbdddf2e..955414df0d7e 100644 --- a/datafusion/expr/src/planner.rs +++ b/datafusion/expr/src/planner.rs @@ -20,17 +20,16 @@ use std::fmt::Debug; use std::sync::Arc; +use crate::expr::NullTreatment; +use crate::{ + AggregateUDF, Expr, GetFieldAccess, ScalarUDF, SortExpr, TableSource, WindowFrame, + WindowFunctionDefinition, WindowUDF, +}; use arrow::datatypes::{DataType, Field, SchemaRef}; use datafusion_common::{ config::ConfigOptions, file_options::file_type::FileType, not_impl_err, DFSchema, Result, TableReference, }; -use datafusion_expr_common::operator::Operator; -use crate::{ - AggregateUDF, Expr, GetFieldAccess, ScalarUDF, SortExpr, TableSource, WindowFrame, - WindowFunctionDefinition, WindowUDF, -}; -use crate::expr::NullTreatment; /// Provides the `SQL` query planner meta-data about tables and /// functions referenced in SQL statements, without a direct dependency on the @@ -263,7 +262,7 @@ pub trait ExprPlanner: Debug + Send + Sync { #[derive(Debug, Clone)] pub struct RawBinaryExpr { #[cfg(not(feature = "sql"))] - pub op: Operator, + pub op: datafusion_expr_common::operator::Operator, #[cfg(feature = "sql")] pub op: sqlparser::ast::BinaryOperator, pub left: Expr, @@ -331,7 +330,10 @@ pub trait TypePlanner: Debug + Send + Sync { /// Plan SQL [`ast::DataType`] to DataFusion [`DataType`] /// /// Returns None if not possible - fn plan_type(&self, _sql_type: &sqlparser::ast::DataType) -> Result> { + fn plan_type( + &self, + _sql_type: 
&sqlparser::ast::DataType, + ) -> Result> { Ok(None) } } diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 7361c77cb5b5..48d07c643f2c 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -38,11 +38,11 @@ use datafusion_common::{ Result, TableReference, }; +#[cfg(not(feature = "sql"))] +use crate::expr::{ExceptSelectItem, ExcludeSelectItem}; use indexmap::IndexSet; #[cfg(feature = "sql")] use sqlparser::ast::{ExceptSelectItem, ExcludeSelectItem}; -#[cfg(not(feature = "sql"))] -use crate::expr::{ExceptSelectItem, ExcludeSelectItem}; pub use datafusion_functions_aggregate_common::order::AggregateOrderSensitivity; diff --git a/datafusion/expr/src/window_frame.rs b/datafusion/expr/src/window_frame.rs index 44d0a1b04323..f72dc10a6950 100644 --- a/datafusion/expr/src/window_frame.rs +++ b/datafusion/expr/src/window_frame.rs @@ -24,18 +24,12 @@ //! - An EXCLUDE clause. use crate::{expr::Sort, lit}; -use arrow::datatypes::DataType; use std::fmt::{self, Formatter}; use std::hash::Hash; -use datafusion_common::{ - exec_err, plan_err, DataFusionError, Result, ScalarValue, -}; +use datafusion_common::{plan_err, Result, ScalarValue}; #[cfg(feature = "sql")] -use sqlparser::{ - ast::{self, ValueWithSpan}, - parser::ParserError::ParserError, -}; +use sqlparser::ast::{self, ValueWithSpan}; /// The frame specification determines which output rows are read by an aggregate /// window function. 
The ending frame boundary can be omitted if the `BETWEEN` @@ -122,7 +116,7 @@ impl fmt::Debug for WindowFrame { #[cfg(feature = "sql")] impl TryFrom for WindowFrame { - type Error = DataFusionError; + type Error = datafusion_common::error::DataFusionError; fn try_from(value: ast::WindowFrame) -> Result { let start_bound = WindowFrameBound::try_parse(value.start_bound, &value.units)?; @@ -377,6 +371,8 @@ fn convert_frame_bound_to_scalar_value( v: ast::Expr, units: &ast::WindowFrameUnits, ) -> Result { + use arrow::datatypes::DataType; + use datafusion_common::exec_err; match units { // For ROWS and GROUPS we are sure that the ScalarValue must be a non-negative integer ... ast::WindowFrameUnits::Rows | ast::WindowFrameUnits::Groups => match v { diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index 2b799f437bda..f10510e0973c 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -43,7 +43,7 @@ recursive_protection = ["dep:recursive"] [dependencies] arrow = { workspace = true } chrono = { workspace = true } -datafusion-common = { workspace = true } +datafusion-common = { workspace = true, default-features = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true } datafusion-physical-expr = { workspace = true } diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index 6a697289bd90..49806d6db344 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -325,8 +325,8 @@ impl Optimizer { let options = config.options(); let mut new_plan = plan; - let mut previous_plans = Vec::with_capacity(16); - previous_plans.push(new_plan.clone()); + let mut previous_plans = HashSet::with_capacity(16); + previous_plans.insert(LogicalPlanSignature::new(&new_plan)); let starting_schema = Arc::clone(new_plan.schema()); @@ -412,10 +412,10 @@ impl Optimizer { } log_plan(&format!("Optimized plan (pass {i})"), &new_plan); 
- let plan_is_fresh = previous_plans.contains(&new_plan); - if plan_is_fresh { - previous_plans.push(new_plan.clone()); - } else { + // HashSet::insert returns, whether the value was newly inserted. + let plan_is_fresh = + previous_plans.insert(LogicalPlanSignature::new(&new_plan)); + if !plan_is_fresh { // plan did not change, so no need to continue trying to optimize debug!("optimizer pass {i} did not make changes"); break; diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index 5c4a55e46ed5..772742850237 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -22,7 +22,9 @@ use datafusion_common::{ internal_datafusion_err, internal_err, not_impl_err, plan_datafusion_err, plan_err, DFSchema, Dependency, Diagnostic, Result, Span, }; -use datafusion_expr::expr::{NullTreatment, ScalarFunction, Unnest, WildcardOptions, WindowFunction}; +use datafusion_expr::expr::{ + NullTreatment, ScalarFunction, Unnest, WildcardOptions, WindowFunction, +}; use datafusion_expr::planner::{PlannerResult, RawAggregateExpr, RawWindowExpr}; use datafusion_expr::{expr, Expr, ExprSchemable, WindowFrame, WindowFunctionDefinition}; use sqlparser::ast::{ From 00e22024b7eaf8cf9c28fdcce71f3b5b963a2435 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 26 Aug 2025 13:59:11 -0400 Subject: [PATCH 07/15] Remove unused imports when not using sql feature --- .../core/src/execution/session_state.rs | 37 ++++++++++++------- datafusion/core/src/physical_planner.rs | 2 +- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 5f2e8f391e63..f6582909041a 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -24,8 +24,8 @@ use std::fmt::Debug; use std::sync::Arc; use crate::catalog::{CatalogProviderList, SchemaProvider, TableProviderFactory}; -use 
crate::datasource::cte_worktable::CteWorkTable; -use crate::datasource::file_format::{format_as_file_type, FileFormatFactory}; +use crate::datasource::file_format::FileFormatFactory; +#[cfg(feature = "sql")] use crate::datasource::provider_as_source; use crate::execution::context::{EmptySerializerRegistry, FunctionFactory, QueryPlanner}; use crate::execution::SessionStateDefaults; @@ -34,16 +34,15 @@ use datafusion_catalog::information_schema::{ InformationSchemaProvider, INFORMATION_SCHEMA, }; -use arrow::datatypes::{DataType, SchemaRef}; +use arrow::datatypes::DataType; use datafusion_catalog::MemoryCatalogProviderList; use datafusion_catalog::{TableFunction, TableFunctionImpl}; use datafusion_common::alias::AliasGenerator; use datafusion_common::config::{ConfigExtension, ConfigOptions, TableOptions}; use datafusion_common::display::{PlanType, StringifiedPlan, ToStringifiedPlan}; -use datafusion_common::file_options::file_type::FileType; use datafusion_common::tree_node::TreeNode; use datafusion_common::{ - config_err, exec_err, not_impl_err, plan_datafusion_err, DFSchema, DataFusionError, + config_err, exec_err, plan_datafusion_err, DFSchema, DataFusionError, ResolvedTableReference, TableReference, }; use datafusion_execution::config::SessionConfig; @@ -56,10 +55,10 @@ use datafusion_expr::planner::ExprPlanner; use datafusion_expr::planner::TypePlanner; use datafusion_expr::registry::{FunctionRegistry, SerializerRegistry}; use datafusion_expr::simplify::SimplifyInfo; -use datafusion_expr::var_provider::{is_system_variables, VarType}; +#[cfg(feature = "sql")] +use datafusion_expr::TableSource; use datafusion_expr::{ - AggregateUDF, Explain, Expr, ExprSchemable, LogicalPlan, ScalarUDF, TableSource, - WindowUDF, + AggregateUDF, Explain, Expr, ExprSchemable, LogicalPlan, ScalarUDF, WindowUDF, }; use datafusion_optimizer::simplify_expressions::ExprSimplifier; use datafusion_optimizer::{ @@ -394,7 +393,7 @@ impl SessionState { .parse_statements()?; if statements.len() 
> 1 { - return not_impl_err!( + return datafusion_common::not_impl_err!( "The context currently only supports a single SQL statement" ); } @@ -1664,6 +1663,7 @@ impl From for SessionStateBuilder { /// /// This is used so the SQL planner can access the state of the session without /// having a direct dependency on the [`SessionState`] struct (and core crate) +#[cfg(feature = "sql")] struct SessionContextProvider<'a> { state: &'a SessionState, tables: HashMap>, @@ -1723,9 +1723,11 @@ impl ContextProvider for SessionContextProvider<'_> { fn create_cte_work_table( &self, name: &str, - schema: SchemaRef, + schema: arrow::datatypes::SchemaRef, ) -> datafusion_common::Result> { - let table = Arc::new(CteWorkTable::new(name, schema)); + let table = Arc::new(crate::datasource::cte_worktable::CteWorkTable::new( + name, schema, + )); Ok(provider_as_source(table)) } @@ -1742,6 +1744,8 @@ impl ContextProvider for SessionContextProvider<'_> { } fn get_variable_type(&self, variable_names: &[String]) -> Option { + use datafusion_expr::var_provider::{is_system_variables, VarType}; + if variable_names.is_empty() { return None; } @@ -1775,14 +1779,21 @@ impl ContextProvider for SessionContextProvider<'_> { self.state.window_functions().keys().cloned().collect() } - fn get_file_type(&self, ext: &str) -> datafusion_common::Result> { + fn get_file_type( + &self, + ext: &str, + ) -> datafusion_common::Result< + Arc, + > { self.state .file_formats .get(&ext.to_lowercase()) .ok_or(plan_datafusion_err!( "There is no registered file format with ext {ext}" )) - .map(|file_type| format_as_file_type(Arc::clone(file_type))) + .map(|file_type| { + crate::datasource::file_format::format_as_file_type(Arc::clone(file_type)) + }) } } diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 32df06b9195c..c58c155dd7ca 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -64,6 +64,7 @@ use 
datafusion_common::display::ToStringifiedPlan; use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeVisitor, }; +use datafusion_common::TableReference; use datafusion_common::{ exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, ScalarValue, @@ -93,7 +94,6 @@ use datafusion_physical_plan::execution_plan::InvariantLevel; use datafusion_physical_plan::placeholder_row::PlaceholderRowExec; use datafusion_physical_plan::recursive_query::RecursiveQueryExec; use datafusion_physical_plan::unnest::ListUnnest; -use datafusion_sql::TableReference; use async_trait::async_trait; use datafusion_physical_plan::async_func::{AsyncFuncExec, AsyncMapper}; From 36272cf96f3bc25efd9d1835f1e94628623d5184 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 26 Aug 2025 15:36:33 -0400 Subject: [PATCH 08/15] Working through more feature gating --- Cargo.lock | 1 + datafusion/common/src/config.rs | 4 +++- datafusion/common/src/table_reference.rs | 6 +++--- datafusion/common/src/utils/mod.rs | 3 ++- datafusion/functions-nested/Cargo.toml | 7 ++++++- datafusion/functions-nested/src/planner.rs | 18 +++++++++--------- datafusion/substrait/Cargo.toml | 2 +- 7 files changed, 25 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c6d594ca9087..c909130f039c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2384,6 +2384,7 @@ dependencies = [ "datafusion-doc", "datafusion-execution", "datafusion-expr", + "datafusion-expr-common", "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-aggregate-common", diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index cdd8e72a06cc..269c44661c41 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -2671,9 +2671,11 @@ impl Display for OutputFormat { #[cfg(test)] mod tests { + #[cfg(feature = "parquet")] + use crate::config::TableParquetOptions; use crate::config::{ ConfigEntry, 
ConfigExtension, ConfigField, ConfigFileType, ExtensionOptions, - Extensions, TableOptions, TableParquetOptions, + Extensions, TableOptions, }; use std::any::Any; use std::collections::HashMap; diff --git a/datafusion/common/src/table_reference.rs b/datafusion/common/src/table_reference.rs index b05e6e0967a2..efc2908ae395 100644 --- a/datafusion/common/src/table_reference.rs +++ b/datafusion/common/src/table_reference.rs @@ -406,16 +406,16 @@ mod tests { #[test] fn test_table_reference_to_vector() { - let table_reference = TableReference::parse_str("table"); + let table_reference = TableReference::from("table"); assert_eq!(vec!["table".to_string()], table_reference.to_vec()); - let table_reference = TableReference::parse_str("schema.table"); + let table_reference = TableReference::from("schema.table"); assert_eq!( vec!["schema".to_string(), "table".to_string()], table_reference.to_vec() ); - let table_reference = TableReference::parse_str("catalog.schema.table"); + let table_reference = TableReference::from("catalog.schema.table"); assert_eq!( vec![ "catalog".to_string(), diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index 9a733bbb1e2b..b934d7494f02 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -886,11 +886,12 @@ pub fn take_function_args( }) } -#[cfg(test)] +#[cfg(all(test, feature = "sql"))] mod tests { use super::*; use crate::ScalarValue::Null; use arrow::array::Float64Array; + use sqlparser::ast::Ident; use sqlparser::tokenizer::Span; #[test] diff --git a/datafusion/functions-nested/Cargo.toml b/datafusion/functions-nested/Cargo.toml index 87a480e16003..15af9129b712 100644 --- a/datafusion/functions-nested/Cargo.toml +++ b/datafusion/functions-nested/Cargo.toml @@ -37,13 +37,18 @@ workspace = true [lib] name = "datafusion_functions_nested" +[features] +default = ["sql"] +sql = ["datafusion-common/sql"] + [dependencies] arrow = { workspace = true } arrow-ord = { workspace = 
true } datafusion-common = { workspace = true } datafusion-doc = { workspace = true } datafusion-execution = { workspace = true } -datafusion-expr = { workspace = true } +datafusion-expr = { workspace = true, default-features = false } +datafusion-expr-common = { workspace = true } datafusion-functions = { workspace = true } datafusion-functions-aggregate = { workspace = true } datafusion-functions-aggregate-common = { workspace = true } diff --git a/datafusion/functions-nested/src/planner.rs b/datafusion/functions-nested/src/planner.rs index 369eaecb1905..b7a9b878c69e 100644 --- a/datafusion/functions-nested/src/planner.rs +++ b/datafusion/functions-nested/src/planner.rs @@ -22,11 +22,15 @@ use datafusion_common::ExprSchema; use datafusion_common::{plan_err, utils::list_ndims, DFSchema, Result}; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::expr::{AggregateFunction, AggregateFunctionParams}; +#[cfg(feature = "sql")] +use datafusion_expr::sqlparser::ast::BinaryOperator; use datafusion_expr::AggregateUDF; use datafusion_expr::{ planner::{ExprPlanner, PlannerResult, RawBinaryExpr, RawFieldAccessExpr}, - sqlparser, Expr, ExprSchemable, GetFieldAccess, + Expr, ExprSchemable, GetFieldAccess, }; +#[cfg(not(feature = "sql"))] +use datafusion_expr_common::operator::Operator as BinaryOperator; use datafusion_functions::core::get_field as get_field_inner; use datafusion_functions::expr_fn::get_field; use datafusion_functions_aggregate::nth_value::nth_value_udaf; @@ -51,7 +55,7 @@ impl ExprPlanner for NestedFunctionPlanner { ) -> Result> { let RawBinaryExpr { op, left, right } = expr; - if op == sqlparser::ast::BinaryOperator::StringConcat { + if op == BinaryOperator::StringConcat { let left_type = left.get_type(schema)?; let right_type = right.get_type(schema)?; let left_list_ndims = list_ndims(&left_type); @@ -75,18 +79,14 @@ impl ExprPlanner for NestedFunctionPlanner { } else if left_list_ndims < right_list_ndims { return 
Ok(PlannerResult::Planned(array_prepend(left, right))); } - } else if matches!( - op, - sqlparser::ast::BinaryOperator::AtArrow - | sqlparser::ast::BinaryOperator::ArrowAt - ) { + } else if matches!(op, BinaryOperator::AtArrow | BinaryOperator::ArrowAt) { let left_type = left.get_type(schema)?; let right_type = right.get_type(schema)?; let left_list_ndims = list_ndims(&left_type); let right_list_ndims = list_ndims(&right_type); // if both are list if left_list_ndims > 0 && right_list_ndims > 0 { - if op == sqlparser::ast::BinaryOperator::AtArrow { + if op == BinaryOperator::AtArrow { // array1 @> array2 -> array_has_all(array1, array2) return Ok(PlannerResult::Planned(array_has_all(left, right))); } else { @@ -123,7 +123,7 @@ impl ExprPlanner for NestedFunctionPlanner { } fn plan_any(&self, expr: RawBinaryExpr) -> Result> { - if expr.op == sqlparser::ast::BinaryOperator::Eq { + if expr.op == BinaryOperator::Eq { Ok(PlannerResult::Planned(Expr::ScalarFunction( ScalarFunction::new_udf( array_has_udf(), diff --git a/datafusion/substrait/Cargo.toml b/datafusion/substrait/Cargo.toml index 63a69b48866b..dfdf4899fc9b 100644 --- a/datafusion/substrait/Cargo.toml +++ b/datafusion/substrait/Cargo.toml @@ -34,7 +34,7 @@ workspace = true async-recursion = "1.0" async-trait = { workspace = true } chrono = { workspace = true } -datafusion = { workspace = true } +datafusion = { workspace = true, features = ["sql"] } itertools = { workspace = true } object_store = { workspace = true } pbjson-types = { workspace = true } From 6139746d36505692d1be3763852852b834393c99 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 26 Aug 2025 16:48:01 -0400 Subject: [PATCH 09/15] Working through more feature gating --- .github/workflows/rust.yml | 2 +- Cargo.toml | 4 ++-- datafusion-cli/Cargo.toml | 3 ++- datafusion/common/src/utils/mod.rs | 2 +- datafusion/expr/Cargo.toml | 2 +- datafusion/functions-nested/Cargo.toml | 2 +- datafusion/proto/Cargo.toml | 4 +++- 7 files changed, 11 
insertions(+), 8 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 05a6d70f0278..abd42f170712 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -579,7 +579,7 @@ jobs: with: rust-version: stable - name: Run datafusion-common tests - run: cargo test --profile ci -p datafusion-common --features=pyarrow + run: cargo test --profile ci -p datafusion-common --features=pyarrow,sql vendor: name: Verify Vendored Code diff --git a/Cargo.toml b/Cargo.toml index 375c5d6a8839..b29cf5d7e376 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -121,8 +121,8 @@ datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "49. datafusion-datasource-json = { path = "datafusion/datasource-json", version = "49.0.0", default-features = false } datafusion-datasource-parquet = { path = "datafusion/datasource-parquet", version = "49.0.0", default-features = false } datafusion-doc = { path = "datafusion/doc", version = "49.0.0" } -datafusion-execution = { path = "datafusion/execution", version = "49.0.0", default-features = false } -datafusion-expr = { path = "datafusion/expr", version = "49.0.0", default-features = false } +datafusion-execution = { path = "datafusion/execution", version = "49.0.0", default-features = false } +datafusion-expr = { path = "datafusion/expr", version = "49.0.0", default-features = false } datafusion-expr-common = { path = "datafusion/expr-common", version = "49.0.0" } datafusion-ffi = { path = "datafusion/ffi", version = "49.0.0" } datafusion-functions = { path = "datafusion/functions", version = "49.0.0" } diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 394adbb6105f..052fcf69b259 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -42,6 +42,7 @@ aws-credential-types = "1.2.6" clap = { version = "4.5.47", features = ["derive", "cargo"] } datafusion = { workspace = true, features = [ "avro", + "compression", "crypto_expressions", 
"datetime_expressions", "encoding_expressions", @@ -50,8 +51,8 @@ datafusion = { workspace = true, features = [ "parquet_encryption", "recursive_protection", "regex_expressions", + "sql", "unicode_expressions", - "compression", ] } dirs = "6.0.0" env_logger = { workspace = true } diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index b934d7494f02..fbd009e5779d 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -886,7 +886,7 @@ pub fn take_function_args( }) } -#[cfg(all(test, feature = "sql"))] +#[cfg(test)] mod tests { use super::*; use crate::ScalarValue::Null; diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index b80cf23e104f..2a8e5ecfb1ab 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml @@ -53,11 +53,11 @@ datafusion-functions-aggregate-common = { workspace = true } datafusion-functions-window-common = { workspace = true } datafusion-physical-expr-common = { workspace = true } indexmap = { workspace = true } +itertools = "0.14.0" paste = "^1.0" recursive = { workspace = true, optional = true } serde_json = { workspace = true } sqlparser = { workspace = true, optional = true } -itertools = "0.14.0" [dev-dependencies] ctor = { workspace = true } diff --git a/datafusion/functions-nested/Cargo.toml b/datafusion/functions-nested/Cargo.toml index 15af9129b712..9c0b7a16f9a9 100644 --- a/datafusion/functions-nested/Cargo.toml +++ b/datafusion/functions-nested/Cargo.toml @@ -39,7 +39,7 @@ name = "datafusion_functions_nested" [features] default = ["sql"] -sql = ["datafusion-common/sql"] +sql = ["datafusion-expr/sql"] [dependencies] arrow = { workspace = true } diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml index 956887e1bb17..9dc433f7efcc 100644 --- a/datafusion/proto/Cargo.toml +++ b/datafusion/proto/Cargo.toml @@ -46,7 +46,7 @@ avro = ["datafusion/avro", "datafusion-common/avro"] [dependencies] arrow = { workspace = true } 
chrono = { workspace = true } -datafusion = { workspace = true, default-features = false } +datafusion = { workspace = true, default-features = false, features = ["parquet", "nested_expressions"] } datafusion-common = { workspace = true } datafusion-expr = { workspace = true } datafusion-proto-common = { workspace = true } @@ -55,7 +55,9 @@ pbjson = { workspace = true, optional = true } prost = { workspace = true } serde = { version = "1.0", optional = true } serde_json = { workspace = true, optional = true } + [dev-dependencies] +datafusion = { workspace = true, default-features = false, features = ["sql"] } datafusion-functions = { workspace = true, default-features = true } datafusion-functions-aggregate = { workspace = true } datafusion-functions-window-common = { workspace = true } From d5b04ea18fcd2d8c9bbc27e005b1bd49cb160f79 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 26 Aug 2025 16:54:42 -0400 Subject: [PATCH 10/15] wasmtest required sql --- datafusion/wasmtest/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/wasmtest/Cargo.toml b/datafusion/wasmtest/Cargo.toml index 2fc9f0df3084..8de9534980d8 100644 --- a/datafusion/wasmtest/Cargo.toml +++ b/datafusion/wasmtest/Cargo.toml @@ -45,7 +45,7 @@ chrono = { version = "0.4", features = ["wasmbind"] } # all the `std::fmt` and `std::panicking` infrastructure, so isn't great for # code size when deploying. 
console_error_panic_hook = { version = "0.1.1", optional = true } -datafusion = { workspace = true, features = ["parquet"] } +datafusion = { workspace = true, features = ["parquet", "sql"] } datafusion-common = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } From ddaf8a38dc9b970b32ade2eb8a622f24ec46acc7 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 26 Aug 2025 17:07:39 -0400 Subject: [PATCH 11/15] working on CI --- datafusion/common/Cargo.toml | 1 + datafusion/common/src/utils/mod.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 22916925c116..bb5f53084a41 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -84,3 +84,4 @@ web-time = "1.1.0" chrono = { workspace = true } insta = { workspace = true } rand = { workspace = true } +sqlparser = { workspace = true } diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index fbd009e5779d..c7f57980abc1 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -1088,6 +1088,7 @@ mod tests { Ok(()) } + #[cfg(feature = "sql")] #[test] fn test_quote_identifier() -> Result<()> { let cases = vec![ From 95eff0df7d0305b5071dd354ef433ec3a8435081 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 26 Aug 2025 19:04:59 -0400 Subject: [PATCH 12/15] Update docs --- datafusion/expr/src/planner.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/expr/src/planner.rs b/datafusion/expr/src/planner.rs index 955414df0d7e..25a0f83947ee 100644 --- a/datafusion/expr/src/planner.rs +++ b/datafusion/expr/src/planner.rs @@ -327,7 +327,7 @@ pub enum PlannerResult { /// Customize planning SQL types to DataFusion (Arrow) types. 
#[cfg(feature = "sql")] pub trait TypePlanner: Debug + Send + Sync { - /// Plan SQL [`ast::DataType`] to DataFusion [`DataType`] + /// Plan SQL [`sqlparser::ast::DataType`] to DataFusion [`DataType`] /// /// Returns None if not possible fn plan_type( From 28a0d0111a777a16529f4163785f20e4e6fd034d Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 5 Sep 2025 09:06:33 -0400 Subject: [PATCH 13/15] Remove some duplicate code by doing a bit of sql-like parsing of identifiers --- datafusion/common/src/column.rs | 14 ++----- datafusion/common/src/table_reference.rs | 46 ++++++-------------- datafusion/common/src/utils/mod.rs | 53 ++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 45 deletions(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index 83b68865d283..c7f0b5a4f488 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -18,7 +18,6 @@ //! Column use crate::error::{_schema_err, add_possible_columns_to_diag}; -#[cfg(feature = "sql")] use crate::utils::parse_identifiers_normalized; use crate::utils::quote_identifier; use crate::{DFSchema, Diagnostic, Result, SchemaError, Spans, TableReference}; @@ -88,7 +87,6 @@ impl Column { /// /// For example, `foo.bar` would be represented as a two element vector /// `["foo", "bar"]` - #[cfg(feature = "sql")] fn from_idents(mut idents: Vec) -> Option { let (relation, name) = match idents.len() { 1 => (None, idents.remove(0)), @@ -129,7 +127,6 @@ impl Column { /// Treats the name as a SQL identifier. 
For example /// `foo.BAR` would be parsed to a reference to relation `foo`, column name `bar` (lower case) /// where `"foo.BAR"` would be parsed to a reference to column named `foo.BAR` - #[cfg(feature = "sql")] pub fn from_qualified_name(flat_name: impl Into) -> Self { let flat_name = flat_name.into(); Self::from_idents(parse_identifiers_normalized(&flat_name, false)).unwrap_or_else( @@ -140,14 +137,7 @@ impl Column { }, ) } - #[cfg(not(feature = "sql"))] - pub fn from_qualified_name(flat_name: impl Into) -> Self { - Self { - relation: None, - name: flat_name.into(), - spans: Spans::new(), - } - } + /// Deserialize a fully qualified name string into a column preserving column text case #[cfg(feature = "sql")] pub fn from_qualified_name_ignore_case(flat_name: impl Into) -> Self { @@ -160,10 +150,12 @@ impl Column { }, ) } + #[cfg(not(feature = "sql"))] pub fn from_qualified_name_ignore_case(flat_name: impl Into) -> Self { Self::from_qualified_name(flat_name) } + /// return the column's name. /// /// Note: This ignores the relation and returns the column name only. diff --git a/datafusion/common/src/table_reference.rs b/datafusion/common/src/table_reference.rs index efc2908ae395..7cf8e7af1a79 100644 --- a/datafusion/common/src/table_reference.rs +++ b/datafusion/common/src/table_reference.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. -#[cfg(feature = "sql")] use crate::utils::parse_identifiers_normalized; use crate::utils::quote_identifier; use std::sync::Arc; @@ -271,7 +270,6 @@ impl TableReference { /// Forms a [`TableReference`] by parsing `s` as a multipart SQL /// identifier. See docs on [`TableReference`] for more details. 
- #[cfg(feature = "sql")] pub fn parse_str(s: &str) -> Self { let mut parts = parse_identifiers_normalized(s, false); @@ -315,50 +313,24 @@ impl TableReference { /// Parse a string into a TableReference, normalizing where appropriate /// /// See full details on [`TableReference::parse_str`] -#[cfg(feature = "sql")] impl<'a> From<&'a str> for TableReference { fn from(s: &'a str) -> Self { Self::parse_str(s) } } -#[cfg(feature = "sql")] impl<'a> From<&'a String> for TableReference { fn from(s: &'a String) -> Self { Self::parse_str(s) } } -#[cfg(feature = "sql")] impl From for TableReference { fn from(s: String) -> Self { Self::parse_str(&s) } } -#[cfg(not(feature = "sql"))] -impl<'a> From<&'a str> for TableReference { - fn from(s: &'a str) -> Self { - Self::Bare { table: s.into() } - } -} - -#[cfg(not(feature = "sql"))] -impl<'a> From<&'a String> for TableReference { - fn from(s: &'a String) -> Self { - Self::Bare { - table: s.as_str().into(), - } - } -} - -#[cfg(not(feature = "sql"))] -impl From for TableReference { - fn from(s: String) -> Self { - Self::Bare { table: s.into() } - } -} - impl From for TableReference { fn from(resolved: ResolvedTableReference) -> Self { Self::Full { @@ -396,12 +368,18 @@ mod tests { let actual = TableReference::from("TABLE"); assert_eq!(expected, actual); - // if fail to parse, take entire input string as identifier - let expected = TableReference::Bare { - table: "TABLE()".into(), - }; - let actual = TableReference::from("TABLE()"); - assert_eq!(expected, actual); + // Disable this test for non-sql features so that we don't need to reproduce + // things like table function upper case conventions, since those will not + // be used if SQL is not selected. 
+ #[cfg(feature = "sql")] + { + // if fail to parse, take entire input string as identifier + let expected = TableReference::Bare { + table: "TABLE()".into(), + }; + let actual = TableReference::from("TABLE()"); + assert_eq!(expected, actual); + } } #[test] diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index c7f57980abc1..7443625276d0 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -302,6 +302,59 @@ pub(crate) fn parse_identifiers_normalized(s: &str, ignore_case: bool) -> Vec>() } +#[cfg(not(feature = "sql"))] +pub(crate) fn parse_identifiers(s: &str) -> Result> { + let mut result = Vec::new(); + let mut current = String::new(); + let mut in_quotes = false; + + for ch in s.chars() { + match ch { + '"' => { + in_quotes = !in_quotes; + current.push(ch); + } + '.' if !in_quotes => { + result.push(current.clone()); + current.clear(); + } + _ => { + current.push(ch); + } + } + } + + // Push the last part if it's not empty + if !current.is_empty() { + result.push(current); + } + + Ok(result) +} + +#[cfg(not(feature = "sql"))] +pub(crate) fn parse_identifiers_normalized(s: &str, ignore_case: bool) -> Vec { + parse_identifiers(s) + .unwrap_or_default() + .into_iter() + .map(|id| { + let is_double_quoted = if id.len() > 2 { + let mut chars = id.chars(); + chars.next() == Some('"') && chars.last() == Some('"') + } else { + false + }; + if is_double_quoted { + id[1..id.len() - 1].to_string().replace("\"\"", "\"") + } else if ignore_case { + id + } else { + id.to_ascii_lowercase() + } + }) + .collect::>() +} + /// This function "takes" the elements at `indices` from the slice `items`. 
pub fn get_at_indices>( items: &[T], From d40eb8be7c6a8d17fa2870bbe8ae045f53938745 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 5 Sep 2025 14:55:11 -0400 Subject: [PATCH 14/15] Conditionally import sql features --- datafusion/core/Cargo.toml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 80542efb3d4b..e61ab2f99005 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -97,12 +97,13 @@ serde = [ ] sql = [ "datafusion-common/sql", + "datafusion-functions-nested?/sql", "datafusion-sql", "sqlparser", ] string_expressions = ["datafusion-functions/string_expressions"] unicode_expressions = [ - "datafusion-sql/unicode_expressions", + "datafusion-sql?/unicode_expressions", "datafusion-functions/unicode_expressions", ] extended_tests = [] @@ -125,11 +126,11 @@ datafusion-datasource-csv = { workspace = true } datafusion-datasource-json = { workspace = true } datafusion-datasource-parquet = { workspace = true, optional = true } datafusion-execution = { workspace = true } -datafusion-expr = { workspace = true } +datafusion-expr = { workspace = true, default-features = false } datafusion-expr-common = { workspace = true } datafusion-functions = { workspace = true } datafusion-functions-aggregate = { workspace = true } -datafusion-functions-nested = { workspace = true, optional = true } +datafusion-functions-nested = { workspace = true, default-features = false, optional = true } datafusion-functions-table = { workspace = true } datafusion-functions-window = { workspace = true } datafusion-optimizer = { workspace = true } From 7066b0ea65838dd431ee691fa0d75e7d9c141ed4 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 5 Sep 2025 15:22:29 -0400 Subject: [PATCH 15/15] Set default features to false so we do not always pull in sql --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index b29cf5d7e376..cf7e03db0391 
100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -128,7 +128,7 @@ datafusion-ffi = { path = "datafusion/ffi", version = "49.0.0" } datafusion-functions = { path = "datafusion/functions", version = "49.0.0" } datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "49.0.0" } datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "49.0.0" } -datafusion-functions-nested = { path = "datafusion/functions-nested", version = "49.0.0" } +datafusion-functions-nested = { path = "datafusion/functions-nested", version = "49.0.0", default-features = false } datafusion-functions-table = { path = "datafusion/functions-table", version = "49.0.0" } datafusion-functions-window = { path = "datafusion/functions-window", version = "49.0.0" } datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "49.0.0" }