From 16c03912787a4b8effc8dbd074d7cb39b1f6b3f9 Mon Sep 17 00:00:00 2001 From: Aldrin M Date: Mon, 20 Oct 2025 15:30:16 -0700 Subject: [PATCH 1/3] fix: protobuf -> plan normalizes column relations This fixes a semantic error when reading a logical plan from protobuf. Existing behavior is to use the `relation` field of `ColumnRelation` message to construct a `TableReference`. However, the `relation` field is a string and `From for TableReference` always calls parse_identifiers_normalized with `ignore_case: False`, which always normalizes the identifier to lower case. New behavior is to implement `From for TableReference` which calls a new method, `parse_str_normalized`, with `ignore_case: True`. Overall, if normalization occurs, it should happen prior to serialization to protobuf; thus, deserialization from protobuf should not normalize (if it is desirable, though, `parse_str_normalized` propagates its boolean parameter to `parse_identifiers_normalized` unlike `parse_str`). Issue: #18122 --- datafusion/common/src/table_reference.rs | 54 +++++++++++++------ datafusion/proto-common/src/from_proto/mod.rs | 16 +++++- 2 files changed, 53 insertions(+), 17 deletions(-) diff --git a/datafusion/common/src/table_reference.rs b/datafusion/common/src/table_reference.rs index 9b6f9696c00b..8cc692cbbbf6 100644 --- a/datafusion/common/src/table_reference.rs +++ b/datafusion/common/src/table_reference.rs @@ -268,24 +268,48 @@ impl TableReference { } /// Forms a [`TableReference`] by parsing `s` as a multipart SQL - /// identifier. See docs on [`TableReference`] for more details. + /// identifier, normalizing `s` to lowercase. + /// See docs on [`TableReference`] for more details. pub fn parse_str(s: &str) -> Self { - let mut parts = parse_identifiers_normalized(s, false); + Self::parse_str_normalized(s, false) + } + + /// Forms a [`TableReference`] by parsing `s` as a multipart SQL + /// identifier, normalizing `s` to lowercase if `ignore_case` is `false`. + /// See docs on [`TableReference`] for more details. + pub fn parse_str_normalized(s: &str, ignore_case: bool) -> Self { + let table_parts = parse_identifiers_normalized(s, ignore_case); + + Self::from_vec(table_parts) + .unwrap_or(Self::Bare { table: s.into() }) + } + /// Compose a [`TableReference`] from separate parts. The input vector should contain + /// at most three elements in the following sequence: + /// ```no_rust + /// [, , table] + /// ``` + pub fn from_vec(mut parts: Vec) -> Option { match parts.len() { - 1 => Self::Bare { - table: parts.remove(0).into(), - }, - 2 => Self::Partial { - schema: parts.remove(0).into(), - table: parts.remove(0).into(), - }, - 3 => Self::Full { - catalog: parts.remove(0).into(), - schema: parts.remove(0).into(), - table: parts.remove(0).into(), - }, - _ => Self::Bare { table: s.into() }, + 1 => Some( + Self::Bare { + table: parts.remove(0).into(), + } + ), + 2 => Some( + Self::Partial { + schema: parts.remove(0).into(), + table: parts.remove(0).into(), + } + ), + 3 => Some( + Self::Full { + catalog: parts.remove(0).into(), + schema: parts.remove(0).into(), + table: parts.remove(0).into(), + } + ), + _ => None, } } diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs index bd969db31687..469448f83a89 100644 --- a/datafusion/proto-common/src/from_proto/mod.rs +++ b/datafusion/proto-common/src/from_proto/mod.rs @@ -138,11 +138,23 @@ where } } +impl From for TableReference { + fn from(rel: protobuf::ColumnRelation) -> Self { + Self::parse_str_normalized(rel.relation.as_str(), true) + } +} + +impl From<&protobuf::ColumnRelation> for TableReference { + fn from(rel: &protobuf::ColumnRelation) -> Self { + rel.clone().into() + } +} + impl From for Column { fn from(c: protobuf::Column) -> Self { let protobuf::Column { relation, name } = c; - Self::new(relation.map(|r| r.relation), name) + Self::new(relation, name) } } @@ -167,7 +179,7 @@ impl TryFrom<&protobuf::DfSchema> for DFSchema { df_field .qualifier .as_ref() - .map(|q| q.relation.clone().into()), + .map(|q| q.into()), Arc::new(field), )) }) From 67d828f45a94ef5e5ce1c5f9ec7d05fba65c1a09 Mon Sep 17 00:00:00 2001 From: Aldrin M Date: Mon, 20 Oct 2025 15:47:00 -0700 Subject: [PATCH 2/3] test: added test for protobuf roundtrip A new test case, `roundtrip_mixed_case_table_reference`, exercises a scenario where a logical plan containing a table reference with uppercase characters is roundtripped through protobuf and the deserialization side erroneously normalizes the table reference to lowercase. Issue: #18122 --- .../tests/cases/roundtrip_logical_plan.rs | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 993cc6f87ca3..d9a537a7a001 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -2656,3 +2656,41 @@ async fn roundtrip_custom_listing_tables_schema() -> Result<()> { assert_eq!(plan, new_plan); Ok(()) } + +#[tokio::test] +async fn roundtrip_mixed_case_table_reference() -> Result<()> { + // Prepare "client" database + let client_ctx = SessionContext::new_with_config(SessionConfig::new() + .set_bool("datafusion.sql_parser.enable_ident_normalization", false) + ); + client_ctx.register_csv( + "\"TestData\"", + "tests/testdata/test.csv", + CsvReadOptions::default() + ) + .await?; + + // Prepare "server" database + let server_ctx = SessionContext::new_with_config(SessionConfig::new() + .set_bool("datafusion.sql_parser.enable_ident_normalization", false) + ); + server_ctx.register_csv( + "\"TestData\"", + "tests/testdata/test.csv", + CsvReadOptions::default() + ) + .await?; + + // Create a logical plan, serialize it (client), then deserialize it (server) + let dataframe = client_ctx.sql("SELECT a FROM TestData WHERE TestData.a = 1").await?; + let client_logical_plan = dataframe.into_optimized_plan()?; + let plan_bytes = logical_plan_to_bytes(&client_logical_plan)?; + let server_logical_plan = logical_plan_from_bytes(&plan_bytes, &server_ctx)?; + + assert_eq!( + format!("{}", client_logical_plan.display_indent_schema()), + format!("{}", server_logical_plan.display_indent_schema()) + ); + + Ok(()) +} From bad6090ee43f7b63a4b9b54fdea1216e04c0039f Mon Sep 17 00:00:00 2001 From: Aldrin M Date: Mon, 20 Oct 2025 15:51:09 -0700 Subject: [PATCH 3/3] minor: added clarifying comment For parse_identifiers_normalized, the `ignore_case` parameter controls whether the parsing should be case-sensitive (ignore_case: true) or insensitive (ignore_case: false). The name of the parameter is counter-intuitive to the behavior, so this adds a clarifying comment for the method. --- datafusion/common/src/utils/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index 409f248621f7..9d15727ea787 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -290,6 +290,9 @@ pub(crate) fn parse_identifiers(s: &str) -> Result> { Ok(idents) } +/// Parse a string into a vector of identifiers. +/// +/// Note: If ignore_case is false, the string will be normalized to lowercase. pub(crate) fn parse_identifiers_normalized(s: &str, ignore_case: bool) -> Vec { parse_identifiers(s) .unwrap_or_default()