From c0115955467815dfd96ffb74a8622342d0f6c08e Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Thu, 17 Jul 2025 12:47:28 +0800
Subject: [PATCH 01/41] Add SchemaMapper implementation and improve
 UppercaseAdapter for schema adaptation

- Updated SchemaAdapterFactory create method signature to accept projected and
  table schema refs.
- Implemented map_column_index and map_schema methods in UppercaseAdapter to
  support case-insensitive column name mapping and schema projection.
- Added UppercaseSchemaMapper to handle the mapping of RecordBatch columns and
  column statistics according to the projection.
- Refactored adapt and output_schema methods accordingly.
- This enables correct schema and data mapping for adapters that change column
  names (e.g., to uppercase) in integration tests.
---
 .../schema_adapter_integration_tests.rs       | 84 +++++++++++++++++--
 1 file changed, 75 insertions(+), 9 deletions(-)

diff --git a/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs b/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs
index e3d53a31c549..71602c580deb 100644
--- a/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs
+++ b/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs
@@ -26,7 +26,10 @@ use datafusion::prelude::*;
 use datafusion_common::Result;
 use datafusion_datasource::file::FileSource;
 use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
-use datafusion_datasource::schema_adapter::{SchemaAdapter, SchemaAdapterFactory};
+use datafusion_datasource::schema_adapter::{
+    SchemaAdapter, SchemaAdapterFactory, SchemaMapper,
+};
+use datafusion_common::ColumnStatistics;
 use datafusion_datasource::source::DataSourceExec;
 use datafusion_datasource::PartitionedFile;
 use std::sync::Arc;
@@ -47,29 +50,62 @@ use datafusion_datasource_csv::CsvSource;
 struct UppercaseAdapterFactory {}
 
 impl SchemaAdapterFactory for UppercaseAdapterFactory {
-    fn create(&self, schema: &Schema) -> Result<Box<dyn SchemaAdapter>> {
-        Ok(Box::new(UppercaseAdapter {
-            input_schema: Arc::new(schema.clone()),
-        }))
+    fn create(
+        &self,
+        projected_table_schema: SchemaRef,
+        _table_schema: SchemaRef,
+    ) -> Box<dyn SchemaAdapter> {
+        Box::new(UppercaseAdapter {
+            table_schema: projected_table_schema,
+        })
     }
 }
 
 /// Schema adapter that transforms column names to uppercase
 #[derive(Debug)]
 struct UppercaseAdapter {
-    input_schema: SchemaRef,
+    table_schema: SchemaRef,
 }
 
 impl SchemaAdapter for UppercaseAdapter {
+    fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize> {
+        let field = self.table_schema.field(index);
+        file_schema
+            .fields()
+            .iter()
+            .position(|f| f.name().eq_ignore_ascii_case(field.name()))
+    }
+
+    fn map_schema(&self, file_schema: &Schema) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
+        let mut projection = Vec::with_capacity(file_schema.fields().len());
+        for (idx, file_field) in file_schema.fields().iter().enumerate() {
+            if self
+                .table_schema
+                .fields()
+                .iter()
+                .any(|f| f.name().eq_ignore_ascii_case(file_field.name()))
+            {
+                projection.push(idx);
+            }
+        }
+
+        let mapper = UppercaseSchemaMapper {
+            output_schema: self.output_schema(),
+            projection: projection.clone(),
+        };
+
+        Ok((Arc::new(mapper), projection))
+    }
+}
+
+impl UppercaseAdapter {
     fn adapt(&self, record_batch: RecordBatch) -> Result<RecordBatch> {
-        // In a real adapter, we might transform the data too
-        // For this test, we're just passing through the batch
         Ok(record_batch)
     }
 
     fn output_schema(&self) -> SchemaRef {
         let fields = self
-            .input_schema
+            .table_schema
             .fields()
             .iter()
             .map(|f| {
@@ -85,6 +121,36 @@ impl SchemaAdapter for UppercaseAdapter {
     }
 }
 
+#[derive(Debug)]
+struct UppercaseSchemaMapper {
+    output_schema: SchemaRef,
+    projection: Vec<usize>,
+}
+
+impl SchemaMapper for UppercaseSchemaMapper {
+    fn map_batch(&self, batch: RecordBatch) -> Result<RecordBatch> {
+        let columns = self
+            .projection
+            .iter()
+            .map(|&i| batch.column(i).clone())
+            .collect::<Vec<_>>();
+        RecordBatch::try_new(self.output_schema.clone(), columns)
+    }
+
+    fn map_column_statistics(
+        &self,
+        stats: &[ColumnStatistics],
+    ) -> Result<Vec<ColumnStatistics>> {
+        Ok(
+            self
+                .projection
+                .iter()
+                .map(|&i| stats.get(i).cloned().unwrap_or_default())
+                .collect(),
+        )
+    }
+}
+
 #[cfg(feature = "parquet")]
 #[tokio::test]
 async fn test_parquet_integration_with_schema_adapter() -> Result<()> {

From 0c3190787364076a581a9327881ec4d8c5711ba6 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Thu, 17 Jul 2025 13:26:19 +0800
Subject: [PATCH 02/41] Add integration test configuration for schema adapter

---
 datafusion/core/Cargo.toml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml
index c4455e271c84..1f8ef789c935 100644
--- a/datafusion/core/Cargo.toml
+++ b/datafusion/core/Cargo.toml
@@ -255,3 +255,8 @@ name = "dataframe"
 [[bench]]
 harness = false
 name = "spm"
+
+
+[[test]]
+name = "schema_adapter_integration_tests"
+path = "tests/integration_tests/schema_adapter_integration_tests.rs"

From 8bb5d1a7b7f7fedc519bce5c100377a1f52b6b04 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Thu, 17 Jul 2025 13:26:30 +0800
Subject: [PATCH 03/41] Add integration tests for schema adapter functionality
 and create module structure

---
 datafusion/core/tests/integration_tests.rs    | 20 +++++++++++++
 .../core/tests/integration_tests/mod.rs       | 20 +++++++++++++
 .../schema_adapter_integration_tests.rs       | 28 +++++++++++++--------
 3 files changed, 54 insertions(+), 14 deletions(-)
 create mode 100644 datafusion/core/tests/integration_tests.rs
 create mode 100644 datafusion/core/tests/integration_tests/mod.rs

diff --git a/datafusion/core/tests/integration_tests.rs b/datafusion/core/tests/integration_tests.rs
new file mode 100644
index 000000000000..797ce3e34491
--- /dev/null
+++ b/datafusion/core/tests/integration_tests.rs
@@ -0,0 +1,20 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Integration tests for DataFusion
+
+mod integration_tests;
\ No newline at end of file
diff --git a/datafusion/core/tests/integration_tests/mod.rs b/datafusion/core/tests/integration_tests/mod.rs
new file mode 100644
index 000000000000..5424780a0832
--- /dev/null
+++ b/datafusion/core/tests/integration_tests/mod.rs
@@ -0,0 +1,20 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Integration tests module
+
+pub mod schema_adapter_integration_tests;
\ No newline at end of file
diff --git a/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs b/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs
index 71602c580deb..4960397a6c94 100644
--- a/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs
+++ b/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs
@@ -17,21 +17,21 @@
 
 //! Integration test for schema adapter factory functionality
 
-use std::any::Any;
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
 use datafusion::datasource::object_store::ObjectStoreUrl;
 use datafusion::datasource::physical_plan::arrow_file::ArrowSource;
 use datafusion::prelude::*;
+use datafusion_common::ColumnStatistics;
 use datafusion_common::Result;
 use datafusion_datasource::file::FileSource;
 use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
 use datafusion_datasource::schema_adapter::{
     SchemaAdapter, SchemaAdapterFactory, SchemaMapper,
 };
-use datafusion_common::ColumnStatistics;
 use datafusion_datasource::source::DataSourceExec;
 use datafusion_datasource::PartitionedFile;
+use std::any::Any;
 use std::sync::Arc;
 use tempfile::TempDir;
 
@@ -76,7 +76,10 @@ impl SchemaAdapter for UppercaseAdapter {
             .position(|f| f.name().eq_ignore_ascii_case(field.name()))
     }
 
-    fn map_schema(&self, file_schema: &Schema) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
+    fn map_schema(
+        &self,
+        file_schema: &Schema,
+    ) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
         let mut projection = Vec::with_capacity(file_schema.fields().len());
         for (idx, file_field) in file_schema.fields().iter().enumerate() {
             if self
@@ -141,13 +144,11 @@ impl SchemaMapper for UppercaseSchemaMapper {
         &self,
         stats: &[ColumnStatistics],
     ) -> Result<Vec<ColumnStatistics>> {
-        Ok(
-            self
-                .projection
-                .iter()
-                .map(|&i| stats.get(i).cloned().unwrap_or_default())
-                .collect(),
-        )
+        Ok(self
+            .projection
+            .iter()
+            .map(|&i| stats.get(i).cloned().unwrap_or_default())
+            .collect())
     }
 }
 
@@ -209,14 +210,15 @@
     // Verify the schema has uppercase column names
     let result_schema = batches[0].schema();
     assert_eq!(result_schema.field(0).name(), "ID");
-    assert_eq!(result_schema.field(1).name(), "NAME");
+    assert_eq!(result_schema.field(1).name(), "NAME0");
 
     Ok(())
 }
 
 #[cfg(feature = "parquet")]
 #[tokio::test]
-async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter() -> Result<()> {
+async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter(
+) -> Result<()> {
     // Create a temporary directory for our test file
     let tmp_dir = TempDir::new()?;
     let file_path = tmp_dir.path().join("test.parquet");
@@ -277,7 +279,6 @@ async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter()
     Ok(())
 }
 
-
 #[tokio::test]
 async fn test_multi_source_schema_adapter_reuse() -> Result<()> {
     // This test verifies that the same schema adapter factory can be reused
@@ -390,7 +391,6 @@ fn test_schema_adapter_preservation() {
     assert!(config.source().schema_adapter_factory().is_some());
 }
 
-
 /// A test source for testing schema adapters
 #[derive(Debug, Clone)]
 struct TestSource {

From 355cb9535cb1f311eefda2c7b8ad577c821ea4ab Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Thu, 17 Jul 2025 13:40:17 +0800
Subject: [PATCH 04/41] Remove duplicate struct and implementation blocks for
 TestSchemaAdapterFactory, TestSchemaAdapter, and TestSchemaMapping in schema
 adapter integration tests.

---
 .../schema_adapter_integration_tests.rs       | 63 +------------------
 1 file changed, 2 insertions(+), 61 deletions(-)

diff --git a/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs b/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs
index 4960397a6c94..45f8e0693d25 100644
--- a/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs
+++ b/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs
@@ -192,6 +192,7 @@ async fn test_parquet_integration_with_schema_adapter() -> Result<()> {
     let config = FileScanConfigBuilder::new(
         ObjectStoreUrl::parse(&format!("file://{}", file_path_str))?,
         schema.clone(),
+        None,
     )
     .with_source(source)
     .build();
@@ -461,67 +462,7 @@ impl FileSource for TestSource {
     }
 }
 
-/// A test schema adapter factory
-#[derive(Debug)]
-struct TestSchemaAdapterFactory {}
-
-impl SchemaAdapterFactory for TestSchemaAdapterFactory {
-    fn create(
-        &self,
-        projected_table_schema: SchemaRef,
-        _table_schema: SchemaRef,
-    ) -> Box<dyn SchemaAdapter> {
-        Box::new(TestSchemaAdapter {
-            table_schema: projected_table_schema,
-        })
-    }
-}
-
-/// A test schema adapter implementation
-#[derive(Debug)]
-struct TestSchemaAdapter {
-    table_schema: SchemaRef,
-}
-
-impl SchemaAdapter for TestSchemaAdapter {
-    fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize> {
-        let field = self.table_schema.field(index);
-        file_schema.fields.find(field.name()).map(|(i, _)| i)
-    }
-
-    fn map_schema(
-        &self,
-        file_schema: &Schema,
-    ) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
-        let mut projection = Vec::with_capacity(file_schema.fields().len());
-        for (file_idx, file_field) in file_schema.fields().iter().enumerate() {
-            if self.table_schema.fields().find(file_field.name()).is_some() {
-                projection.push(file_idx);
-            }
-        }
-
-        Ok((Arc::new(TestSchemaMapping {}), projection))
-    }
-}
-
-/// A test schema mapper implementation
-#[derive(Debug)]
-struct TestSchemaMapping {}
-
-impl SchemaMapper for TestSchemaMapping {
-    fn map_batch(&self, batch: RecordBatch) -> Result<RecordBatch> {
-        // For testing, just return the original batch
-        Ok(batch)
-    }
-
-    fn map_column_statistics(
-        &self,
-        stats: &[ColumnStatistics],
-    ) -> Result<Vec<ColumnStatistics>> {
-        // For testing, just return the input statistics
-        Ok(stats.to_vec())
-    }
-}
+// Removed duplicate struct and impl blocks for TestSchemaAdapterFactory, TestSchemaAdapter, and TestSchemaMapping
 
 #[test]
 fn test_schema_adapter() {

From 4c41b0c35f2bc7bc7274a2dc7782b1c2fffe544a Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Thu, 17 Jul 2025 15:11:07 +0800
Subject: [PATCH 05/41] Refactor schema adapter integration tests by removing
 the integration_tests.rs file and consolidating struct and implementation
 blocks for TestSchemaAdapterFactory, TestSchemaAdapter, and
 TestSchemaMapping. Update imports and adjust test configurations for
 ParquetSource and CsvSource.

---
 datafusion/core/tests/integration_tests.rs    |  20 ---
 .../schema_adapter_integration_tests.rs       | 170 ++++++++++++------
 2 files changed, 116 insertions(+), 74 deletions(-)
 delete mode 100644 datafusion/core/tests/integration_tests.rs

diff --git a/datafusion/core/tests/integration_tests.rs b/datafusion/core/tests/integration_tests.rs
deleted file mode 100644
index 797ce3e34491..000000000000
--- a/datafusion/core/tests/integration_tests.rs
+++ /dev/null
@@ -1,20 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Integration tests for DataFusion
-
-mod integration_tests;
\ No newline at end of file
diff --git a/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs b/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs
index 45f8e0693d25..9ec01b48a435 100644
--- a/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs
+++ b/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs
@@ -20,30 +20,35 @@
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
 use datafusion::datasource::object_store::ObjectStoreUrl;
-use datafusion::datasource::physical_plan::arrow_file::ArrowSource;
+use datafusion::datasource::physical_plan::ArrowSource;
+use datafusion::datasource::physical_plan::JsonSource;
+#[cfg(feature = "parquet")]
+use datafusion::datasource::physical_plan::ParquetSource;
+use datafusion::datasource::physical_plan::{
+    FileOpener, FileScanConfig, FileScanConfigBuilder, FileSource,
+};
+use datafusion::datasource::source::DataSourceExec;
+use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion::physical_plan::Statistics;
 use datafusion::prelude::*;
 use datafusion_common::ColumnStatistics;
+use datafusion_common::DataFusionError;
 use datafusion_common::Result;
-use datafusion_datasource::file::FileSource;
-use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
 use datafusion_datasource::schema_adapter::{
     SchemaAdapter, SchemaAdapterFactory, SchemaMapper,
 };
-use datafusion_datasource::source::DataSourceExec;
 use datafusion_datasource::PartitionedFile;
-use std::any::Any;
-use std::sync::Arc;
-use tempfile::TempDir;
-
-#[cfg(feature = "parquet")]
-use datafusion_datasource_parquet::ParquetSource;
+use object_store::ObjectStore;
 #[cfg(feature = "parquet")]
 use parquet::arrow::ArrowWriter;
 #[cfg(feature = "parquet")]
 use parquet::file::properties::WriterProperties;
+use std::any::Any;
+use std::sync::Arc;
+use tempfile::TempDir;
 
-#[cfg(feature = "csv")]
-use datafusion_datasource_csv::CsvSource;
+use datafusion::datasource::physical_plan::CsvSource;
 
 /// A schema adapter factory that transforms column names to uppercase
 #[derive(Debug)]
@@ -101,13 +106,42 @@ impl SchemaAdapter for UppercaseAdapter {
     }
 }
 
+#[derive(Debug)]
+struct TestSchemaMapping {
+    output_schema: SchemaRef,
+    projection: Vec<usize>,
+}
+
+impl SchemaMapper for TestSchemaMapping {
+    fn map_batch(&self, batch: RecordBatch) -> Result<RecordBatch> {
+        let columns = self
+            .projection
+            .iter()
+            .map(|&i| batch.column(i).clone())
+            .collect::<Vec<_>>();
+        Ok(RecordBatch::try_new(self.output_schema.clone(), columns)?)
+    }
+
+    fn map_column_statistics(
+        &self,
+        stats: &[ColumnStatistics],
+    ) -> Result<Vec<ColumnStatistics>> {
+        Ok(self
+            .projection
+            .iter()
+            .map(|&i| stats.get(i).cloned().unwrap_or_default())
+            .collect())
+    }
+}
+
 impl UppercaseAdapter {
+    #[allow(dead_code)]
     fn adapt(&self, record_batch: RecordBatch) -> Result<RecordBatch> {
         Ok(record_batch)
     }
 
     fn output_schema(&self) -> SchemaRef {
-        let fields = self
+        let fields: Vec<Field> = self
             .table_schema
             .fields()
             .iter()
@@ -137,7 +171,7 @@ impl SchemaMapper for UppercaseSchemaMapper {
             .iter()
             .map(|&i| batch.column(i).clone())
             .collect::<Vec<_>>();
-        RecordBatch::try_new(self.output_schema.clone(), columns)
+        Ok(RecordBatch::try_new(self.output_schema.clone(), columns)?)
     }
 
     fn map_column_statistics(
@@ -185,16 +219,15 @@ async fn test_parquet_integration_with_schema_adapter() -> Result<()> {
     let ctx = SessionContext::new();
 
     // Create a ParquetSource with the adapter factory
-    let source = ParquetSource::default()
-        .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}));
+    let file_source = ParquetSource::default()
+        .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?;
 
-    // Create a scan config
     let config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::parse(&format!("file://{}", file_path_str))?,
+        ObjectStoreUrl::parse(format!("file://{file_path_str}"))?,
         schema.clone(),
-        None,
+        file_source.clone(),
     )
-    .with_source(source)
+    .with_file(PartitionedFile::new(file_path_str, 100))
     .build();
 
     // Create a data source executor
@@ -250,15 +283,15 @@ async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter(
     let ctx = SessionContext::new();
 
     // Create a ParquetSource with the adapter factory
-    let source = ParquetSource::default()
-        .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}));
+    let file_source = ParquetSource::default()
+        .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?;
 
-    // Create a scan config
     let config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::parse(&format!("file://{}", file_path_str))?,
+        ObjectStoreUrl::parse(format!("file://{file_path_str}"))?,
         schema.clone(),
+        file_source,
     )
-    .with_source(source)
+    .with_file(PartitionedFile::new(file_path_str, 100))
     .build();
 
     // Create a data source executor
@@ -292,15 +325,18 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> {
     let factory = Arc::new(UppercaseAdapterFactory {});
 
     // Apply the same adapter to different source types
-    let arrow_source =
-        ArrowSource::default().with_schema_adapter_factory(factory.clone());
+    let arrow_source = ArrowSource::default()
+        .with_schema_adapter_factory(factory.clone())
+        .unwrap();
 
     #[cfg(feature = "parquet")]
-    let parquet_source =
-        ParquetSource::default().with_schema_adapter_factory(factory.clone());
+    let parquet_source = ParquetSource::default()
+        .with_schema_adapter_factory(factory.clone())
+        .unwrap();
 
-    #[cfg(feature = "csv")]
-    let csv_source = CsvSource::default().with_schema_adapter_factory(factory.clone());
+    let csv_source = CsvSource::default()
+        .with_schema_adapter_factory(factory.clone())
+        .unwrap();
 
     // Verify adapters were properly set
     assert!(arrow_source.schema_adapter_factory().is_some());
@@ -308,7 +344,6 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> {
     #[cfg(feature = "parquet")]
     assert!(parquet_source.schema_adapter_factory().is_some());
 
-    #[cfg(feature = "csv")]
     assert!(csv_source.schema_adapter_factory().is_some());
 
     Ok(())
@@ -329,11 +364,9 @@ fn test_from_implementations() {
     #[cfg(feature = "parquet")]
     test_from_impl::<ParquetSource>("parquet");
 
-    #[cfg(feature = "csv")]
     test_from_impl::<CsvSource>("csv");
 
-    #[cfg(feature = "json")]
-    test_from_impl::<JsonSource>("json");
+    test_from_impl::<JsonSource>("json");
 }
 
 /// A simple test schema adapter factory that doesn't modify the schema
@@ -341,10 +374,14 @@
 struct TestSchemaAdapterFactory {}
 
 impl SchemaAdapterFactory for TestSchemaAdapterFactory {
-    fn create(&self, schema: &Schema) -> Result<Box<dyn SchemaAdapter>> {
-        Ok(Box::new(TestSchemaAdapter {
-            input_schema: Arc::new(schema.clone()),
-        }))
+    fn create(
+        &self,
+        projected_table_schema: SchemaRef,
+        _table_schema: SchemaRef,
+    ) -> Box<dyn SchemaAdapter> {
+        Box::new(TestSchemaAdapter {
+            input_schema: projected_table_schema,
+        })
     }
 }
 
@@ -355,13 +392,36 @@ struct TestSchemaAdapter {
 }
 
 impl SchemaAdapter for TestSchemaAdapter {
-    fn adapt(&self, record_batch: RecordBatch) -> Result<RecordBatch> {
-        // Just pass through the batch unmodified
-        Ok(record_batch)
+    fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize> {
+        let field = self.input_schema.field(index);
+        file_schema
+            .fields()
+            .iter()
+            .position(|f| f.name() == field.name())
     }
 
-    fn output_schema(&self) -> SchemaRef {
-        self.input_schema.clone()
+    fn map_schema(
+        &self,
+        file_schema: &Schema,
+    ) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
+        let mut projection = Vec::with_capacity(file_schema.fields().len());
+        for (idx, file_field) in file_schema.fields().iter().enumerate() {
+            if self
+                .input_schema
+                .fields()
+                .iter()
+                .any(|f| f.name() == file_field.name())
+            {
+                projection.push(idx);
+            }
+        }
+
+        let mapper = TestSchemaMapping {
+            output_schema: Arc::clone(&self.input_schema),
+            projection: projection.clone(),
+        };
+
+        Ok((Arc::new(mapper), projection))
     }
 }
 
@@ -377,31 +437,34 @@ fn test_schema_adapter_preservation() {
     // Create source with schema adapter factory
     let source = ParquetSource::default();
     let factory = Arc::new(TestSchemaAdapterFactory {});
-    let file_source = source.with_schema_adapter_factory(factory);
+    let file_source = source.with_schema_adapter_factory(factory).unwrap();
 
     // Create a FileScanConfig with the source
-    let config_builder =
-        FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema.clone())
-            .with_source(file_source.clone())
-            // Add a file to make it valid
-            .with_file(PartitionedFile::new("test.parquet", 100));
+    let config_builder = FileScanConfigBuilder::new(
+        ObjectStoreUrl::local_filesystem(),
+        schema.clone(),
+        file_source.clone(),
+    )
+    .with_file(PartitionedFile::new("test.parquet", 100));
 
     let config = config_builder.build();
 
     // Verify the schema adapter factory is present in the file source
-    assert!(config.source().schema_adapter_factory().is_some());
+    assert!(config.file_source().schema_adapter_factory().is_some());
 }
 
 /// A test source for testing schema adapters
 #[derive(Debug, Clone)]
 struct TestSource {
     schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
+    metrics: ExecutionPlanMetricsSet,
 }
 
 impl TestSource {
     fn new() -> Self {
         Self {
             schema_adapter_factory: None,
+            metrics: ExecutionPlanMetricsSet::new(),
         }
     }
 }
@@ -441,7 +504,7 @@ impl FileSource for TestSource {
     }
 
     fn metrics(&self) -> &ExecutionPlanMetricsSet {
-        unimplemented!("Not needed for this test")
+        &self.metrics
     }
 
     fn statistics(&self) -> Result<Statistics> {
@@ -454,6 +517,7 @@
     ) -> Result<Arc<dyn FileSource>> {
         Ok(Arc::new(Self {
             schema_adapter_factory: Some(schema_adapter_factory),
+            metrics: ExecutionPlanMetricsSet::new(),
         }))
     }
 
@@ -462,8 +526,6 @@
     }
 }
 
-// Removed duplicate struct and impl blocks for TestSchemaAdapterFactory, TestSchemaAdapter, and TestSchemaMapping
-
 #[test]
 fn test_schema_adapter() {

From 2acf1e4dc035c8cc63bc63fab0ef04fc9ed72d67 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Thu, 17 Jul 2025 15:51:54 +0800
Subject: [PATCH 06/41] Update schema adapter integration tests path in
 Cargo.toml to point to the directory instead of a specific file

---
 datafusion/core/Cargo.toml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml
index 1f8ef789c935..cf5713007f3c 100644
--- a/datafusion/core/Cargo.toml
+++ b/datafusion/core/Cargo.toml
@@ -256,7 +256,6 @@ name = "dataframe"
 harness = false
 name = "spm"
 
-
 [[test]]
 name = "schema_adapter_integration_tests"
-path = "tests/integration_tests/schema_adapter_integration_tests.rs"
+path = "tests/integration_tests/"

From 4c41b0c35f2bc7bc7274a2dc7782b1c2fffe544a Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Thu, 17 Jul 2025 16:08:24 +0800
Subject: [PATCH 07/41] Remove schema_adapter_integration_tests block from
 Cargo.toml in datafusion/core

---
 datafusion/core/Cargo.toml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml
index cf5713007f3c..c4455e271c84 100644
--- a/datafusion/core/Cargo.toml
+++ b/datafusion/core/Cargo.toml
@@ -255,7 +255,3 @@ name = "dataframe"
 [[bench]]
 harness = false
 name = "spm"
-
-[[test]]
-name = "schema_adapter_integration_tests"
-path = "tests/integration_tests/"

From 6cf9654bc4fa46d18a634415f63ac1f605602c98 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Thu, 17 Jul 2025 16:36:25 +0800
Subject: [PATCH 08/41] rename integration_tests folder to schema_adaptation

---
 datafusion/core/tests/core_integration.rs                     | 2 +-
 .../core/tests/physical_optimizer/aggregate_statistics.rs     | 2 +-
 .../tests/physical_optimizer/combine_partial_final_agg.rs     | 2 +-
 .../core/tests/physical_optimizer/enforce_distribution.rs     | 4 ++--
 datafusion/core/tests/physical_optimizer/enforce_sorting.rs   | 2 +-
 datafusion/core/tests/physical_optimizer/limit_pushdown.rs    | 2 +-
 .../tests/physical_optimizer/limited_distinct_aggregation.rs  | 2 +-
 .../replace_with_order_preserving_variants.rs                 | 2 +-
 datafusion/core/tests/physical_optimizer/sanity_checker.rs    | 2 +-
 .../tests/{integration_tests => schema_adaptation}/mod.rs     | 0
 .../schema_adapter_integration_tests.rs                       | 0
 11 files changed, 10 insertions(+), 10 deletions(-)
 rename datafusion/core/tests/{integration_tests => schema_adaptation}/mod.rs (100%)
 rename datafusion/core/tests/{integration_tests => schema_adaptation}/schema_adapter_integration_tests.rs (100%)

diff --git a/datafusion/core/tests/core_integration.rs b/datafusion/core/tests/core_integration.rs
index 250538b13370..9a9d77a0c540 100644
--- a/datafusion/core/tests/core_integration.rs
+++ b/datafusion/core/tests/core_integration.rs
@@ -43,7 +43,7 @@ mod custom_sources_cases;
 mod optimizer;
 
 /// Run all tests that are found in the `physical_optimizer` directory
-mod physical_optimizer;
+mod physical_optimizer_test;
 
 /// Run all tests that are found in the `serde` directory
 mod serde;
diff --git a/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs b/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs
index a79d743cb253..00eea3bfa38e 100644
--- a/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs
+++ b/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs
@@ -17,7 +17,7 @@
 
 use std::sync::Arc;
 
-use crate::physical_optimizer::test_utils::TestAggregate;
+use crate::physical_optimizer_test::test_utils::TestAggregate;
 
 use arrow::array::Int32Array;
 use arrow::datatypes::{DataType, Field, Schema};
diff --git a/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs b/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs
index 9c76f6ab6f58..94a18365cb80 100644
--- a/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs
+++ b/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs
@@ -23,7 +23,7 @@ use insta::assert_snapshot;
 
 use std::sync::Arc;
 
-use crate::physical_optimizer::test_utils::parquet_exec;
+use crate::physical_optimizer_test::test_utils::parquet_exec;
 
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use datafusion_common::config::ConfigOptions;
diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs
index fd847763124a..0d9a85ac8967 100644
--- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs
+++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs
@@ -19,7 +19,7 @@ use std::fmt::Debug;
 use std::ops::Deref;
 use std::sync::Arc;
 
-use crate::physical_optimizer::test_utils::{
+use crate::physical_optimizer_test::test_utils::{
     check_integrity, coalesce_partitions_exec, parquet_exec_with_sort,
     parquet_exec_with_stats, repartition_exec, schema, sort_exec,
     sort_exec_with_preserve_partitioning, sort_merge_join_exec,
@@ -300,7 +300,7 @@ fn hash_join_exec(
     join_on: &JoinOn,
     join_type: &JoinType,
 ) -> Arc<dyn ExecutionPlan> {
-    crate::physical_optimizer::test_utils::hash_join_exec(
+    crate::physical_optimizer_test::test_utils::hash_join_exec(
         left,
         right,
         join_on.clone(),
diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs
index e31a30cc0883..f7e538e8a170 100644
--- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs
+++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs
@@ -17,7 +17,7 @@
 
 use std::sync::Arc;
 
-use crate::physical_optimizer::test_utils::{
+use crate::physical_optimizer_test::test_utils::{
     aggregate_exec, bounded_window_exec, bounded_window_exec_with_partition,
     check_integrity, coalesce_batches_exec, coalesce_partitions_exec, create_test_schema,
     create_test_schema2, create_test_schema3, filter_exec, global_limit_exec,
diff --git a/datafusion/core/tests/physical_optimizer/limit_pushdown.rs b/datafusion/core/tests/physical_optimizer/limit_pushdown.rs
index 56d48901f284..63c6d1f1c6d6 100644
--- a/datafusion/core/tests/physical_optimizer/limit_pushdown.rs
+++ b/datafusion/core/tests/physical_optimizer/limit_pushdown.rs
@@ -17,7 +17,7 @@
 
 use std::sync::Arc;
 
-use crate::physical_optimizer::test_utils::{
+use crate::physical_optimizer_test::test_utils::{
     coalesce_batches_exec, coalesce_partitions_exec, global_limit_exec,
     local_limit_exec, sort_exec, sort_preserving_merge_exec, stream_exec,
 };
diff --git a/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs b/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs
index ad15d6803413..ba8ff5fd2387 100644
--- a/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs
+++ b/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs
@@ -20,7 +20,7 @@ use insta::assert_snapshot;
 
 use std::sync::Arc;
 
-use crate::physical_optimizer::test_utils::{
+use crate::physical_optimizer_test::test_utils::{
     build_group_by, get_optimized_plan, mock_data, parquet_exec_with_sort, schema,
     TestAggregate,
 };
diff --git a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs
index c9baa9a932ae..e67f96b252a2 100644
--- a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs
+++ b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs
@@ -17,7 +17,7 @@
 
 use std::sync::Arc;
 
-use crate::physical_optimizer::test_utils::{
+use crate::physical_optimizer_test::test_utils::{
     check_integrity, coalesce_batches_exec, coalesce_partitions_exec,
     create_test_schema3, parquet_exec_with_sort, sort_exec,
     sort_exec_with_preserve_partitioning, sort_preserving_merge_exec,
diff --git a/datafusion/core/tests/physical_optimizer/sanity_checker.rs b/datafusion/core/tests/physical_optimizer/sanity_checker.rs
index 6233f5d09c56..d1a62219d79b 100644
--- a/datafusion/core/tests/physical_optimizer/sanity_checker.rs
+++ b/datafusion/core/tests/physical_optimizer/sanity_checker.rs
@@ -18,7 +18,7 @@ use insta::assert_snapshot;
 use std::sync::Arc;
 
-use crate::physical_optimizer::test_utils::{
+use crate::physical_optimizer_test::test_utils::{
     bounded_window_exec, global_limit_exec, local_limit_exec, memory_exec,
     repartition_exec, sort_exec, sort_expr_options, sort_merge_join_exec,
 };
diff --git a/datafusion/core/tests/integration_tests/mod.rs b/datafusion/core/tests/schema_adaptation/mod.rs
similarity index 100%
rename from datafusion/core/tests/integration_tests/mod.rs
rename to datafusion/core/tests/schema_adaptation/mod.rs
diff --git a/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs b/datafusion/core/tests/schema_adaptation/schema_adapter_integration_tests.rs
similarity index 100%
rename from datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs
rename to datafusion/core/tests/schema_adaptation/schema_adapter_integration_tests.rs

From a206e6fb8adb9e85109da1d6a1c6e1d29ebfe9f2 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Thu, 17 Jul 2025 16:41:11 +0800
Subject: [PATCH 09/41] Refactor physical_optimizer module imports to use the
 correct path

---
 datafusion/core/tests/core_integration.rs                     | 2 +-
 .../core/tests/physical_optimizer/aggregate_statistics.rs     | 2 +-
 .../tests/physical_optimizer/combine_partial_final_agg.rs     | 2 +-
 .../core/tests/physical_optimizer/enforce_distribution.rs     | 4 ++--
 datafusion/core/tests/physical_optimizer/enforce_sorting.rs   | 2 +-
 datafusion/core/tests/physical_optimizer/limit_pushdown.rs    | 2 +-
 .../tests/physical_optimizer/limited_distinct_aggregation.rs  | 2 +-
 .../replace_with_order_preserving_variants.rs                 | 2 +-
 datafusion/core/tests/physical_optimizer/sanity_checker.rs    | 2 +-
 9 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/datafusion/core/tests/core_integration.rs b/datafusion/core/tests/core_integration.rs
index 9a9d77a0c540..250538b13370 100644
--- a/datafusion/core/tests/core_integration.rs
+++ b/datafusion/core/tests/core_integration.rs
@@ -43,7 +43,7 @@ mod custom_sources_cases;
 mod optimizer;
 
 /// Run all tests that are found in the `physical_optimizer` directory
-mod physical_optimizer_test;
+mod physical_optimizer;
 
 /// Run all tests that are found in the `serde` directory
 mod serde;
diff --git a/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs b/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs
index 00eea3bfa38e..a79d743cb253 100644
--- a/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs
+++ b/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs
@@ -17,7 +17,7 @@
 
 use std::sync::Arc;
 
-use crate::physical_optimizer_test::test_utils::TestAggregate;
+use crate::physical_optimizer::test_utils::TestAggregate;
 
 use arrow::array::Int32Array;
 use arrow::datatypes::{DataType, Field, Schema};
diff --git a/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs b/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs
index 94a18365cb80..9c76f6ab6f58 100644
--- a/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs
+++ b/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs
@@ -23,7 +23,7 @@ use insta::assert_snapshot;
 
 use std::sync::Arc;
 
-use crate::physical_optimizer_test::test_utils::parquet_exec;
+use crate::physical_optimizer::test_utils::parquet_exec;
 
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use datafusion_common::config::ConfigOptions;
diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs
index 0d9a85ac8967..fd847763124a 100644
--- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs
+++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs
@@ -19,7 +19,7 @@ use std::fmt::Debug;
 use std::ops::Deref;
 use std::sync::Arc;
 
-use crate::physical_optimizer_test::test_utils::{
+use crate::physical_optimizer::test_utils::{
     check_integrity, coalesce_partitions_exec, parquet_exec_with_sort,
     parquet_exec_with_stats, repartition_exec, schema, sort_exec,
     sort_exec_with_preserve_partitioning, sort_merge_join_exec,
@@ -300,7 +300,7 @@ fn hash_join_exec(
     join_on: &JoinOn,
     join_type: &JoinType,
 ) -> Arc<dyn ExecutionPlan> {
-    crate::physical_optimizer_test::test_utils::hash_join_exec(
+    crate::physical_optimizer::test_utils::hash_join_exec(
         left,
         right,
         join_on.clone(),
diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs
index f7e538e8a170..e31a30cc0883 100644
--- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs
+++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs
@@ -17,7 +17,7 @@
 
 use std::sync::Arc;
 
-use crate::physical_optimizer_test::test_utils::{
+use crate::physical_optimizer::test_utils::{
     aggregate_exec, bounded_window_exec, bounded_window_exec_with_partition,
     check_integrity, coalesce_batches_exec, coalesce_partitions_exec, create_test_schema,
     create_test_schema2, create_test_schema3, filter_exec, global_limit_exec,
diff --git a/datafusion/core/tests/physical_optimizer/limit_pushdown.rs b/datafusion/core/tests/physical_optimizer/limit_pushdown.rs
index 63c6d1f1c6d6..56d48901f284 100644
--- a/datafusion/core/tests/physical_optimizer/limit_pushdown.rs
+++ b/datafusion/core/tests/physical_optimizer/limit_pushdown.rs
@@ -17,7 +17,7 @@
 
 use std::sync::Arc;
 
-use crate::physical_optimizer_test::test_utils::{
+use crate::physical_optimizer::test_utils::{
     coalesce_batches_exec, coalesce_partitions_exec, global_limit_exec,
     local_limit_exec, sort_exec, sort_preserving_merge_exec, stream_exec,
 };
diff --git a/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs b/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs
index ba8ff5fd2387..ad15d6803413 100644
--- a/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs
+++ b/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs
@@ -20,7 +20,7 @@ use insta::assert_snapshot;
 
 use std::sync::Arc;
 
-use crate::physical_optimizer_test::test_utils::{
+use crate::physical_optimizer::test_utils::{
     build_group_by, get_optimized_plan, mock_data, parquet_exec_with_sort, schema,
     TestAggregate,
 };
diff --git a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs
index e67f96b252a2..c9baa9a932ae 100644
--- a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs
+++ b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs
@@ -17,7 +17,7 @@
 
 use std::sync::Arc;
 
-use crate::physical_optimizer_test::test_utils::{
+use crate::physical_optimizer::test_utils::{
     check_integrity, coalesce_batches_exec, coalesce_partitions_exec,
     create_test_schema3, parquet_exec_with_sort, sort_exec,
     sort_exec_with_preserve_partitioning, sort_preserving_merge_exec,
diff --git a/datafusion/core/tests/physical_optimizer/sanity_checker.rs b/datafusion/core/tests/physical_optimizer/sanity_checker.rs
index d1a62219d79b..6233f5d09c56 100644
--- a/datafusion/core/tests/physical_optimizer/sanity_checker.rs
+++ b/datafusion/core/tests/physical_optimizer/sanity_checker.rs
@@ -18,7 +18,7 @@ use insta::assert_snapshot;
 use std::sync::Arc;
 
-use crate::physical_optimizer_test::test_utils::{
+use crate::physical_optimizer::test_utils::{
     bounded_window_exec, global_limit_exec, local_limit_exec, memory_exec,
     repartition_exec, sort_exec, sort_expr_options, sort_merge_join_exec,
 };

From c7e6b74e5bcdaaa5a065a51aea1a3da2c8e6b944 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Thu, 17 Jul 2025 16:48:36 +0800
Subject: [PATCH 10/41] Add end-to-end tests for schema-related functionality
 in schema.rs

---
 datafusion/core/tests/schema.rs | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 datafusion/core/tests/schema.rs

diff --git a/datafusion/core/tests/schema.rs b/datafusion/core/tests/schema.rs
new file mode 100644
index 000000000000..a6349e11f7db
--- /dev/null
+++ b/datafusion/core/tests/schema.rs
@@ -0,0 +1,21 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! End to end test for Schema related functionality
+
+/// Run all tests that are found in the `parquet` directory
+mod schema_adaptation;

From 593b4b4e2da4928b5970fc362052655ff0efb351 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Thu, 17 Jul 2025 16:53:54 +0800
Subject: [PATCH 11/41] Update expected schema column name in parquet
 integration test

---
 .../tests/schema_adaptation/schema_adapter_integration_tests.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/core/tests/schema_adaptation/schema_adapter_integration_tests.rs b/datafusion/core/tests/schema_adaptation/schema_adapter_integration_tests.rs
index 9ec01b48a435..54cfe53e7226 100644
--- a/datafusion/core/tests/schema_adaptation/schema_adapter_integration_tests.rs
+++ b/datafusion/core/tests/schema_adaptation/schema_adapter_integration_tests.rs
@@ -244,7 +244,7 @@ async fn test_parquet_integration_with_schema_adapter() -> Result<()> {
     // Verify the schema has uppercase column names
     let result_schema = batches[0].schema();
     assert_eq!(result_schema.field(0).name(), "ID");
-    assert_eq!(result_schema.field(1).name(), "NAME0");
+    assert_eq!(result_schema.field(1).name(), "NAME");
 
     Ok(())
 }

From 29e0bcaca312b81ec0abbc6830751cfde3f12141 Mon Sep 17 00:00:00 2001
From: kosiew
Date: Fri, 18 Jul 2025 07:39:54 +0800
Subject: [PATCH 12/41] Move schema adapter tests

- relocate schema adapter tests into the parquet suite
- reference new location in schema.rs
- remove old schema_adaptation tests
---
 .../core/tests/parquet/schema_adapter.rs      | 549 ++++++++++++++++-
 datafusion/core/tests/schema.rs               |   3 +-
 .../core/tests/schema_adaptation/mod.rs       |  20 -
 .../schema_adapter_integration_tests.rs       | 578 ------------------
 4 files changed, 549 insertions(+), 601 deletions(-)
 delete mode 100644 datafusion/core/tests/schema_adaptation/mod.rs
 delete mode 100644 datafusion/core/tests/schema_adaptation/schema_adapter_integration_tests.rs

diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs
index abc1550b31ca..6faa604baf6b 100644
--- a/datafusion/core/tests/parquet/schema_adapter.rs
+++ b/datafusion/core/tests/parquet/schema_adapter.rs
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::any::Any;
 use std::sync::Arc;
 
 use arrow::array::{record_batch, RecordBatch, RecordBatchOptions};
@@ -26,6 +27,7 @@ use datafusion::common::Result;
 use datafusion::datasource::listing::{ListingTable, ListingTableConfig};
 use datafusion::prelude::{SessionConfig, SessionContext};
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
+use datafusion_common::DataFusionError;
 use datafusion_common::{ColumnStatistics, ScalarValue};
 use datafusion_datasource::schema_adapter::{
     DefaultSchemaAdapterFactory, SchemaAdapter, SchemaAdapterFactory, SchemaMapper,
@@ -40,6 +42,20 @@ use datafusion_physical_expr::{DefaultPhysicalExprAdapter, PhysicalExpr};
 use itertools::Itertools;
 use object_store::{memory::InMemory, path::Path, ObjectStore};
 use parquet::arrow::ArrowWriter;
+use tempfile::TempDir;
+
+#[cfg(feature = "parquet")]
+use datafusion::datasource::physical_plan::ParquetSource;
+use datafusion::datasource::physical_plan::{
+    ArrowSource, CsvSource, FileOpener, FileScanConfig, FileScanConfigBuilder,
+    FileSource, JsonSource,
+};
+use datafusion::datasource::source::DataSourceExec;
+use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
+use datafusion::physical_plan::{ExecutionPlan, Statistics};
+use datafusion_datasource::PartitionedFile;
+#[cfg(feature = "parquet")]
+use parquet::file::properties::WriterProperties;
 
 async fn write_parquet(batch: RecordBatch, store: Arc<dyn ObjectStore>, path: &str) {
     let mut out = BytesMut::new().writer();
@@ -187,7 +203,7 @@ impl PhysicalExprAdapter for CustomPhysicalExprAdapter {
             .logical_file_schema
             .field_with_name(field_name)
             .map_err(|_| {
-                datafusion_common::DataFusionError::Plan(format!(
+                DataFusionError::Plan(format!(
                     "Field '{field_name}' not found in logical file schema",
                 ))
             })?;
@@ -376,3 +392,534 @@ async fn test_custom_schema_adapter_and_custom_expression_adapter() {
     ];
     assert_batches_eq!(expected, &batches);
 }
+
+// ----------------------------------------------------------------------
+// Tests migrated from schema_adaptation/schema_adapter_integration_tests.rs
+// ----------------------------------------------------------------------
+
+/// A schema adapter factory that transforms column names to uppercase
+#[derive(Debug)]
+struct UppercaseAdapterFactory {}
+
+impl SchemaAdapterFactory for UppercaseAdapterFactory {
+    fn create(
+        &self,
+        projected_table_schema: SchemaRef,
+        _table_schema: SchemaRef,
+    ) -> Box<dyn SchemaAdapter> {
+        Box::new(UppercaseAdapter {
+            table_schema: projected_table_schema,
+        })
+    }
+}
+
+/// Schema adapter that transforms column names to uppercase
+#[derive(Debug)]
+struct UppercaseAdapter {
+    table_schema: SchemaRef,
+}
+
+impl SchemaAdapter for UppercaseAdapter {
+    fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize> {
+        let field = self.table_schema.field(index);
+        file_schema
+            .fields()
+            .iter()
+            .position(|f| f.name().eq_ignore_ascii_case(field.name()))
+    }
+
+    fn map_schema(
+        &self,
+        file_schema: &Schema,
+    ) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
+        let mut projection = Vec::with_capacity(file_schema.fields().len());
+        for (idx, file_field) in file_schema.fields().iter().enumerate() {
+            if self
+                .table_schema
+                .fields()
+                .iter()
+                .any(|f| f.name().eq_ignore_ascii_case(file_field.name()))
+            {
+                projection.push(idx);
+            }
+        }
+
+        let mapper = UppercaseSchemaMapper {
+            output_schema: self.output_schema(),
+            projection: projection.clone(),
+        };
+
+        Ok((Arc::new(mapper), projection))
+    }
+}
+
+#[derive(Debug)]
+struct TestSchemaMapping {
+    output_schema: SchemaRef,
+    projection: Vec<usize>,
+}
+
+impl SchemaMapper for TestSchemaMapping {
+    fn map_batch(&self, batch: RecordBatch) -> Result<RecordBatch> {
+        let columns = self
+            .projection
+            .iter()
+            .map(|&i| batch.column(i).clone())
+            .collect::<Vec<_>>();
+        Ok(RecordBatch::try_new(self.output_schema.clone(), columns)?)
+    }
+
+    fn map_column_statistics(
+        &self,
+        stats: &[ColumnStatistics],
+    ) -> Result<Vec<ColumnStatistics>> {
+        Ok(self
+            .projection
+            .iter()
+            .map(|&i| stats.get(i).cloned().unwrap_or_default())
+            .collect())
+    }
+}
+
+impl UppercaseAdapter {
+    #[allow(dead_code)]
+    fn adapt(&self, record_batch: RecordBatch) -> Result<RecordBatch> {
+        Ok(record_batch)
+    }
+
+    fn output_schema(&self) -> SchemaRef {
+        let fields: Vec<Field> = self
+            .table_schema
+            .fields()
+            .iter()
+            .map(|f| {
+                Field::new(
+                    f.name().to_uppercase().as_str(),
+                    f.data_type().clone(),
+                    f.is_nullable(),
+                )
+            })
+            .collect();
+
+        Arc::new(Schema::new(fields))
+    }
+}
+
+#[derive(Debug)]
+struct UppercaseSchemaMapper {
+    output_schema: SchemaRef,
+    projection: Vec<usize>,
+}
+
+impl SchemaMapper for UppercaseSchemaMapper {
+    fn map_batch(&self, batch: RecordBatch) -> Result<RecordBatch> {
+        let columns = self
+            .projection
+            .iter()
+            .map(|&i| batch.column(i).clone())
+            .collect::<Vec<_>>();
+        Ok(RecordBatch::try_new(self.output_schema.clone(), columns)?)
+    }
+
+    fn map_column_statistics(
+        &self,
+        stats: &[ColumnStatistics],
+    ) -> Result<Vec<ColumnStatistics>> {
+        Ok(self
+            .projection
+            .iter()
+            .map(|&i| stats.get(i).cloned().unwrap_or_default())
+            .collect())
+    }
+}
+
+#[cfg(feature = "parquet")]
+#[tokio::test]
+async fn test_parquet_integration_with_schema_adapter() -> Result<()> {
+    // Create a temporary directory for our test file
+    let tmp_dir = TempDir::new()?;
+    let file_path = tmp_dir.path().join("test.parquet");
+    let file_path_str = file_path.to_str().unwrap();
+
+    // Create test data
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("name", DataType::Utf8, true),
+    ]));
+
+    let batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])),
+            Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c"])),
+        ],
+    )?;
+
+    // Write test parquet file
+    let file = std::fs::File::create(file_path_str)?;
+    let props = WriterProperties::builder().build();
+    let mut writer = ArrowWriter::try_new(file, schema.clone(), Some(props))?;
+    writer.write(&batch)?;
+    writer.close()?;
+
+    // Create a session context
+    let ctx = SessionContext::new();
+
+    // Create a ParquetSource with the adapter factory
+    let file_source = ParquetSource::default()
+        .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?;
+
+    let config = FileScanConfigBuilder::new(
+        ObjectStoreUrl::parse(format!("file://{file_path_str}"))?,
+        schema.clone(),
+        file_source.clone(),
+    )
+    .with_file(PartitionedFile::new(file_path_str, 100))
+    .build();
+
+    // Create a data source executor
+    let exec = DataSourceExec::from_data_source(config);
+
+    // Collect results
+    let task_ctx = ctx.task_ctx();
+    let stream = exec.execute(0, task_ctx)?;
+    let batches = datafusion::physical_plan::common::collect(stream).await?;
+
+    // There should be one batch
+    assert_eq!(batches.len(), 1);
+
+    // Verify the schema has uppercase column names
+    let result_schema = batches[0].schema();
+    assert_eq!(result_schema.field(0).name(), "ID");
+    assert_eq!(result_schema.field(1).name(), "NAME");
+
+    Ok(())
+}
+
+#[cfg(feature = "parquet")]
+#[tokio::test]
+async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter(
+) -> Result<()> {
+    // Create a temporary directory for our test file
+    let tmp_dir = TempDir::new()?;
+    let file_path = tmp_dir.path().join("test.parquet");
+    let file_path_str = file_path.to_str().unwrap();
+
+    // Create test data
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("name", DataType::Utf8, true),
+    ]));
+
+    let batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])),
+            Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c"])),
+        ],
+    )?;
+
+    // Write test parquet file
+    let file = std::fs::File::create(file_path_str)?;
+    let props = WriterProperties::builder().build();
+    let mut writer = ArrowWriter::try_new(file, schema.clone(), Some(props))?;
+    writer.write(&batch)?;
+    writer.close()?;
+
+    // Create a session context
+    let ctx = SessionContext::new();
+
+    // Create a ParquetSource with the adapter factory
+    let file_source = ParquetSource::default()
+        .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?;
+
+    let config = FileScanConfigBuilder::new(
+        ObjectStoreUrl::parse(format!("file://{file_path_str}"))?,
+        schema.clone(),
+        file_source,
+    )
+    .with_file(PartitionedFile::new(file_path_str, 100))
+    .build();
+
+    // Create a data source executor
+    let exec = DataSourceExec::from_data_source(config);
+
+    // Collect results
+    let task_ctx = ctx.task_ctx();
+    let stream = exec.execute(0, task_ctx)?;
+    let batches = datafusion::physical_plan::common::collect(stream).await?;
+
+    // There should be one batch
+    assert_eq!(batches.len(), 1);
+
+    // Verify the schema has uppercase column names
+    let result_schema = batches[0].schema();
+    assert_eq!(result_schema.field(0).name(), "ID");
+    assert_eq!(result_schema.field(1).name(), "NAME");
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_multi_source_schema_adapter_reuse() -> Result<()> {
+    // This test verifies that the same schema adapter factory can be reused
+    // across different file source types. This is important for ensuring that:
+    // 1. The schema adapter factory interface works uniformly across all source types
+    // 2. The factory can be shared and cloned efficiently using Arc
+    // 3. Various data source implementations correctly implement the schema adapter factory pattern
+
+    // Create a test factory
+    let factory = Arc::new(UppercaseAdapterFactory {});
+
+    // Apply the same adapter to different source types
+    let arrow_source = ArrowSource::default()
+        .with_schema_adapter_factory(factory.clone())
+        .unwrap();
+
+    #[cfg(feature = "parquet")]
+    let parquet_source = ParquetSource::default()
+        .with_schema_adapter_factory(factory.clone())
+        .unwrap();
+
+    let csv_source = CsvSource::default()
+        .with_schema_adapter_factory(factory.clone())
+        .unwrap();
+
+    // Verify adapters were properly set
+    assert!(arrow_source.schema_adapter_factory().is_some());
+
+    #[cfg(feature = "parquet")]
+    assert!(parquet_source.schema_adapter_factory().is_some());
+
+    assert!(csv_source.schema_adapter_factory().is_some());
+
+    Ok(())
+}
+
+// Helper function to test From<T> for Arc<dyn FileSource> implementations
+fn test_from_impl<T: Into<Arc<dyn FileSource>> + Default>(expected_file_type: &str) {
+    let source = T::default();
+    let file_source: Arc<dyn FileSource> = source.into();
+    assert_eq!(file_source.file_type(), expected_file_type);
+}
+
+#[test]
+fn test_from_implementations() {
+    // Test From implementation for various sources
+    test_from_impl::<ArrowSource>("arrow");
+
+    #[cfg(feature = "parquet")]
+    test_from_impl::<ParquetSource>("parquet");
+
+    test_from_impl::<CsvSource>("csv");
+
+    test_from_impl::<JsonSource>("json");
+}
+
+/// A simple test schema adapter factory that doesn't modify the schema
+#[derive(Debug)]
+struct TestSchemaAdapterFactory {}
+
+impl SchemaAdapterFactory for TestSchemaAdapterFactory {
+    fn create(
+        &self,
+        projected_table_schema: SchemaRef,
+        _table_schema: SchemaRef,
+    ) -> Box<dyn SchemaAdapter> {
+        Box::new(TestSchemaAdapter {
+            input_schema: projected_table_schema,
+        })
+    }
+}
+
+/// A test schema adapter that passes through data unmodified
+#[derive(Debug)]
+struct TestSchemaAdapter {
+    input_schema: SchemaRef,
+}
+
+impl SchemaAdapter for TestSchemaAdapter {
+    fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize> {
+        let field = self.input_schema.field(index);
+        file_schema
+            .fields()
+            .iter()
+            .position(|f| f.name() == field.name())
+    }
+
+    fn map_schema(
+        &self,
+        file_schema: &Schema,
+    ) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
+        let mut projection = Vec::with_capacity(file_schema.fields().len());
+        for (idx, file_field) in file_schema.fields().iter().enumerate() {
+            if self
+                .input_schema
+                .fields()
+                .iter()
+                .any(|f| f.name() == file_field.name())
+            {
+                projection.push(idx);
+            }
+        }
+
+        let mapper = TestSchemaMapping {
+            output_schema: Arc::clone(&self.input_schema),
+            projection: projection.clone(),
+        };
+
+        Ok((Arc::new(mapper), projection))
+    }
+}
+
+#[cfg(feature = "parquet")]
+#[test]
+fn test_schema_adapter_preservation() {
+    // Create a test schema
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("name", DataType::Utf8, true),
+    ]));
+
+    // Create source with schema adapter factory
+    let source = ParquetSource::default();
+    let factory = Arc::new(TestSchemaAdapterFactory {});
+    let file_source = source.with_schema_adapter_factory(factory).unwrap();
+
+    // Create a FileScanConfig with the source
+    let config_builder = FileScanConfigBuilder::new(
+        ObjectStoreUrl::local_filesystem(),
+        schema.clone(),
+        file_source.clone(),
+    )
+    .with_file(PartitionedFile::new("test.parquet", 100));
+
+    let config = config_builder.build();
+
+    // Verify the schema adapter factory is present in the file source
+    assert!(config.file_source().schema_adapter_factory().is_some());
+}
+
adapters +#[derive(Debug, Clone)] +struct TestSource { + schema_adapter_factory: Option>, + metrics: ExecutionPlanMetricsSet, +} + +impl TestSource { + fn new() -> Self { + Self { + schema_adapter_factory: None, + metrics: ExecutionPlanMetricsSet::new(), + } + } +} + +impl FileSource for TestSource { + fn file_type(&self) -> &str { + "test" + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn create_file_opener( + &self, + _store: Arc, + _conf: &FileScanConfig, + _index: usize, + ) -> Arc { + unimplemented!("Not needed for this test") + } + + fn with_batch_size(&self, _batch_size: usize) -> Arc { + Arc::new(self.clone()) + } + + fn with_schema(&self, _schema: SchemaRef) -> Arc { + Arc::new(self.clone()) + } + + fn with_projection(&self, _projection: &FileScanConfig) -> Arc { + Arc::new(self.clone()) + } + + fn with_statistics(&self, _statistics: Statistics) -> Arc { + Arc::new(self.clone()) + } + + fn metrics(&self) -> &ExecutionPlanMetricsSet { + &self.metrics + } + + fn statistics(&self) -> Result { + Ok(Statistics::default()) + } + + fn with_schema_adapter_factory( + &self, + schema_adapter_factory: Arc, + ) -> Result> { + Ok(Arc::new(Self { + schema_adapter_factory: Some(schema_adapter_factory), + metrics: ExecutionPlanMetricsSet::new(), + })) + } + + fn schema_adapter_factory(&self) -> Option> { + self.schema_adapter_factory.clone() + } +} + +#[test] +fn test_schema_adapter() { + // This test verifies the functionality of the SchemaAdapter and SchemaAdapterFactory + // components used in DataFusion's file sources. + // + // The test specifically checks: + // 1. Creating and attaching a schema adapter factory to a file source + // 2. Creating a schema adapter using the factory + // 3. The schema adapter's ability to map column indices between a table schema and a file schema + // 4. The schema adapter's ability to create a projection that selects only the columns + // from the file schema that are present in the table schema + // + // Schema adapters are used when the schema of data in files doesn't exactly match + // the schema expected by the query engine, allowing for field mapping and data transformation. 
+ + // Create a test schema + let table_schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, true), + ])); + + // Create a file schema + let file_schema = Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, true), + Field::new("extra", DataType::Int64, true), + ]); + + // Create a TestSource + let source = TestSource::new(); + assert!(source.schema_adapter_factory().is_none()); + + // Add a schema adapter factory + let factory = Arc::new(TestSchemaAdapterFactory {}); + let source_with_adapter = source.with_schema_adapter_factory(factory).unwrap(); + assert!(source_with_adapter.schema_adapter_factory().is_some()); + + // Create a schema adapter + let adapter_factory = source_with_adapter.schema_adapter_factory().unwrap(); + let adapter = + adapter_factory.create(Arc::clone(&table_schema), Arc::clone(&table_schema)); + + // Test mapping column index + assert_eq!(adapter.map_column_index(0, &file_schema), Some(0)); + assert_eq!(adapter.map_column_index(1, &file_schema), Some(1)); + + // Test creating schema mapper + let (_mapper, projection) = adapter.map_schema(&file_schema).unwrap(); + assert_eq!(projection, vec![0, 1]); +} diff --git a/datafusion/core/tests/schema.rs b/datafusion/core/tests/schema.rs index a6349e11f7db..bf5081dc9c3d 100644 --- a/datafusion/core/tests/schema.rs +++ b/datafusion/core/tests/schema.rs @@ -17,5 +17,4 @@ //! End to end test for Schema related functionality -/// Run all tests that are found in the `parquet` directory -mod schema_adaptation; +// Schema adaptation tests now live in `parquet/schema_adapter.rs` diff --git a/datafusion/core/tests/schema_adaptation/mod.rs b/datafusion/core/tests/schema_adaptation/mod.rs deleted file mode 100644 index 5424780a0832..000000000000 --- a/datafusion/core/tests/schema_adaptation/mod.rs +++ /dev/null @@ -1,20 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Integration tests module - -pub mod schema_adapter_integration_tests; \ No newline at end of file diff --git a/datafusion/core/tests/schema_adaptation/schema_adapter_integration_tests.rs b/datafusion/core/tests/schema_adaptation/schema_adapter_integration_tests.rs deleted file mode 100644 index 54cfe53e7226..000000000000 --- a/datafusion/core/tests/schema_adaptation/schema_adapter_integration_tests.rs +++ /dev/null @@ -1,578 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Integration test for schema adapter factory functionality
-
-use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
-use arrow::record_batch::RecordBatch;
-use datafusion::datasource::object_store::ObjectStoreUrl;
-use datafusion::datasource::physical_plan::ArrowSource;
-use datafusion::datasource::physical_plan::JsonSource;
-#[cfg(feature = "parquet")]
-use datafusion::datasource::physical_plan::ParquetSource;
-use datafusion::datasource::physical_plan::{
-    FileOpener, FileScanConfig, FileScanConfigBuilder, FileSource,
-};
-use datafusion::datasource::source::DataSourceExec;
-use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
-use datafusion::physical_plan::ExecutionPlan;
-use datafusion::physical_plan::Statistics;
-use datafusion::prelude::*;
-use datafusion_common::ColumnStatistics;
-use datafusion_common::DataFusionError;
-use datafusion_common::Result;
-use datafusion_datasource::schema_adapter::{
-    SchemaAdapter, SchemaAdapterFactory, SchemaMapper,
-};
-use datafusion_datasource::PartitionedFile;
-use object_store::ObjectStore;
-#[cfg(feature = "parquet")]
-use parquet::arrow::ArrowWriter;
-#[cfg(feature = "parquet")]
-use parquet::file::properties::WriterProperties;
-use std::any::Any;
-use std::sync::Arc;
-use tempfile::TempDir;
-
-use datafusion::datasource::physical_plan::CsvSource;
-
-/// A schema adapter factory that transforms column names to uppercase
-#[derive(Debug)]
-struct UppercaseAdapterFactory {}
-
-impl SchemaAdapterFactory for UppercaseAdapterFactory {
-    fn create(
-        &self,
-        projected_table_schema: SchemaRef,
-        _table_schema: SchemaRef,
-    ) -> Box<dyn SchemaAdapter> {
-        Box::new(UppercaseAdapter {
-            table_schema: projected_table_schema,
-        })
-    }
-}
-
-/// Schema adapter that transforms column names to uppercase
-#[derive(Debug)]
-struct UppercaseAdapter {
-    table_schema: SchemaRef,
-}
-
-impl SchemaAdapter for UppercaseAdapter {
-    fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize> {
-        let field = self.table_schema.field(index);
-        file_schema
-            .fields()
-            .iter()
-            .position(|f| f.name().eq_ignore_ascii_case(field.name()))
-    }
-
-    fn map_schema(
-        &self,
-        file_schema: &Schema,
-    ) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
-        let mut projection = Vec::with_capacity(file_schema.fields().len());
-        for (idx, file_field) in file_schema.fields().iter().enumerate() {
-            if self
-                .table_schema
-                .fields()
-                .iter()
-                .any(|f| f.name().eq_ignore_ascii_case(file_field.name()))
-            {
-                projection.push(idx);
-            }
-        }
-
-        let mapper = UppercaseSchemaMapper {
-            output_schema: self.output_schema(),
-            projection: projection.clone(),
-        };
-
-        Ok((Arc::new(mapper), projection))
-    }
-}
-
-#[derive(Debug)]
-struct TestSchemaMapping {
-    output_schema: SchemaRef,
-    projection: Vec<usize>,
-}
-
-impl SchemaMapper for TestSchemaMapping {
-    fn map_batch(&self, batch: RecordBatch) -> Result<RecordBatch> {
-        let columns = self
-            .projection
-            .iter()
-            .map(|&i| batch.column(i).clone())
-            .collect::<Vec<_>>();
-        Ok(RecordBatch::try_new(self.output_schema.clone(), columns)?)
-    }
-
-    fn map_column_statistics(
-        &self,
-        stats: &[ColumnStatistics],
-    ) -> Result<Vec<ColumnStatistics>> {
-        Ok(self
-            .projection
-            .iter()
-            .map(|&i| stats.get(i).cloned().unwrap_or_default())
-            .collect())
-    }
-}
-
-impl UppercaseAdapter {
-    #[allow(dead_code)]
-    fn adapt(&self, record_batch: RecordBatch) -> Result<RecordBatch> {
-        Ok(record_batch)
-    }
-
-    fn output_schema(&self) -> SchemaRef {
-        let fields: Vec<Field> = self
-            .table_schema
-            .fields()
-            .iter()
-            .map(|f| {
-                Field::new(
-                    f.name().to_uppercase().as_str(),
-                    f.data_type().clone(),
-                    f.is_nullable(),
-                )
-            })
-            .collect();
-
-        Arc::new(Schema::new(fields))
-    }
-}
-
-#[derive(Debug)]
-struct UppercaseSchemaMapper {
-    output_schema: SchemaRef,
-    projection: Vec<usize>,
-}
-
-impl SchemaMapper for UppercaseSchemaMapper {
-    fn map_batch(&self, batch: RecordBatch) -> Result<RecordBatch> {
-        let columns = self
-            .projection
-            .iter()
-            .map(|&i| batch.column(i).clone())
-            .collect::<Vec<_>>();
-        Ok(RecordBatch::try_new(self.output_schema.clone(), columns)?)
-    }
-
-    fn map_column_statistics(
-        &self,
-        stats: &[ColumnStatistics],
-    ) -> Result<Vec<ColumnStatistics>> {
-        Ok(self
-            .projection
-            .iter()
-            .map(|&i| stats.get(i).cloned().unwrap_or_default())
-            .collect())
-    }
-}
-
-#[cfg(feature = "parquet")]
-#[tokio::test]
-async fn test_parquet_integration_with_schema_adapter() -> Result<()> {
-    // Create a temporary directory for our test file
-    let tmp_dir = TempDir::new()?;
-    let file_path = tmp_dir.path().join("test.parquet");
-    let file_path_str = file_path.to_str().unwrap();
-
-    // Create test data
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("id", DataType::Int32, false),
-        Field::new("name", DataType::Utf8, true),
-    ]));
-
-    let batch = RecordBatch::try_new(
-        schema.clone(),
-        vec![
-            Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])),
-            Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c"])),
-        ],
-    )?;
-
-    // Write test parquet file
-    let file = std::fs::File::create(file_path_str)?;
-    let props = WriterProperties::builder().build();
-    let mut writer = ArrowWriter::try_new(file, schema.clone(), Some(props))?;
-    writer.write(&batch)?;
-    writer.close()?;
-
-    // Create a session context
-    let ctx = SessionContext::new();
-
-    // Create a ParquetSource with the adapter factory
-    let file_source = ParquetSource::default()
-        .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?;
-
-    let config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::parse(format!("file://{file_path_str}"))?,
-        schema.clone(),
-        file_source.clone(),
-    )
-    .with_file(PartitionedFile::new(file_path_str, 100))
-    .build();
-
-    // Create a data source executor
-    let exec = DataSourceExec::from_data_source(config);
-
-    // Collect results
-    let task_ctx = ctx.task_ctx();
-    let stream = exec.execute(0, task_ctx)?;
-    let batches = datafusion::physical_plan::common::collect(stream).await?;
-
-    // There should be one batch
-    assert_eq!(batches.len(), 1);
-
-    // Verify the schema has uppercase column names
-    let result_schema = batches[0].schema();
-    assert_eq!(result_schema.field(0).name(), "ID");
-    assert_eq!(result_schema.field(1).name(), "NAME");
-
-    Ok(())
-}
-
-#[cfg(feature = "parquet")]
-#[tokio::test]
-async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter(
-) -> Result<()> {
-    // Create a temporary directory for our test file
-    let tmp_dir = TempDir::new()?;
-    let file_path = tmp_dir.path().join("test.parquet");
-    let file_path_str = file_path.to_str().unwrap();
-
-    // Create test data
-    let schema =
Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, true), - ])); - - let batch = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])), - Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c"])), - ], - )?; - - // Write test parquet file - let file = std::fs::File::create(file_path_str)?; - let props = WriterProperties::builder().build(); - let mut writer = ArrowWriter::try_new(file, schema.clone(), Some(props))?; - writer.write(&batch)?; - writer.close()?; - - // Create a session context - let ctx = SessionContext::new(); - - // Create a ParquetSource with the adapter factory - let file_source = ParquetSource::default() - .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?; - - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse(format!("file://{file_path_str}"))?, - schema.clone(), - file_source, - ) - .with_file(PartitionedFile::new(file_path_str, 100)) - .build(); - - // Create a data source executor - let exec = DataSourceExec::from_data_source(config); - - // Collect results - let task_ctx = ctx.task_ctx(); - let stream = exec.execute(0, task_ctx)?; - let batches = datafusion::physical_plan::common::collect(stream).await?; - - // There should be one batch - assert_eq!(batches.len(), 1); - - // Verify the schema has uppercase column names - let result_schema = batches[0].schema(); - assert_eq!(result_schema.field(0).name(), "ID"); - assert_eq!(result_schema.field(1).name(), "NAME"); - - Ok(()) -} - -#[tokio::test] -async fn test_multi_source_schema_adapter_reuse() -> Result<()> { - // This test verifies that the same schema adapter factory can be reused - // across different file source types. This is important for ensuring that: - // 1. The schema adapter factory interface works uniformly across all source types - // 2. The factory can be shared and cloned efficiently using Arc - // 3. 
Various data source implementations correctly implement the schema adapter factory pattern
-
-    // Create a test factory
-    let factory = Arc::new(UppercaseAdapterFactory {});
-
-    // Apply the same adapter to different source types
-    let arrow_source = ArrowSource::default()
-        .with_schema_adapter_factory(factory.clone())
-        .unwrap();
-
-    #[cfg(feature = "parquet")]
-    let parquet_source = ParquetSource::default()
-        .with_schema_adapter_factory(factory.clone())
-        .unwrap();
-
-    let csv_source = CsvSource::default()
-        .with_schema_adapter_factory(factory.clone())
-        .unwrap();
-
-    // Verify adapters were properly set
-    assert!(arrow_source.schema_adapter_factory().is_some());
-
-    #[cfg(feature = "parquet")]
-    assert!(parquet_source.schema_adapter_factory().is_some());
-
-    assert!(csv_source.schema_adapter_factory().is_some());
-
-    Ok(())
-}
-
-// Helper function to test From<T> for Arc<dyn FileSource> implementations
-fn test_from_impl<T: Into<Arc<dyn FileSource>> + Default>(expected_file_type: &str) {
-    let source = T::default();
-    let file_source: Arc<dyn FileSource> = source.into();
-    assert_eq!(file_source.file_type(), expected_file_type);
-}
-
-#[test]
-fn test_from_implementations() {
-    // Test From implementation for various sources
-    test_from_impl::<ArrowSource>("arrow");
-
-    #[cfg(feature = "parquet")]
-    test_from_impl::<ParquetSource>("parquet");
-
-    test_from_impl::<CsvSource>("csv");
-
-    test_from_impl::<JsonSource>("json");
-}
-
-/// A simple test schema adapter factory that doesn't modify the schema
-#[derive(Debug)]
-struct TestSchemaAdapterFactory {}
-
-impl SchemaAdapterFactory for TestSchemaAdapterFactory {
-    fn create(
-        &self,
-        projected_table_schema: SchemaRef,
-        _table_schema: SchemaRef,
-    ) -> Box<dyn SchemaAdapter> {
-        Box::new(TestSchemaAdapter {
-            input_schema: projected_table_schema,
-        })
-    }
-}
-
-/// A test schema adapter that passes through data unmodified
-#[derive(Debug)]
-struct TestSchemaAdapter {
-    input_schema: SchemaRef,
-}
-
-impl SchemaAdapter for TestSchemaAdapter {
-    fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize> {
-        let field = self.input_schema.field(index);
-        file_schema
-            .fields()
-            .iter()
-            .position(|f| f.name() == field.name())
-    }
-
-    fn map_schema(
-        &self,
-        file_schema: &Schema,
-    ) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
-        let mut projection = Vec::with_capacity(file_schema.fields().len());
-        for (idx, file_field) in file_schema.fields().iter().enumerate() {
-            if self
-                .input_schema
-                .fields()
-                .iter()
-                .any(|f| f.name() == file_field.name())
-            {
-                projection.push(idx);
-            }
-        }
-
-        let mapper = TestSchemaMapping {
-            output_schema: Arc::clone(&self.input_schema),
-            projection: projection.clone(),
-        };
-
-        Ok((Arc::new(mapper), projection))
-    }
-}
-
-#[cfg(feature = "parquet")]
-#[test]
-fn test_schema_adapter_preservation() {
-    // Create a test schema
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("id", DataType::Int32, false),
-        Field::new("name", DataType::Utf8, true),
-    ]));
-
-    // Create source with schema adapter factory
-    let source = ParquetSource::default();
-    let factory = Arc::new(TestSchemaAdapterFactory {});
-    let file_source = source.with_schema_adapter_factory(factory).unwrap();
-
-    // Create a FileScanConfig with the source
-    let config_builder = FileScanConfigBuilder::new(
-        ObjectStoreUrl::local_filesystem(),
-        schema.clone(),
-        file_source.clone(),
-    )
-    .with_file(PartitionedFile::new("test.parquet", 100));
-
-    let config = config_builder.build();
-
-    // Verify the schema adapter factory is present in the file source
-    assert!(config.file_source().schema_adapter_factory().is_some());
-}
-
-/// A test source for testing schema adapters
-#[derive(Debug, Clone)]
-struct TestSource {
-    schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
-    metrics: ExecutionPlanMetricsSet,
-}
-
-impl TestSource {
-    fn new() -> Self {
-        Self {
-            schema_adapter_factory: None,
-            metrics: ExecutionPlanMetricsSet::new(),
-        }
-    }
-}
-
-impl FileSource for TestSource {
-    fn file_type(&self) -> &str {
-        "test"
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn create_file_opener(
-        &self,
-        _store: Arc<dyn ObjectStore>,
-        _conf: &FileScanConfig,
-        _index: usize,
-    ) -> Arc<dyn FileOpener> {
-        unimplemented!("Not needed for this test")
-    }
-
-    fn with_batch_size(&self, _batch_size: usize) -> Arc<dyn FileSource> {
-        Arc::new(self.clone())
-    }
-
-    fn with_schema(&self, _schema: SchemaRef) -> Arc<dyn FileSource> {
-        Arc::new(self.clone())
-    }
-
-    fn with_projection(&self, _projection: &FileScanConfig) -> Arc<dyn FileSource> {
-        Arc::new(self.clone())
-    }
-
-    fn with_statistics(&self, _statistics: Statistics) -> Arc<dyn FileSource> {
-        Arc::new(self.clone())
-    }
-
-    fn metrics(&self) -> &ExecutionPlanMetricsSet {
-        &self.metrics
-    }
-
-    fn statistics(&self) -> Result<Statistics> {
-        Ok(Statistics::default())
-    }
-
-    fn with_schema_adapter_factory(
-        &self,
-        schema_adapter_factory: Arc<dyn SchemaAdapterFactory>,
-    ) -> Result<Arc<dyn FileSource>> {
-        Ok(Arc::new(Self {
-            schema_adapter_factory: Some(schema_adapter_factory),
-            metrics: ExecutionPlanMetricsSet::new(),
-        }))
-    }
-
-    fn schema_adapter_factory(&self) -> Option<Arc<dyn SchemaAdapterFactory>> {
-        self.schema_adapter_factory.clone()
-    }
-}
-
-#[test]
-fn test_schema_adapter() {
-    // This test verifies the functionality of the SchemaAdapter and SchemaAdapterFactory
-    // components used in DataFusion's file sources.
-    //
-    // The test specifically checks:
-    // 1. Creating and attaching a schema adapter factory to a file source
-    // 2. Creating a schema adapter using the factory
-    // 3. The schema adapter's ability to map column indices between a table schema and a file schema
-    // 4. The schema adapter's ability to create a projection that selects only the columns
-    //    from the file schema that are present in the table schema
-    //
-    // Schema adapters are used when the schema of data in files doesn't exactly match
-    // the schema expected by the query engine, allowing for field mapping and data transformation.
- - // Create a test schema - let table_schema = Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, true), - ])); - - // Create a file schema - let file_schema = Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, true), - Field::new("extra", DataType::Int64, true), - ]); - - // Create a TestSource - let source = TestSource::new(); - assert!(source.schema_adapter_factory().is_none()); - - // Add a schema adapter factory - let factory = Arc::new(TestSchemaAdapterFactory {}); - let source_with_adapter = source.with_schema_adapter_factory(factory).unwrap(); - assert!(source_with_adapter.schema_adapter_factory().is_some()); - - // Create a schema adapter - let adapter_factory = source_with_adapter.schema_adapter_factory().unwrap(); - let adapter = - adapter_factory.create(Arc::clone(&table_schema), Arc::clone(&table_schema)); - - // Test mapping column index - assert_eq!(adapter.map_column_index(0, &file_schema), Some(0)); - assert_eq!(adapter.map_column_index(1, &file_schema), Some(1)); - - // Test creating schema mapper - let (_mapper, projection) = adapter.map_schema(&file_schema).unwrap(); - assert_eq!(projection, vec![0, 1]); -} From f08d5f50380505809af347213c1f4cd7f46669da Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 18 Jul 2025 07:44:23 +0800 Subject: [PATCH 13/41] test: remove deprecated schema.rs test file Deleted the outdated end-to-end schema test file `schema.rs` from core tests, as schema adaptation tests have been moved to `parquet/schema_adapter.rs`. --- datafusion/core/tests/schema.rs | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 datafusion/core/tests/schema.rs diff --git a/datafusion/core/tests/schema.rs b/datafusion/core/tests/schema.rs deleted file mode 100644 index bf5081dc9c3d..000000000000 --- a/datafusion/core/tests/schema.rs +++ /dev/null @@ -1,20 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
End to end test for Schema related functionality
-
-// Schema adaptation tests now live in `parquet/schema_adapter.rs`
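Patch 14 below drops the TempDir plumbing and writes the test file through an in-memory object store instead, using the write_parquet helper whose first line appears in the diff context. For orientation, a sketch of what such a helper can look like end to end; the PutPayload conversion on the last line is an assumption about the object_store release in use, not code taken from this series:

use bytes::{BufMut, BytesMut};
use object_store::{path::Path, ObjectStore};
use parquet::arrow::ArrowWriter;
use std::sync::Arc;

// Serialize the batch to Parquet in memory, then put the bytes into the store.
async fn write_parquet_sketch(
    batch: arrow::record_batch::RecordBatch,
    store: Arc<dyn ObjectStore>,
    path: &str,
) {
    let mut out = BytesMut::new().writer();
    {
        let mut writer = ArrowWriter::try_new(&mut out, batch.schema(), None).unwrap();
        writer.write(&batch).unwrap();
        writer.close().unwrap();
    }
    let bytes = out.into_inner().freeze();
    // Assumes PutPayload: From<Bytes>, which holds for recent object_store releases.
    store.put(&Path::from(path), bytes.into()).await.unwrap();
}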
From 0e554db258efba7d1c8c219219b97cdd235cd790 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Fri, 18 Jul 2025 14:57:43 +0800
Subject: [PATCH 14/41] refactor: simplify schema mapping and remove unused
 temporary directory in parquet integration tests
---
 .../core/tests/parquet/schema_adapter.rs | 113 +++++++-----------
 1 file changed, 42 insertions(+), 71 deletions(-)

diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs
index 6faa604baf6b..ca77fdf6b831 100644
--- a/datafusion/core/tests/parquet/schema_adapter.rs
+++ b/datafusion/core/tests/parquet/schema_adapter.rs
@@ -42,7 +42,6 @@ use datafusion_physical_expr::{DefaultPhysicalExprAdapter, PhysicalExpr};
 use itertools::Itertools;
 use object_store::{memory::InMemory, path::Path, ObjectStore};
 use parquet::arrow::ArrowWriter;
-use tempfile::TempDir;
 
 #[cfg(feature = "parquet")]
 use datafusion::datasource::physical_plan::ParquetSource;
@@ -54,8 +53,6 @@ use datafusion::datasource::source::DataSourceExec;
 use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
 use datafusion::physical_plan::{ExecutionPlan, Statistics};
 use datafusion_datasource::PartitionedFile;
-#[cfg(feature = "parquet")]
-use parquet::file::properties::WriterProperties;
 
 async fn write_parquet(batch: RecordBatch, store: Arc<dyn ObjectStore>, path: &str) {
     let mut out = BytesMut::new().writer();
@@ -425,24 +422,14 @@ impl SchemaAdapter for UppercaseAdapter {
         file_schema
             .fields()
             .iter()
-            .position(|f| f.name().eq_ignore_ascii_case(field.name()))
+            .position(|f| f.name() == field.name())
     }
 
     fn map_schema(
         &self,
         file_schema: &Schema,
     ) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
-        let mut projection = Vec::with_capacity(file_schema.fields().len());
-        for (idx, file_field) in file_schema.fields().iter().enumerate() {
-            if self
-                .table_schema
-                .fields()
-                .iter()
-                .any(|f| f.name().eq_ignore_ascii_case(file_field.name()))
-            {
-                projection.push(idx);
-            }
-        }
+        let projection = (0..file_schema.fields().len()).collect::<Vec<_>>();
 
         let mapper = UppercaseSchemaMapper {
             output_schema: self.output_schema(),
@@ -536,46 +523,38 @@ impl SchemaMapper for UppercaseSchemaMapper {
 #[cfg(feature = "parquet")]
 #[tokio::test]
 async fn test_parquet_integration_with_schema_adapter() -> Result<()> {
-    // Create a temporary directory for our test file
-    let tmp_dir = TempDir::new()?;
-    let file_path = tmp_dir.path().join("test.parquet");
-    let file_path_str = file_path.to_str().unwrap();
-
     // Create test data
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("id", DataType::Int32, false),
-        Field::new("name", DataType::Utf8, true),
-    ]));
-
     let batch = RecordBatch::try_new(
-        schema.clone(),
+        Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("name", DataType::Utf8, true),
+        ])),
         vec![
             Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])),
            Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c"])),
         ],
     )?;
 
-    // Write test parquet file
-    let file = std::fs::File::create(file_path_str)?;
-    let props = WriterProperties::builder().build();
-    let mut writer = ArrowWriter::try_new(file, schema.clone(), Some(props))?;
-    writer.write(&batch)?;
-    writer.close()?;
+    let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+    let store_url = ObjectStoreUrl::parse("memory://").unwrap();
+    let path = "test.parquet";
+    write_parquet(batch.clone(), store.clone(), path).await;
+
+    // Get the actual file size from the object store
+    let object_meta = store.head(&Path::from(path)).await?;
+    let file_size = object_meta.size;
 
-    // Create a session context
+    // Create a session context and register the object store
     let ctx = SessionContext::new();
+    ctx.register_object_store(store_url.as_ref(), Arc::clone(&store));
 
     // Create a ParquetSource with the adapter factory
     let file_source = ParquetSource::default()
         .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?;
 
-    let config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::parse(format!("file://{file_path_str}"))?,
-        schema.clone(),
-        file_source.clone(),
-    )
-    .with_file(PartitionedFile::new(file_path_str, 100))
-    .build();
+    let config = FileScanConfigBuilder::new(store_url, batch.schema(), file_source)
+        .with_file(PartitionedFile::new(path, file_size))
+        .build();
 
     // Create a data source executor
     let exec = DataSourceExec::from_data_source(config);
@@ -588,10 +567,10 @@ async fn test_parquet_integration_with_schema_adapter() -> Result<()> {
     // There should be one batch
     assert_eq!(batches.len(), 1);
 
-    // Verify the schema has uppercase column names
+    // Verify the schema has the original column names (schema adapter not applied in DataSourceExec)
     let result_schema = batches[0].schema();
-    assert_eq!(result_schema.field(0).name(), "ID");
-    assert_eq!(result_schema.field(1).name(), "NAME");
+    assert_eq!(result_schema.field(0).name(), "id");
+    assert_eq!(result_schema.field(1).name(), "name");
 
     Ok(())
 }
@@ -600,46 +579,38 @@ async fn test_parquet_integration_with_schema_adapter() -> Result<()> {
 #[cfg(feature = "parquet")]
 #[tokio::test]
 async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter(
 ) -> Result<()> {
-    // Create a temporary directory for our test file
-    let tmp_dir = TempDir::new()?;
-    let file_path = tmp_dir.path().join("test.parquet");
-    let file_path_str = file_path.to_str().unwrap();
-
     // Create test data
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("id", DataType::Int32, false),
-        Field::new("name", DataType::Utf8, true),
-    ]));
-
     let batch = RecordBatch::try_new(
-        schema.clone(),
+        Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("name", DataType::Utf8, true),
+        ])),
         vec![
             Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])),
             Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c"])),
         ],
     )?;
 
-    // Write test parquet file
-    let file = std::fs::File::create(file_path_str)?;
-    let props = WriterProperties::builder().build();
-    let mut writer = ArrowWriter::try_new(file, schema.clone(), Some(props))?;
-    writer.write(&batch)?;
-    writer.close()?;
+    let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+    let store_url = ObjectStoreUrl::parse("memory://").unwrap();
+    let path = "test.parquet";
+    write_parquet(batch.clone(), store.clone(), path).await;
+
+    // Get the actual file size from the object store
+    let object_meta = store.head(&Path::from(path)).await?;
+    let file_size = object_meta.size;
 
-    // Create a session context
+    // Create a session context and register the object store
     let ctx = SessionContext::new();
+    ctx.register_object_store(store_url.as_ref(), Arc::clone(&store));
 
     // Create a ParquetSource with the adapter factory
     let file_source = ParquetSource::default()
         .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?;
 
-    let config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::parse(format!("file://{file_path_str}"))?,
-        schema.clone(),
-        file_source,
-    )
-    .with_file(PartitionedFile::new(file_path_str, 100))
-    .build();
+    let config = FileScanConfigBuilder::new(store_url, batch.schema(),
file_source) + .with_file(PartitionedFile::new(path, file_size)) + .build(); // Create a data source executor let exec = DataSourceExec::from_data_source(config); @@ -652,10 +623,10 @@ async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter( // There should be one batch assert_eq!(batches.len(), 1); - // Verify the schema has uppercase column names + // Verify the schema has the original column names (schema adapter not applied in DataSourceExec) let result_schema = batches[0].schema(); - assert_eq!(result_schema.field(0).name(), "ID"); - assert_eq!(result_schema.field(1).name(), "NAME"); + assert_eq!(result_schema.field(0).name(), "id"); + assert_eq!(result_schema.field(1).name(), "name"); Ok(()) } From 458fc88ef7e5b46c6f45ac58ee6453060bf9a30e Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 21 Jul 2025 08:15:08 +0800 Subject: [PATCH 15/41] test: update expected schema column names in parquet integration test --- datafusion/core/tests/parquet/schema_adapter.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs index ca77fdf6b831..37f511d06fc0 100644 --- a/datafusion/core/tests/parquet/schema_adapter.rs +++ b/datafusion/core/tests/parquet/schema_adapter.rs @@ -569,8 +569,8 @@ async fn test_parquet_integration_with_schema_adapter() -> Result<()> { // Verify the schema has the original column names (schema adapter not applied in DataSourceExec) let result_schema = batches[0].schema(); - assert_eq!(result_schema.field(0).name(), "id"); - assert_eq!(result_schema.field(1).name(), "name"); + assert_eq!(result_schema.field(0).name(), "ID"); + assert_eq!(result_schema.field(1).name(), "NAME"); Ok(()) } From c985968e38932cc2fb31a54bd8227c238b818103 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 21 Jul 2025 09:57:37 +0800 Subject: [PATCH 16/41] fix test_multi_source_schema_adapter_reuse --- .../core/tests/parquet/schema_adapter.rs | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs index 37f511d06fc0..fc8dc0bfcd6f 100644 --- a/datafusion/core/tests/parquet/schema_adapter.rs +++ b/datafusion/core/tests/parquet/schema_adapter.rs @@ -642,27 +642,33 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { // Create a test factory let factory = Arc::new(UppercaseAdapterFactory {}); - // Apply the same adapter to different source types - let arrow_source = ArrowSource::default() + let arrow_source = ArrowSource::default(); + let arrow_source_with_adapter = ArrowSource::default() .with_schema_adapter_factory(factory.clone()) .unwrap(); + assert!(arrow_source.schema_adapter_factory().is_none()); + // Verify adapters were properly set + assert!(arrow_source_with_adapter.schema_adapter_factory().is_some()); #[cfg(feature = "parquet")] - let parquet_source = ParquetSource::default() + let parquet_source = ParquetSource::default(); + #[cfg(feature = "parquet")] + let parquet_source_with_adapter = ParquetSource::default() .with_schema_adapter_factory(factory.clone()) .unwrap(); + #[cfg(feature = "parquet")] + assert!(parquet_source.schema_adapter_factory().is_none()); + #[cfg(feature = "parquet")] + assert!(parquet_source_with_adapter + .schema_adapter_factory() + .is_some()); - let csv_source = CsvSource::default() + let csv_source = CsvSource::default(); + let csv_source_with_adapter = CsvSource::default() 
.with_schema_adapter_factory(factory.clone())
         .unwrap();
-
-    // Verify adapters were properly set
-    assert!(arrow_source.schema_adapter_factory().is_some());
-
-    #[cfg(feature = "parquet")]
-    assert!(parquet_source.schema_adapter_factory().is_some());
-
-    assert!(csv_source.schema_adapter_factory().is_some());
+    assert!(csv_source.schema_adapter_factory().is_none());
+    assert!(csv_source_with_adapter.schema_adapter_factory().is_some());
 
     Ok(())
 }
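Patch 17 below threads an as_any accessor through SchemaAdapterFactory and its implementations. The motivation is the standard Any-based downcast idiom for trait objects, sketched here with toy names rather than DataFusion's types:

use std::any::Any;
use std::sync::Arc;

trait Factory {
    // Expose the concrete value behind the trait object.
    fn as_any(&self) -> &dyn Any;
}

#[derive(Debug, PartialEq)]
struct Uppercase;

impl Factory for Uppercase {
    fn as_any(&self) -> &dyn Any {
        self
    }
}

fn main() {
    let f: Arc<dyn Factory> = Arc::new(Uppercase);
    // A &dyn Factory cannot be downcast directly; going through &dyn Any works.
    assert_eq!(f.as_any().downcast_ref::<Uppercase>(), Some(&Uppercase));
}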
From 8ee6d346b26258f1e2dc9d040e1e01d83493eb0b Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Mon, 21 Jul 2025 10:36:48 +0800
Subject: [PATCH 17/41] feat: add as_any method to schema adapters for
 downcasting support
---
 .../core/src/datasource/listing/table.rs      |  8 ++++++
 datafusion/core/src/datasource/mod.rs         |  5 ++++
 .../core/tests/parquet/schema_adapter.rs      | 27 ++++++++++++++++++-
 datafusion/datasource/src/schema_adapter.rs   |  8 +++++-
 4 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index 873ff7958dce..07dc0e1c8df7 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -2912,6 +2912,10 @@ mod tests {
             error_type: self.error_type,
         })
     }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
 }
 
 #[derive(Debug)]
@@ -2960,6 +2964,10 @@ mod tests {
             schema: projected_table_schema,
         })
     }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
 }
 
 #[derive(Debug)]
diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs
index 94d651ddadd5..c76fc74f4fd0 100644
--- a/datafusion/core/src/datasource/mod.rs
+++ b/datafusion/core/src/datasource/mod.rs
@@ -59,6 +59,7 @@ mod tests {
         record_batch::RecordBatch,
     };
     use datafusion_common::{record_batch, test_util::batches_to_sort_string};
+    use std::any::Any;
     use datafusion_datasource::{
         file::FileSource, file_scan_config::FileScanConfigBuilder,
@@ -214,6 +215,10 @@ mod tests {
             table_schema: projected_table_schema,
         })
     }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
 }
 
 struct TestSchemaAdapter {
diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs
index fc8dc0bfcd6f..8a45df0ad05c 100644
--- a/datafusion/core/tests/parquet/schema_adapter.rs
+++ b/datafusion/core/tests/parquet/schema_adapter.rs
@@ -78,6 +78,10 @@ impl SchemaAdapterFactory for CustomSchemaAdapterFactory {
             logical_file_schema: projected_table_schema,
         })
     }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
 }
 
 #[derive(Debug)]
@@ -395,7 +399,7 @@ async fn test_custom_schema_adapter_and_custom_expression_adapter() {
 // ----------------------------------------------------------------------
 
 /// A schema adapter factory that transforms column names to uppercase
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 struct UppercaseAdapterFactory {}
 
@@ -408,6 +412,10 @@ impl SchemaAdapterFactory for UppercaseAdapterFactory {
             table_schema: projected_table_schema,
         })
     }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
 }
 
@@ -649,6 +657,19 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> {
     assert!(arrow_source.schema_adapter_factory().is_none());
     // Verify adapters were properly set
     assert!(arrow_source_with_adapter.schema_adapter_factory().is_some());
+    let arrow_source_adapter_factory =
+        arrow_source_with_adapter.schema_adapter_factory().unwrap();
+
+    let arrow_source_adapter_factory =
+        arrow_source_with_adapter.schema_adapter_factory().unwrap();
+
+    // Verify the factory is the same as the one we created
+    assert_eq!(
+        arrow_source_adapter_factory
+            .as_any()
+            .downcast_ref::<UppercaseAdapterFactory>(),
+        Some(factory.as_ref())
+    );
 
     #[cfg(feature = "parquet")]
     let parquet_source = ParquetSource::default();
diff --git a/datafusion/datasource/src/schema_adapter.rs b/datafusion/datasource/src/schema_adapter.rs
index 5e743a3f0c23..6e959878928b 100644
--- a/datafusion/datasource/src/schema_adapter.rs
+++ b/datafusion/datasource/src/schema_adapter.rs
@@ -29,7 +29,7 @@ use datafusion_common::{
     nested_struct::{cast_column, validate_struct_compatibility},
     plan_err, ColumnStatistics,
 };
-use std::{fmt::Debug, sync::Arc};
+use std::{any::Any, fmt::Debug, sync::Arc};
 /// Function used by [`SchemaMapping`] to adapt a column from the file schema to
 /// the table schema.
 pub type CastColumnFn =
@@ -68,6 +68,8 @@ pub trait SchemaAdapterFactory: Debug + Send + Sync + 'static {
     ) -> Box<dyn SchemaAdapter> {
         self.create(Arc::clone(&projected_table_schema), projected_table_schema)
     }
+    /// Give us access to Any so callers can downcast.
+    fn as_any(&self) -> &dyn Any;
 }
 
@@ -232,6 +234,10 @@ impl SchemaAdapterFactory for DefaultSchemaAdapterFactory {
             projected_table_schema,
         })
     }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
 }

From 1a4e66e5ace275f6f67b4bbb00b1755ebca4427b Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Mon, 21 Jul 2025 10:40:04 +0800
Subject: [PATCH 18/41] fix test_multi_source_schema_adapter_reuse
---
 datafusion/core/tests/parquet/schema_adapter.rs | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs
index 8a45df0ad05c..2bcebe692f77 100644
--- a/datafusion/core/tests/parquet/schema_adapter.rs
+++ b/datafusion/core/tests/parquet/schema_adapter.rs
@@ -657,15 +657,12 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> {
     assert!(arrow_source.schema_adapter_factory().is_none());
     // Verify adapters were properly set
     assert!(arrow_source_with_adapter.schema_adapter_factory().is_some());
-    let arrow_source_adapter_factory =
-        arrow_source_with_adapter.schema_adapter_factory().unwrap();
-
-    let arrow_source_adapter_factory =
+    let _arrow_source_adapter_factory =
         arrow_source_with_adapter.schema_adapter_factory().unwrap();
 
     // Verify the factory is the same as the one we created
     assert_eq!(
-        arrow_source_adapter_factory
+        _arrow_source_adapter_factory
             .as_any()
             .downcast_ref::<UppercaseAdapterFactory>(),
         Some(factory.as_ref())
     );
From 0c6dafe1d7075f483340268a98b3810af98f347e Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Mon, 21 Jul 2025 10:49:11 +0800
Subject: [PATCH 19/41] test: update schema name assertions and enhance source
 adapter tests for Arrow, Parquet, Csv, and Json
---
 .../core/tests/parquet/schema_adapter.rs | 119 ++++++++++++------
 1 file changed, 83 insertions(+), 36 deletions(-)

diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs
index 2bcebe692f77..1e9fa62f07ef 100644
--- a/datafusion/core/tests/parquet/schema_adapter.rs
+++ b/datafusion/core/tests/parquet/schema_adapter.rs
@@ -577,8 +577,8 @@ async fn test_parquet_integration_with_schema_adapter() -> Result<()> {
     // Verify the schema has the original column names (schema adapter not applied in DataSourceExec)
     let result_schema = batches[0].schema();
-    assert_eq!(result_schema.field(0).name(), "ID");
-    assert_eq!(result_schema.field(1).name(), "NAME");
+    assert_eq!(result_schema.field(0).name(), "id");
+    assert_eq!(result_schema.field(1).name(), "name");
 
     Ok(())
 }
@@ -650,43 +650,90 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> {
     // Create a test factory
     let factory = Arc::new(UppercaseAdapterFactory {});
 
-    let arrow_source = ArrowSource::default();
-    let arrow_source_with_adapter = ArrowSource::default()
-        .with_schema_adapter_factory(factory.clone())
-        .unwrap();
-    assert!(arrow_source.schema_adapter_factory().is_none());
-    // Verify adapters were properly set
-    assert!(arrow_source_with_adapter.schema_adapter_factory().is_some());
-    let _arrow_source_adapter_factory =
-        arrow_source_with_adapter.schema_adapter_factory().unwrap();
-
-    // Verify the factory is the same as the one we created
-    assert_eq!(
-        _arrow_source_adapter_factory
-            .as_any()
-            .downcast_ref::<UppercaseAdapterFactory>(),
-        Some(factory.as_ref())
-    );
+    // Test ArrowSource
+    {
+        let source = ArrowSource::default();
+        let source_with_adapter = source
+            .clone()
+            .with_schema_adapter_factory(factory.clone())
+            .unwrap();
+
+        let base_source: Arc<dyn FileSource> = source.into();
+        assert!(base_source.schema_adapter_factory().is_none());
+        assert!(source_with_adapter.schema_adapter_factory().is_some());
+
+        let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap();
+        assert_eq!(
+            retrieved_factory
+                .as_any()
+                .downcast_ref::<UppercaseAdapterFactory>(),
+            Some(factory.as_ref())
+        );
+    }
 
+    // Test ParquetSource
     #[cfg(feature = "parquet")]
-    let parquet_source = ParquetSource::default();
-    #[cfg(feature = "parquet")]
-    let parquet_source_with_adapter = ParquetSource::default()
-        .with_schema_adapter_factory(factory.clone())
-        .unwrap();
-    #[cfg(feature = "parquet")]
-    assert!(parquet_source.schema_adapter_factory().is_none());
-    #[cfg(feature = "parquet")]
-    assert!(parquet_source_with_adapter
-        .schema_adapter_factory()
-        .is_some());
+    {
+        let source = ParquetSource::default();
+        let source_with_adapter = source
+            .clone()
+            .with_schema_adapter_factory(factory.clone())
+            .unwrap();
+
+        let base_source: Arc<dyn FileSource> = source.into();
+        assert!(base_source.schema_adapter_factory().is_none());
+        assert!(source_with_adapter.schema_adapter_factory().is_some());
+
+        let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap();
+        assert_eq!(
+            retrieved_factory
+                .as_any()
+                .downcast_ref::<UppercaseAdapterFactory>(),
+            Some(factory.as_ref())
+        );
+    }
 
-    let csv_source = CsvSource::default();
-    let csv_source_with_adapter = CsvSource::default()
         .with_schema_adapter_factory(factory.clone())
         .unwrap();
-    assert!(csv_source.schema_adapter_factory().is_none());
-    assert!(csv_source_with_adapter.schema_adapter_factory().is_some());
+    // Test CsvSource
+    {
+        let source = CsvSource::default();
+        let source_with_adapter = source
+            .clone()
+            .with_schema_adapter_factory(factory.clone())
+            .unwrap();
+
+        let base_source: Arc<dyn FileSource> = source.into();
+        assert!(base_source.schema_adapter_factory().is_none());
+        assert!(source_with_adapter.schema_adapter_factory().is_some());
+
+        let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap();
+        assert_eq!(
+            retrieved_factory
+                .as_any()
+                .downcast_ref::<UppercaseAdapterFactory>(),
+            Some(factory.as_ref())
+        );
+    }
+
+    // Test JsonSource
+    {
+        let source = JsonSource::default();
+        let source_with_adapter = source
+            .clone()
+            .with_schema_adapter_factory(factory.clone())
+            .unwrap();
+
+        let base_source: Arc<dyn FileSource> = source.into();
+        assert!(base_source.schema_adapter_factory().is_none());
+        assert!(source_with_adapter.schema_adapter_factory().is_some());
+
+        let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap();
+        assert_eq!(
+            retrieved_factory
+                .as_any()
+                .downcast_ref::<UppercaseAdapterFactory>(),
+            Some(factory.as_ref())
+        );
+    }
 
     Ok(())
 }
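Patch 20 below derives PartialEq on the test factory so the downcast result can be compared with assert_eq!. Because these factories are field-less unit structs, any two instances compare equal, so the assertion really verifies the concrete type rather than that the very same object came back. When pointer identity is what matters, Arc::ptr_eq is the stricter check; a small illustrative sketch:

use std::sync::Arc;

fn main() {
    let a: Arc<str> = Arc::from("factory");
    let b = Arc::clone(&a);
    let c: Arc<str> = Arc::from("factory");

    // Same allocation: identity holds.
    assert!(Arc::ptr_eq(&a, &b));
    // Equal contents in a different allocation: identity fails, equality holds.
    assert!(!Arc::ptr_eq(&a, &c));
    assert_eq!(a, c);
}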
From 210260a708f388cbb02ddca9964cebab33ec363a Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Mon, 21 Jul 2025 10:53:41 +0800
Subject: [PATCH 20/41] test: enhance multi-source schema adapter reuse tests
 and update TestSchemaAdapterFactory for equality comparison
---
 .../core/tests/parquet/schema_adapter.rs | 26 ++++++++++++-------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs
index 1e9fa62f07ef..837cee9c4223 100644
--- a/datafusion/core/tests/parquet/schema_adapter.rs
+++ b/datafusion/core/tests/parquet/schema_adapter.rs
@@ -657,11 +657,11 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> {
             .clone()
             .with_schema_adapter_factory(factory.clone())
             .unwrap();
-    
+
         let base_source: Arc<dyn FileSource> = source.into();
         assert!(base_source.schema_adapter_factory().is_none());
         assert!(source_with_adapter.schema_adapter_factory().is_some());
-    
+
         let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap();
@@ -679,11 +679,11 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> {
             .clone()
             .with_schema_adapter_factory(factory.clone())
             .unwrap();
-    
+
         let base_source: Arc<dyn FileSource> = source.into();
         assert!(base_source.schema_adapter_factory().is_none());
         assert!(source_with_adapter.schema_adapter_factory().is_some());
-    
+
         let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap();
@@ -700,11 +700,11 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> {
             .clone()
             .with_schema_adapter_factory(factory.clone())
             .unwrap();
-    
+
         let base_source: Arc<dyn FileSource> = source.into();
         assert!(base_source.schema_adapter_factory().is_none());
         assert!(source_with_adapter.schema_adapter_factory().is_some());
-    
+
         let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap();
@@ -721,11 +721,11 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> {
             .clone()
             .with_schema_adapter_factory(factory.clone())
             .unwrap();
-    
+
         let base_source: Arc<dyn FileSource> = source.into();
         assert!(base_source.schema_adapter_factory().is_none());
         assert!(source_with_adapter.schema_adapter_factory().is_some());
-    
+
         let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap();
@@ -759,7 +759,7 @@ fn test_from_implementations() {
 }
 
 /// A simple test schema adapter factory that doesn't modify the schema
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 struct TestSchemaAdapterFactory {}
 
@@ -843,7 +843,15 @@ fn test_schema_adapter_preservation() {
     let config = config_builder.build();
 
     // Verify the schema adapter factory is present in the file source
+    let test_factory = TestSchemaAdapterFactory {};
     assert!(config.file_source().schema_adapter_factory().is_some());
+    let _adapter_factory = config.file_source().schema_adapter_factory().unwrap();
+    assert_eq!(
+        _adapter_factory
+            .as_any()
+            .downcast_ref::<TestSchemaAdapterFactory>(),
+        Some(&test_factory)
+    );
 }
 
 /// A test source for testing schema adapters
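Patch 21 below makes the uppercase adapter resolve table columns against file columns without regard to case, both in map_column_index and when building the projection. The heart of the fix is a case-insensitive position lookup of the kind sketched here (standalone, with hypothetical column names):

// Find the file-schema position of a table column, ignoring ASCII case.
fn find_position(file_columns: &[&str], table_column: &str) -> Option<usize> {
    file_columns
        .iter()
        .position(|name| name.eq_ignore_ascii_case(table_column))
}

fn main() {
    let file_columns = ["id", "name", "extra"];
    assert_eq!(find_position(&file_columns, "ID"), Some(0));
    assert_eq!(find_position(&file_columns, "NAME"), Some(1));
    // Table columns with no match simply do not map.
    assert_eq!(find_position(&file_columns, "MISSING"), None);
}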
From bb259480433a385d7459d4bc2aca656b175a15f4 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Mon, 21 Jul 2025 11:19:52 +0800
Subject: [PATCH 21/41] fix: test_parquet_integration_with_schema_adapter
---
 .../core/tests/parquet/schema_adapter.rs | 31 +++++++++++++++----
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs
index 837cee9c4223..9a67c4f6e17e 100644
--- a/datafusion/core/tests/parquet/schema_adapter.rs
+++ b/datafusion/core/tests/parquet/schema_adapter.rs
@@ -427,17 +427,30 @@ impl SchemaAdapter for UppercaseAdapter {
     fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize> {
         let field = self.table_schema.field(index);
+        let uppercase_name = field.name().to_uppercase();
         file_schema
             .fields()
             .iter()
-            .position(|f| f.name() == field.name())
+            .position(|f| f.name().to_uppercase() == uppercase_name)
     }
 
     fn map_schema(
         &self,
         file_schema: &Schema,
     ) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
-        let projection = (0..file_schema.fields().len()).collect::<Vec<_>>();
+        let mut projection = Vec::new();
+
+        // Map each field in the table schema to the corresponding field in the file schema
+        for table_field in self.table_schema.fields() {
+            let uppercase_name = table_field.name().to_uppercase();
+            if let Some(pos) = file_schema
+                .fields()
+                .iter()
+                .position(|f| f.name().to_uppercase() == uppercase_name)
+            {
+                projection.push(pos);
+            }
+        }
 
         let mapper = UppercaseSchemaMapper {
             output_schema: self.output_schema(),
@@ -560,7 +573,13 @@ async fn test_parquet_integration_with_schema_adapter() -> Result<()> {
     let file_source = ParquetSource::default()
         .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?;
 
-    let config = FileScanConfigBuilder::new(store_url, batch.schema(), file_source)
+    // Create a table schema with uppercase column names
+    let table_schema = Arc::new(Schema::new(vec![
+        Field::new("ID", DataType::Int32, false),
+        Field::new("NAME", DataType::Utf8, true),
+    ]));
+
+    let config = FileScanConfigBuilder::new(store_url, table_schema.clone(), file_source)
         .with_file(PartitionedFile::new(path, file_size))
         .build();
 
@@ -575,10 +594,10 @@ async fn test_parquet_integration_with_schema_adapter() -> Result<()> {
     // There should be one batch
     assert_eq!(batches.len(), 1);
 
-    // Verify the schema has the original column names (schema adapter not applied in DataSourceExec)
+    // Verify the schema has the uppercase column names
     let result_schema = batches[0].schema();
-    assert_eq!(result_schema.field(0).name(), "id");
-    assert_eq!(result_schema.field(1).name(), "name");
+    assert_eq!(result_schema.field(0).name(), "ID");
+    assert_eq!(result_schema.field(1).name(), "NAME");
 
     Ok(())
 }

From 5f2f7038abcad8e300513cf12db0e4ad9aebb3ba Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Tue, 22 Jul 2025 08:52:07 +0800
Subject: [PATCH 22/41] refactor(schema_adapter): remove dead code and clean
 up whitespace
---
 datafusion/core/tests/parquet/schema_adapter.rs | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs
index ffb71eace716..73d56ee2035d 100644
--- a/datafusion/core/tests/parquet/schema_adapter.rs
+++ b/datafusion/core/tests/parquet/schema_adapter.rs
@@ -433,7 +433,7 @@ impl SchemaAdapter for UppercaseAdapter {
         file_schema: &Schema,
     ) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
         let mut projection = Vec::new();
-        
+
         // Map each field in the table schema to the corresponding field in the file schema
         for table_field in self.table_schema.fields() {
             let uppercase_name = table_field.name().to_uppercase();
@@ -484,11 +484,6 @@ impl SchemaMapper for TestSchemaMapping {
 }
 
 impl UppercaseAdapter {
-    #[allow(dead_code)]
-    fn adapt(&self, record_batch: RecordBatch) -> Result<RecordBatch> {
-        Ok(record_batch)
-    }
-
     fn output_schema(&self) -> SchemaRef {
         let fields: Vec<Field> = self
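Patch 23 below adds the same four-line as_any body to two more implementations. The repetition is hard to avoid: a usable default method on the trait cannot return self as &dyn Any, because that coercion needs the concrete Sized + 'static type, and a default method constrained that way would not be callable through the trait object. A toy illustration of where the cast is allowed:

use std::any::Any;

trait Adapter {
    fn as_any(&self) -> &dyn Any;
}

struct Custom;

impl Adapter for Custom {
    fn as_any(&self) -> &dyn Any {
        // Here `self` is `&Custom`, a concrete type, so coercing to `&dyn Any`
        // compiles; in a trait default body, `self` would be `&Self` with
        // `Self: ?Sized`, and the coercion would be rejected.
        self
    }
}

fn main() {
    let a: Box<dyn Adapter> = Box::new(Custom);
    assert!(a.as_any().downcast_ref::<Custom>().is_some());
}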
From 761a07f1a21197285083188ba365836eb0fa89a6 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Tue, 22 Jul 2025 09:44:56 +0800
Subject: [PATCH 23/41] feat(schema_adapter): add as_any method for dynamic
 type access
---
 datafusion/datasource-parquet/src/opener.rs                | 4 ++++
 .../datasource-parquet/tests/apply_schema_adapter_tests.rs | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs
index 7c208d1426ac..a809e46fe979 100644
--- a/datafusion/datasource-parquet/src/opener.rs
+++ b/datafusion/datasource-parquet/src/opener.rs
@@ -1213,6 +1213,10 @@ mod test {
         ) -> Box<dyn SchemaAdapter> {
             Box::new(CustomSchemaAdapter)
         }
+
+        fn as_any(&self) -> &dyn std::any::Any {
+            self
+        }
     }
 
     // Test that if no expression rewriter is provided we use a schema adapter to adapt the data to the expression
diff --git a/datafusion/datasource-parquet/tests/apply_schema_adapter_tests.rs b/datafusion/datasource-parquet/tests/apply_schema_adapter_tests.rs
index e9288a5f80f6..e15393e1fb3a 100644
--- a/datafusion/datasource-parquet/tests/apply_schema_adapter_tests.rs
+++ b/datafusion/datasource-parquet/tests/apply_schema_adapter_tests.rs
@@ -47,6 +47,10 @@ mod parquet_adapter_tests {
                 prefix: self.prefix.clone(),
             })
         }
+
+        fn as_any(&self) -> &dyn std::any::Any {
+            self
+        }
     }
 
     /// A test schema adapter that adds prefix to column names
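Patch 24 below collapses the four hand-written per-source blocks into one generic helper. The useful part to internalize is the bound set such a helper needs; a compilable analogue with toy types (every name here is illustrative, not DataFusion's API):

use std::sync::Arc;

trait Source {
    fn file_type(&self) -> &str;
}

#[derive(Default, Clone)]
struct Csv;

impl Source for Csv {
    fn file_type(&self) -> &str {
        "csv"
    }
}

impl From<Csv> for Arc<dyn Source> {
    fn from(value: Csv) -> Self {
        Arc::new(value)
    }
}

// One helper covers every source that is Default-constructible, Clone-able,
// and convertible into an Arc'd trait object.
fn check<T: Source + Default + Clone + Into<Arc<dyn Source>>>(expected: &str) {
    let erased: Arc<dyn Source> = T::default().clone().into();
    assert_eq!(erased.file_type(), expected);
}

fn main() {
    check::<Csv>("csv");
}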
 #[cfg(feature = "parquet")]
-#[tokio::test]
-async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter(
-) -> Result<()> {
-    // Create test data
-    let batch = RecordBatch::try_new(
-        Arc::new(Schema::new(vec![
-            Field::new("id", DataType::Int32, false),
-            Field::new("name", DataType::Utf8, true),
-        ])),
-        vec![
-            Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])),
-            Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c"])),
-        ],
-    )?;
-
-    let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
-    let store_url = ObjectStoreUrl::parse("memory://").unwrap();
-    let path = "test.parquet";
-    write_parquet(batch.clone(), store.clone(), path).await;
-
-    // Get the actual file size from the object store
-    let object_meta = store.head(&Path::from(path)).await?;
-    let file_size = object_meta.size;
-
-    // Create a session context and register the object store
-    let ctx = SessionContext::new();
-    ctx.register_object_store(store_url.as_ref(), Arc::clone(&store));
+async fn setup_parquet_test_with_schema_adapter(
+    id_data: Vec<i32>,
+    name_data: Vec<&str>,
+    adapter_factory: Arc<dyn SchemaAdapterFactory>,
+    table_schema: SchemaRef,
+) -> Result<(Arc<DataSourceExec>, SessionContext)> {
+    let batch = create_test_batch(id_data, name_data)?;
+    let (store, store_url) = setup_test_store_with_parquet(batch, "test.parquet").await?;
+    let file_size = get_file_size(store.clone(), "test.parquet").await?;
+    let ctx = setup_test_context(store, &store_url).await?;
 
-    // Create a ParquetSource with the adapter factory
     let file_source = ParquetSource::default()
-        .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?;
+        .with_schema_adapter_factory(adapter_factory)?;
 
-    let config = FileScanConfigBuilder::new(store_url, batch.schema(), file_source)
-        .with_file(PartitionedFile::new(path, file_size))
+    let config = FileScanConfigBuilder::new(store_url, table_schema, file_source)
+        .with_file(PartitionedFile::new("test.parquet", file_size))
         .build();
 
-    // Create a data source executor
     let exec = DataSourceExec::from_data_source(config);
+    Ok((exec, ctx))
+}
 
-    // Collect results
+/// Executes a data source and returns the resulting batches
+async fn execute_data_source(
+    exec: Arc<DataSourceExec>,
+    ctx: SessionContext,
+) -> Result<Vec<RecordBatch>> {
     let task_ctx = ctx.task_ctx();
     let stream = exec.execute(0, task_ctx)?;
-    let batches = datafusion::physical_plan::common::collect(stream).await?;
-
-    // There should be one batch
-    assert_eq!(batches.len(), 1);
-
-    // Verify the schema has the original column names (schema adapter not applied in DataSourceExec)
-    let result_schema = batches[0].schema();
-    assert_eq!(result_schema.field(0).name(), "id");
-    assert_eq!(result_schema.field(1).name(), "name");
-
-    Ok(())
+    datafusion::physical_plan::common::collect(stream).await
 }
 
-#[tokio::test]
-async fn test_multi_source_schema_adapter_reuse() -> Result<()> {
-    // This test verifies that the same schema adapter factory can be reused
-    // across different file source types. This is important for ensuring that:
-    // 1. The schema adapter factory interface works uniformly across all source types
-    // 2. The factory can be shared and cloned efficiently using Arc
-    // 3. Various data source implementations correctly implement the schema adapter factory pattern
-
-    // Create a test factory
-    let factory = Arc::new(UppercaseAdapterFactory {});
+// Common test helper functions
 
-    // Test ArrowSource
-    {
-        let source = ArrowSource::default();
-        let source_with_adapter = source
-            .clone()
-            .with_schema_adapter_factory(factory.clone())
-            .unwrap();
-
-        let base_source: Arc<dyn FileSource> = source.into();
-        assert!(base_source.schema_adapter_factory().is_none());
-        assert!(source_with_adapter.schema_adapter_factory().is_some());
-
-        let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap();
-        assert_eq!(
-            retrieved_factory
-                .as_any()
-                .downcast_ref::<UppercaseAdapterFactory>(),
-            Some(factory.as_ref())
-        );
-    }
+/// Creates a test RecordBatch with the provided schema and data
+fn create_test_batch(
+    id_data: Vec<i32>,
+    name_data: Vec<&str>,
+) -> Result<RecordBatch> {
+    Ok(RecordBatch::try_new(
+        Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("name", DataType::Utf8, true),
+        ])),
+        vec![
+            Arc::new(arrow::array::Int32Array::from(id_data)),
+            Arc::new(arrow::array::StringArray::from(name_data)),
+        ],
+    )?)
+}
 
-    // Test ParquetSource
-    #[cfg(feature = "parquet")]
-    {
-        let source = ParquetSource::default();
-        let source_with_adapter = source
-            .clone()
-            .with_schema_adapter_factory(factory.clone())
-            .unwrap();
-
-        let base_source: Arc<dyn FileSource> = source.into();
-        assert!(base_source.schema_adapter_factory().is_none());
-        assert!(source_with_adapter.schema_adapter_factory().is_some());
-
-        let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap();
-        assert_eq!(
-            retrieved_factory
-                .as_any()
-                .downcast_ref::<UppercaseAdapterFactory>(),
-            Some(factory.as_ref())
-        );
-    }
+/// Sets up an in-memory object store and writes test data to parquet
+async fn setup_test_store_with_parquet(
+    batch: RecordBatch,
+    path: &str,
+) -> Result<(Arc<dyn ObjectStore>, ObjectStoreUrl)> {
+    let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+    let store_url = ObjectStoreUrl::parse("memory://")?;
+    write_parquet(batch, store.clone(), path).await;
+    Ok((store, store_url))
+}
 
-    // Test CsvSource
-    {
-        let source = CsvSource::default();
-        let source_with_adapter = source
-            .clone()
-            .with_schema_adapter_factory(factory.clone())
-            .unwrap();
-
-        let base_source: Arc<dyn FileSource> = source.into();
-        assert!(base_source.schema_adapter_factory().is_none());
-        assert!(source_with_adapter.schema_adapter_factory().is_some());
-
-        let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap();
-        assert_eq!(
-            retrieved_factory
-                .as_any()
-                .downcast_ref::<UppercaseAdapterFactory>(),
-            Some(factory.as_ref())
-        );
-    }
+/// Gets file size from object store
+async fn get_file_size(store: Arc<dyn ObjectStore>, path: &str) -> Result<u64> {
+    let object_meta = store.head(&Path::from(path)).await?;
+    Ok(object_meta.size)
+}
 
-    // Test JsonSource
-    {
-        let source = JsonSource::default();
-        let source_with_adapter = source
-            .clone()
-            .with_schema_adapter_factory(factory.clone())
-            .unwrap();
-
-        let base_source: Arc<dyn FileSource> = source.into();
-        assert!(base_source.schema_adapter_factory().is_none());
-        assert!(source_with_adapter.schema_adapter_factory().is_some());
-
-        let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap();
-        assert_eq!(
-            retrieved_factory
-                .as_any()
-                .downcast_ref::<UppercaseAdapterFactory>(),
-            Some(factory.as_ref())
-        );
-    }
+/// Creates a session context with object store registered
+async fn setup_test_context(
+    store: Arc<dyn ObjectStore>,
+    store_url: &ObjectStoreUrl,
+) -> Result<SessionContext> {
+    let ctx = SessionContext::new();
+    ctx.register_object_store(store_url.as_ref(),
Arc::clone(&store)); + Ok(ctx) +} - Ok(()) +/// Creates a table schema with uppercase column names for testing schema adapters +fn create_uppercase_table_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("ID", DataType::Int32, false), + Field::new("NAME", DataType::Utf8, true), + ])) } // Helper function to test From for Arc implementations From f00cb4253bcd7d22d7db115dcf1f38156df39742 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 22 Jul 2025 09:46:55 +0800 Subject: [PATCH 25/41] Revert "refactor tests, extract helper functions" This reverts commit 414de48897caf9fd68bef6a49871c3cc22b71a33. --- .../core/tests/parquet/schema_adapter.rs | 320 ++++++++++-------- 1 file changed, 180 insertions(+), 140 deletions(-) diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs index 18999ba8e497..73d56ee2035d 100644 --- a/datafusion/core/tests/parquet/schema_adapter.rs +++ b/datafusion/core/tests/parquet/schema_adapter.rs @@ -533,177 +533,217 @@ impl SchemaMapper for UppercaseSchemaMapper { #[cfg(feature = "parquet")] #[tokio::test] async fn test_parquet_integration_with_schema_adapter() -> Result<()> { - let (exec, ctx) = setup_parquet_test_with_schema_adapter( - vec![1, 2, 3], - vec!["a", "b", "c"], - Arc::new(UppercaseAdapterFactory {}), - create_uppercase_table_schema(), - ).await?; - - let batches = execute_data_source(exec, ctx).await?; - assert_parquet_results(&batches, vec!["ID", "NAME"]) -} - -#[cfg(feature = "parquet")] -#[tokio::test] -async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter( -) -> Result<()> { - let (exec, ctx) = setup_parquet_test_with_schema_adapter( - vec![1, 2, 3], - vec!["a", "b", "c"], - Arc::new(UppercaseAdapterFactory {}), - create_test_batch(vec![1, 2, 3], vec!["a", "b", "c"])?.schema(), - ).await?; - - let batches = execute_data_source(exec, ctx).await?; - assert_parquet_results(&batches, vec!["id", "name"]) -} - -// Helper function to test schema adapter factory reuse for a specific source type -fn test_schema_adapter_factory_reuse>>( - factory: Arc, -) { - let source = T::default(); - let source_with_adapter = source - .clone() - .with_schema_adapter_factory(factory.clone()) - .unwrap(); + // Create test data + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, true), + ])), + vec![ + Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])), + Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c"])), + ], + )?; - let base_source: Arc = source.into(); - assert!(base_source.schema_adapter_factory().is_none()); - assert!(source_with_adapter.schema_adapter_factory().is_some()); + let store = Arc::new(InMemory::new()) as Arc; + let store_url = ObjectStoreUrl::parse("memory://").unwrap(); + let path = "test.parquet"; + write_parquet(batch.clone(), store.clone(), path).await; - let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); - assert_eq!( - retrieved_factory - .as_any() - .downcast_ref::(), - Some(&*factory - .as_any() - .downcast_ref::() - .unwrap()) - ); -} + // Get the actual file size from the object store + let object_meta = store.head(&Path::from(path)).await?; + let file_size = object_meta.size; -#[test] -fn test_multi_source_schema_adapter_reuse() { - test_schema_adapter_reuse_across_sources(Arc::new(UppercaseAdapterFactory {})) -} + // Create a session context and register the object store + let ctx = SessionContext::new(); + 
ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); -/// Tests schema adapter factory reuse across all supported file source types -fn test_schema_adapter_reuse_across_sources(factory: Arc) { - // Test ArrowSource - test_schema_adapter_factory_reuse::(factory.clone()); + // Create a ParquetSource with the adapter factory + let file_source = ParquetSource::default() + .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?; - // Test ParquetSource - #[cfg(feature = "parquet")] - test_schema_adapter_factory_reuse::(factory.clone()); + // Create a table schema with uppercase column names + let table_schema = Arc::new(Schema::new(vec![ + Field::new("ID", DataType::Int32, false), + Field::new("NAME", DataType::Utf8, true), + ])); - // Test CsvSource - test_schema_adapter_factory_reuse::(factory.clone()); + let config = FileScanConfigBuilder::new(store_url, table_schema.clone(), file_source) + .with_file(PartitionedFile::new(path, file_size)) + .build(); - // Test JsonSource - test_schema_adapter_factory_reuse::(factory.clone()); -} + // Create a data source executor + let exec = DataSourceExec::from_data_source(config); -// Common assertion utilities + // Collect results + let task_ctx = ctx.task_ctx(); + let stream = exec.execute(0, task_ctx)?; + let batches = datafusion::physical_plan::common::collect(stream).await?; -/// Asserts parquet test results have expected field names -fn assert_parquet_results(batches: &[RecordBatch], expected_field_names: Vec<&str>) -> Result<()> { + // There should be one batch assert_eq!(batches.len(), 1); + + // Verify the schema has the uppercase column names let result_schema = batches[0].schema(); - for (i, expected_name) in expected_field_names.iter().enumerate() { - assert_eq!(result_schema.field(i).name(), *expected_name); - } + assert_eq!(result_schema.field(0).name(), "ID"); + assert_eq!(result_schema.field(1).name(), "NAME"); + Ok(()) } -/// Sets up a complete parquet test with schema adapter #[cfg(feature = "parquet")] -async fn setup_parquet_test_with_schema_adapter( - id_data: Vec, - name_data: Vec<&str>, - adapter_factory: Arc, - table_schema: SchemaRef, -) -> Result<(Arc, SessionContext)> { - let batch = create_test_batch(id_data, name_data)?; - let (store, store_url) = setup_test_store_with_parquet(batch, "test.parquet").await?; - let file_size = get_file_size(store.clone(), "test.parquet").await?; - let ctx = setup_test_context(store, &store_url).await?; +#[tokio::test] +async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter( +) -> Result<()> { + // Create test data + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, true), + ])), + vec![ + Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])), + Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c"])), + ], + )?; + + let store = Arc::new(InMemory::new()) as Arc; + let store_url = ObjectStoreUrl::parse("memory://").unwrap(); + let path = "test.parquet"; + write_parquet(batch.clone(), store.clone(), path).await; + + // Get the actual file size from the object store + let object_meta = store.head(&Path::from(path)).await?; + let file_size = object_meta.size; + + // Create a session context and register the object store + let ctx = SessionContext::new(); + ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); + // Create a ParquetSource with the adapter factory let file_source = ParquetSource::default() - 
.with_schema_adapter_factory(adapter_factory)?; + .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?; - let config = FileScanConfigBuilder::new(store_url, table_schema, file_source) - .with_file(PartitionedFile::new("test.parquet", file_size)) + let config = FileScanConfigBuilder::new(store_url, batch.schema(), file_source) + .with_file(PartitionedFile::new(path, file_size)) .build(); + // Create a data source executor let exec = DataSourceExec::from_data_source(config); - Ok((exec, ctx)) -} -/// Executes a data source and returns the resulting batches -async fn execute_data_source( - exec: Arc, - ctx: SessionContext, -) -> Result> { + // Collect results let task_ctx = ctx.task_ctx(); let stream = exec.execute(0, task_ctx)?; - datafusion::physical_plan::common::collect(stream).await -} + let batches = datafusion::physical_plan::common::collect(stream).await?; -// Common test helper functions + // There should be one batch + assert_eq!(batches.len(), 1); -/// Creates a test RecordBatch with the provided schema and data -fn create_test_batch( - id_data: Vec, - name_data: Vec<&str>, -) -> Result { - Ok(RecordBatch::try_new( - Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, true), - ])), - vec![ - Arc::new(arrow::array::Int32Array::from(id_data)), - Arc::new(arrow::array::StringArray::from(name_data)), - ], - )?) -} + // Verify the schema has the original column names (schema adapter not applied in DataSourceExec) + let result_schema = batches[0].schema(); + assert_eq!(result_schema.field(0).name(), "id"); + assert_eq!(result_schema.field(1).name(), "name"); -/// Sets up an in-memory object store and writes test data to parquet -async fn setup_test_store_with_parquet( - batch: RecordBatch, - path: &str, -) -> Result<(Arc, ObjectStoreUrl)> { - let store = Arc::new(InMemory::new()) as Arc; - let store_url = ObjectStoreUrl::parse("memory://")?; - write_parquet(batch, store.clone(), path).await; - Ok((store, store_url)) + Ok(()) } -/// Gets file size from object store -async fn get_file_size(store: Arc, path: &str) -> Result { - let object_meta = store.head(&Path::from(path)).await?; - Ok(object_meta.size) -} +#[tokio::test] +async fn test_multi_source_schema_adapter_reuse() -> Result<()> { + // This test verifies that the same schema adapter factory can be reused + // across different file source types. This is important for ensuring that: + // 1. The schema adapter factory interface works uniformly across all source types + // 2. The factory can be shared and cloned efficiently using Arc + // 3. 
Various data source implementations correctly implement the schema adapter factory pattern -/// Creates a session context with object store registered -async fn setup_test_context( - store: Arc, - store_url: &ObjectStoreUrl, -) -> Result { - let ctx = SessionContext::new(); - ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); - Ok(ctx) -} + // Create a test factory + let factory = Arc::new(UppercaseAdapterFactory {}); -/// Creates a table schema with uppercase column names for testing schema adapters -fn create_uppercase_table_schema() -> SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("ID", DataType::Int32, false), - Field::new("NAME", DataType::Utf8, true), - ])) + // Test ArrowSource + { + let source = ArrowSource::default(); + let source_with_adapter = source + .clone() + .with_schema_adapter_factory(factory.clone()) + .unwrap(); + + let base_source: Arc = source.into(); + assert!(base_source.schema_adapter_factory().is_none()); + assert!(source_with_adapter.schema_adapter_factory().is_some()); + + let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); + assert_eq!( + retrieved_factory + .as_any() + .downcast_ref::(), + Some(factory.as_ref()) + ); + } + + // Test ParquetSource + #[cfg(feature = "parquet")] + { + let source = ParquetSource::default(); + let source_with_adapter = source + .clone() + .with_schema_adapter_factory(factory.clone()) + .unwrap(); + + let base_source: Arc = source.into(); + assert!(base_source.schema_adapter_factory().is_none()); + assert!(source_with_adapter.schema_adapter_factory().is_some()); + + let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); + assert_eq!( + retrieved_factory + .as_any() + .downcast_ref::(), + Some(factory.as_ref()) + ); + } + + // Test CsvSource + { + let source = CsvSource::default(); + let source_with_adapter = source + .clone() + .with_schema_adapter_factory(factory.clone()) + .unwrap(); + + let base_source: Arc = source.into(); + assert!(base_source.schema_adapter_factory().is_none()); + assert!(source_with_adapter.schema_adapter_factory().is_some()); + + let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); + assert_eq!( + retrieved_factory + .as_any() + .downcast_ref::(), + Some(factory.as_ref()) + ); + } + + // Test JsonSource + { + let source = JsonSource::default(); + let source_with_adapter = source + .clone() + .with_schema_adapter_factory(factory.clone()) + .unwrap(); + + let base_source: Arc = source.into(); + assert!(base_source.schema_adapter_factory().is_none()); + assert!(source_with_adapter.schema_adapter_factory().is_some()); + + let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); + assert_eq!( + retrieved_factory + .as_any() + .downcast_ref::(), + Some(factory.as_ref()) + ); + } + + Ok(()) } // Helper function to test From for Arc implementations From 06d4ea303c252bf7110574a6de01dd1c7a315e42 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 22 Jul 2025 09:52:33 +0800 Subject: [PATCH 26/41] refactor(schema_adapter): remove outdated comments from test file --- datafusion/core/tests/parquet/schema_adapter.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs index 73d56ee2035d..571a4eed8553 100644 --- a/datafusion/core/tests/parquet/schema_adapter.rs +++ b/datafusion/core/tests/parquet/schema_adapter.rs @@ -388,10 +388,6 @@ async fn test_custom_schema_adapter_and_custom_expression_adapter() { 
assert_batches_eq!(expected, &batches); } -// ---------------------------------------------------------------------- -// Tests migrated from schema_adaptation/schema_adapter_integration_tests.rs -// ---------------------------------------------------------------------- - /// A schema adapter factory that transforms column names to uppercase #[derive(Debug, PartialEq)] struct UppercaseAdapterFactory {} From 440fbd42652423a338f32767287b66b49d751530 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 22 Jul 2025 09:52:53 +0800 Subject: [PATCH 27/41] UNPICK Revert "refactor(schema_adapter): remove outdated comments from test file" This reverts commit 06d4ea303c252bf7110574a6de01dd1c7a315e42. --- datafusion/core/tests/parquet/schema_adapter.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs index 571a4eed8553..73d56ee2035d 100644 --- a/datafusion/core/tests/parquet/schema_adapter.rs +++ b/datafusion/core/tests/parquet/schema_adapter.rs @@ -388,6 +388,10 @@ async fn test_custom_schema_adapter_and_custom_expression_adapter() { assert_batches_eq!(expected, &batches); } +// ---------------------------------------------------------------------- +// Tests migrated from schema_adaptation/schema_adapter_integration_tests.rs +// ---------------------------------------------------------------------- + /// A schema adapter factory that transforms column names to uppercase #[derive(Debug, PartialEq)] struct UppercaseAdapterFactory {} From 18e065730d0fbdf8b4445a78a1501fd6ff3d8862 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 22 Jul 2025 09:56:50 +0800 Subject: [PATCH 28/41] fix fmt errors --- datafusion/core/src/datasource/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index c76fc74f4fd0..5cd034c43a0d 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -59,7 +59,6 @@ mod tests { record_batch::RecordBatch, }; use datafusion_common::{record_batch, test_util::batches_to_sort_string}; - use std::any::Any; use datafusion_datasource::{ file::FileSource, file_scan_config::FileScanConfigBuilder, @@ -72,6 +71,7 @@ mod tests { }; use datafusion_datasource_parquet::source::ParquetSource; use datafusion_physical_plan::collect; + use std::any::Any; use std::{fs, sync::Arc}; use tempfile::TempDir; From a3794e6ea408a03d1e7c904da3ede7decb2f19fa Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 22 Jul 2025 09:57:45 +0800 Subject: [PATCH 29/41] refactor(tests): consolidate std imports for clarity --- datafusion/core/src/datasource/mod.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 5cd034c43a0d..1a13d9c39958 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -71,8 +71,7 @@ mod tests { }; use datafusion_datasource_parquet::source::ParquetSource; use datafusion_physical_plan::collect; - use std::any::Any; - use std::{fs, sync::Arc}; + use std::{any::Any, fs, sync::Arc}; use tempfile::TempDir; #[tokio::test] From a2f2fc06d12b39a9cb050760fab998e2a4a16597 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 22 Jul 2025 10:18:48 +0800 Subject: [PATCH 30/41] Remove the duplicated tests and related helper code from the schema adapter test module, keeping only the unique tests --- 
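The duplicated suite removed below asserted factory identity through `as_any`
downcasts. For context, a minimal sketch of that check, assuming the `as_any`
accessor added in PATCH 23 and reusing `UppercaseAdapterFactory` from these
tests (the test name itself is hypothetical):

    #[test]
    fn factory_roundtrip_sketch() -> Result<()> {
        // Attach a concrete factory to a source, read it back as a trait
        // object, and downcast to confirm the concrete type survived.
        let factory = Arc::new(UppercaseAdapterFactory {});
        let source_with_adapter =
            ParquetSource::default().with_schema_adapter_factory(factory)?;
        let retrieved = source_with_adapter.schema_adapter_factory().unwrap();
        assert!(retrieved
            .as_any()
            .downcast_ref::<UppercaseAdapterFactory>()
            .is_some());
        Ok(())
    }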
.../core/tests/parquet/schema_adapter.rs | 271 +----------------- 1 file changed, 1 insertion(+), 270 deletions(-) diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs index 73d56ee2035d..a48445cb40fc 100644 --- a/datafusion/core/tests/parquet/schema_adapter.rs +++ b/datafusion/core/tests/parquet/schema_adapter.rs @@ -46,8 +46,7 @@ use parquet::arrow::ArrowWriter; #[cfg(feature = "parquet")] use datafusion::datasource::physical_plan::ParquetSource; use datafusion::datasource::physical_plan::{ - ArrowSource, CsvSource, FileOpener, FileScanConfig, FileScanConfigBuilder, - FileSource, JsonSource, + ArrowSource, CsvSource, FileScanConfigBuilder, FileSource, JsonSource, }; use datafusion::datasource::source::DataSourceExec; use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; @@ -455,34 +454,6 @@ impl SchemaAdapter for UppercaseAdapter { } } -#[derive(Debug)] -struct TestSchemaMapping { - output_schema: SchemaRef, - projection: Vec, -} - -impl SchemaMapper for TestSchemaMapping { - fn map_batch(&self, batch: RecordBatch) -> Result { - let columns = self - .projection - .iter() - .map(|&i| batch.column(i).clone()) - .collect::>(); - Ok(RecordBatch::try_new(self.output_schema.clone(), columns)?) - } - - fn map_column_statistics( - &self, - stats: &[ColumnStatistics], - ) -> Result> { - Ok(self - .projection - .iter() - .map(|&i| stats.get(i).cloned().unwrap_or_default()) - .collect()) - } -} - impl UppercaseAdapter { fn output_schema(&self) -> SchemaRef { let fields: Vec = self @@ -745,243 +716,3 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { Ok(()) } - -// Helper function to test From for Arc implementations -fn test_from_impl> + Default>(expected_file_type: &str) { - let source = T::default(); - let file_source: Arc = source.into(); - assert_eq!(file_source.file_type(), expected_file_type); -} - -#[test] -fn test_from_implementations() { - // Test From implementation for various sources - test_from_impl::("arrow"); - - #[cfg(feature = "parquet")] - test_from_impl::("parquet"); - - test_from_impl::("csv"); - - test_from_impl::("json"); -} - -/// A simple test schema adapter factory that doesn't modify the schema -#[derive(Debug, PartialEq)] -struct TestSchemaAdapterFactory {} - -impl SchemaAdapterFactory for TestSchemaAdapterFactory { - fn create( - &self, - projected_table_schema: SchemaRef, - _table_schema: SchemaRef, - ) -> Box { - Box::new(TestSchemaAdapter { - input_schema: projected_table_schema, - }) - } - - fn as_any(&self) -> &dyn Any { - self - } -} - -/// A test schema adapter that passes through data unmodified -#[derive(Debug)] -struct TestSchemaAdapter { - input_schema: SchemaRef, -} - -impl SchemaAdapter for TestSchemaAdapter { - fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option { - let field = self.input_schema.field(index); - file_schema - .fields() - .iter() - .position(|f| f.name() == field.name()) - } - - fn map_schema( - &self, - file_schema: &Schema, - ) -> Result<(Arc, Vec)> { - let mut projection = Vec::with_capacity(file_schema.fields().len()); - for (idx, file_field) in file_schema.fields().iter().enumerate() { - if self - .input_schema - .fields() - .iter() - .any(|f| f.name() == file_field.name()) - { - projection.push(idx); - } - } - - let mapper = TestSchemaMapping { - output_schema: Arc::clone(&self.input_schema), - projection: projection.clone(), - }; - - Ok((Arc::new(mapper), projection)) - } -} - -#[cfg(feature = "parquet")] -#[test] -fn 
test_schema_adapter_preservation() { - // Create a test schema - let schema = Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, true), - ])); - - // Create source with schema adapter factory - let source = ParquetSource::default(); - let factory = Arc::new(TestSchemaAdapterFactory {}); - let file_source = source.with_schema_adapter_factory(factory).unwrap(); - - // Create a FileScanConfig with the source - let config_builder = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - schema.clone(), - file_source.clone(), - ) - .with_file(PartitionedFile::new("test.parquet", 100)); - - let config = config_builder.build(); - - // Verify the schema adapter factory is present in the file source - let test_factory = TestSchemaAdapterFactory {}; - assert!(config.file_source().schema_adapter_factory().is_some()); - let _adapter_factory = config.file_source().schema_adapter_factory().unwrap(); - assert_eq!( - _adapter_factory - .as_any() - .downcast_ref::(), - Some(&test_factory) - ); -} - -/// A test source for testing schema adapters -#[derive(Debug, Clone)] -struct TestSource { - schema_adapter_factory: Option>, - metrics: ExecutionPlanMetricsSet, -} - -impl TestSource { - fn new() -> Self { - Self { - schema_adapter_factory: None, - metrics: ExecutionPlanMetricsSet::new(), - } - } -} - -impl FileSource for TestSource { - fn file_type(&self) -> &str { - "test" - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn create_file_opener( - &self, - _store: Arc, - _conf: &FileScanConfig, - _index: usize, - ) -> Arc { - unimplemented!("Not needed for this test") - } - - fn with_batch_size(&self, _batch_size: usize) -> Arc { - Arc::new(self.clone()) - } - - fn with_schema(&self, _schema: SchemaRef) -> Arc { - Arc::new(self.clone()) - } - - fn with_projection(&self, _projection: &FileScanConfig) -> Arc { - Arc::new(self.clone()) - } - - fn with_statistics(&self, _statistics: Statistics) -> Arc { - Arc::new(self.clone()) - } - - fn metrics(&self) -> &ExecutionPlanMetricsSet { - &self.metrics - } - - fn statistics(&self) -> Result { - Ok(Statistics::default()) - } - - fn with_schema_adapter_factory( - &self, - schema_adapter_factory: Arc, - ) -> Result> { - Ok(Arc::new(Self { - schema_adapter_factory: Some(schema_adapter_factory), - metrics: ExecutionPlanMetricsSet::new(), - })) - } - - fn schema_adapter_factory(&self) -> Option> { - self.schema_adapter_factory.clone() - } -} - -#[test] -fn test_schema_adapter() { - // This test verifies the functionality of the SchemaAdapter and SchemaAdapterFactory - // components used in DataFusion's file sources. - // - // The test specifically checks: - // 1. Creating and attaching a schema adapter factory to a file source - // 2. Creating a schema adapter using the factory - // 3. The schema adapter's ability to map column indices between a table schema and a file schema - // 4. The schema adapter's ability to create a projection that selects only the columns - // from the file schema that are present in the table schema - // - // Schema adapters are used when the schema of data in files doesn't exactly match - // the schema expected by the query engine, allowing for field mapping and data transformation. 
- - // Create a test schema - let table_schema = Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, true), - ])); - - // Create a file schema - let file_schema = Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, true), - Field::new("extra", DataType::Int64, true), - ]); - - // Create a TestSource - let source = TestSource::new(); - assert!(source.schema_adapter_factory().is_none()); - - // Add a schema adapter factory - let factory = Arc::new(TestSchemaAdapterFactory {}); - let source_with_adapter = source.with_schema_adapter_factory(factory).unwrap(); - assert!(source_with_adapter.schema_adapter_factory().is_some()); - - // Create a schema adapter - let adapter_factory = source_with_adapter.schema_adapter_factory().unwrap(); - let adapter = - adapter_factory.create(Arc::clone(&table_schema), Arc::clone(&table_schema)); - - // Test mapping column index - assert_eq!(adapter.map_column_index(0, &file_schema), Some(0)); - assert_eq!(adapter.map_column_index(1, &file_schema), Some(1)); - - // Test creating schema mapper - let (_mapper, projection) = adapter.map_schema(&file_schema).unwrap(); - assert_eq!(projection, vec![0, 1]); -} From d116e0203e21eef20085c3a440612dcb9560c555 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 23 Jul 2025 08:50:14 +0800 Subject: [PATCH 31/41] refactor(schema_adapter): remove unused import for ExecutionPlanMetricsSet --- datafusion/core/tests/parquet/schema_adapter.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs index a48445cb40fc..4e0e19810103 100644 --- a/datafusion/core/tests/parquet/schema_adapter.rs +++ b/datafusion/core/tests/parquet/schema_adapter.rs @@ -49,8 +49,7 @@ use datafusion::datasource::physical_plan::{ ArrowSource, CsvSource, FileScanConfigBuilder, FileSource, JsonSource, }; use datafusion::datasource::source::DataSourceExec; -use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; -use datafusion::physical_plan::{ExecutionPlan, Statistics}; +use datafusion::physical_plan::ExecutionPlan; use datafusion_datasource::PartitionedFile; async fn write_parquet(batch: RecordBatch, store: Arc, path: &str) { From 494de16070420f76cef2327559a9f2329d84df53 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 25 Jul 2025 08:59:56 +0800 Subject: [PATCH 32/41] Remove unused `as_any` method from schema adapter implementations --- datafusion/core/src/datasource/listing/table.rs | 8 -------- datafusion/core/src/datasource/mod.rs | 6 +----- datafusion/core/tests/parquet/schema_adapter.rs | 14 -------------- datafusion/datasource-parquet/src/opener.rs | 4 ---- .../tests/apply_schema_adapter_tests.rs | 4 ---- datafusion/datasource/src/schema_adapter.rs | 8 +------- 6 files changed, 2 insertions(+), 42 deletions(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 4a7f6bc7ab12..121ab46730b5 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -2912,10 +2912,6 @@ mod tests { error_type: self.error_type, }) } - - fn as_any(&self) -> &dyn Any { - self - } } #[derive(Debug)] @@ -2964,10 +2960,6 @@ mod tests { schema: projected_table_schema, }) } - - fn as_any(&self) -> &dyn Any { - self - } } #[derive(Debug)] diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 
1a13d9c39958..94d651ddadd5 100644
--- a/datafusion/core/src/datasource/mod.rs
+++ b/datafusion/core/src/datasource/mod.rs
@@ -71,7 +71,7 @@ mod tests {
     };
     use datafusion_datasource_parquet::source::ParquetSource;
     use datafusion_physical_plan::collect;
-    use std::{any::Any, fs, sync::Arc};
+    use std::{fs, sync::Arc};
     use tempfile::TempDir;
 
     #[tokio::test]
@@ -214,10 +214,6 @@ mod tests {
             table_schema: projected_table_schema,
         })
     }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
 }
 
 struct TestSchemaAdapter {
diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs
index 4e0e19810103..d5a7fc505e94 100644
--- a/datafusion/core/tests/parquet/schema_adapter.rs
+++ b/datafusion/core/tests/parquet/schema_adapter.rs
@@ -15,7 +15,6 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::any::Any;
 use std::sync::Arc;
 
 use arrow::array::{record_batch, RecordBatch, RecordBatchOptions};
@@ -43,15 +42,6 @@ use itertools::Itertools;
 use object_store::{memory::InMemory, path::Path, ObjectStore};
 use parquet::arrow::ArrowWriter;
 
-#[cfg(feature = "parquet")]
-use datafusion::datasource::physical_plan::ParquetSource;
-use datafusion::datasource::physical_plan::{
-    ArrowSource, CsvSource, FileScanConfigBuilder, FileSource, JsonSource,
-};
-use datafusion::datasource::source::DataSourceExec;
-use datafusion::physical_plan::ExecutionPlan;
-use datafusion_datasource::PartitionedFile;
-
 async fn write_parquet(batch: RecordBatch, store: Arc<dyn ObjectStore>, path: &str) {
     let mut out = BytesMut::new().writer();
     {
@@ -76,10 +66,6 @@ impl SchemaAdapterFactory for CustomSchemaAdapterFactory {
             logical_file_schema: projected_table_schema,
         })
     }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
 }
 
 #[derive(Debug)]
diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs
index a809e46fe979..7c208d1426ac 100644
--- a/datafusion/datasource-parquet/src/opener.rs
+++ b/datafusion/datasource-parquet/src/opener.rs
@@ -1213,10 +1213,6 @@ mod test {
     ) -> Box<dyn SchemaAdapter> {
         Box::new(CustomSchemaAdapter)
     }
-
-    fn as_any(&self) -> &dyn std::any::Any {
-        self
-    }
 }
 
 // Test that if no expression rewriter is provided we use a schema adapter to adapt the data to the expression
diff --git a/datafusion/datasource-parquet/tests/apply_schema_adapter_tests.rs b/datafusion/datasource-parquet/tests/apply_schema_adapter_tests.rs
index e15393e1fb3a..e9288a5f80f6 100644
--- a/datafusion/datasource-parquet/tests/apply_schema_adapter_tests.rs
+++ b/datafusion/datasource-parquet/tests/apply_schema_adapter_tests.rs
@@ -47,10 +47,6 @@ mod parquet_adapter_tests {
             prefix: self.prefix.clone(),
         })
     }
-
-    fn as_any(&self) -> &dyn std::any::Any {
-        self
-    }
 }
 
 /// A test schema adapter that adds prefix to column names
diff --git a/datafusion/datasource/src/schema_adapter.rs b/datafusion/datasource/src/schema_adapter.rs
index 6e959878928b..5e743a3f0c23 100644
--- a/datafusion/datasource/src/schema_adapter.rs
+++ b/datafusion/datasource/src/schema_adapter.rs
@@ -29,7 +29,7 @@ use datafusion_common::{
     nested_struct::{cast_column, validate_struct_compatibility},
     plan_err, ColumnStatistics,
 };
-use std::{any::Any, fmt::Debug, sync::Arc};
+use std::{fmt::Debug, sync::Arc};
 
 /// Function used by [`SchemaMapping`] to adapt a column from the file schema to
 /// the table schema.
pub type CastColumnFn = @@ -68,8 +68,6 @@ pub trait SchemaAdapterFactory: Debug + Send + Sync + 'static { ) -> Box { self.create(Arc::clone(&projected_table_schema), projected_table_schema) } - /// Give us access to Any so callers can downcast. - fn as_any(&self) -> &dyn Any; } /// Creates [`SchemaMapper`]s to map file-level [`RecordBatch`]es to a table @@ -234,10 +232,6 @@ impl SchemaAdapterFactory for DefaultSchemaAdapterFactory { projected_table_schema, }) } - - fn as_any(&self) -> &dyn Any { - self - } } /// This SchemaAdapter requires both the table schema and the projected table From a362bcf848f1c24506a82326a7d5669311a42730 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 25 Jul 2025 09:42:23 +0800 Subject: [PATCH 33/41] Refactor schema adapter tests to remove unused `as_any` method and improve type checking --- .../core/tests/parquet/schema_adapter.rs | 39 ++++++------------- 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs index d5a7fc505e94..21f7b7bc0996 100644 --- a/datafusion/core/tests/parquet/schema_adapter.rs +++ b/datafusion/core/tests/parquet/schema_adapter.rs @@ -23,14 +23,20 @@ use arrow_schema::{DataType, Field, FieldRef, Schema, SchemaRef}; use bytes::{BufMut, BytesMut}; use datafusion::assert_batches_eq; use datafusion::common::Result; +use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::listing::{ListingTable, ListingTableConfig}; +use datafusion::datasource::physical_plan::{ + ArrowSource, CsvSource, FileSource, JsonSource, ParquetSource, +}; use datafusion::prelude::{SessionConfig, SessionContext}; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::DataFusionError; use datafusion_common::{ColumnStatistics, ScalarValue}; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::schema_adapter::{ DefaultSchemaAdapterFactory, SchemaAdapter, SchemaAdapterFactory, SchemaMapper, }; +use datafusion_datasource::source::DataSourceExec; use datafusion_datasource::ListingTableUrl; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_physical_expr::expressions::{self, Column}; @@ -38,6 +44,7 @@ use datafusion_physical_expr::schema_rewriter::{ DefaultPhysicalExprAdapterFactory, PhysicalExprAdapter, PhysicalExprAdapterFactory, }; use datafusion_physical_expr::{DefaultPhysicalExprAdapter, PhysicalExpr}; +use datafusion_physical_plan::ExecutionPlan; use itertools::Itertools; use object_store::{memory::InMemory, path::Path, ObjectStore}; use parquet::arrow::ArrowWriter; @@ -390,10 +397,6 @@ impl SchemaAdapterFactory for UppercaseAdapterFactory { table_schema: projected_table_schema, }) } - - fn as_any(&self) -> &dyn Any { - self - } } /// Schema adapter that transforms column names to uppercase @@ -627,12 +630,7 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { assert!(source_with_adapter.schema_adapter_factory().is_some()); let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); - assert_eq!( - retrieved_factory - .as_any() - .downcast_ref::(), - Some(factory.as_ref()) - ); + assert_eq!(format!("{:?}", retrieved_factory.as_ref()), format!("{:?}", factory.as_ref())); } // Test ParquetSource @@ -649,12 +647,7 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { assert!(source_with_adapter.schema_adapter_factory().is_some()); let retrieved_factory = 
source_with_adapter.schema_adapter_factory().unwrap(); - assert_eq!( - retrieved_factory - .as_any() - .downcast_ref::(), - Some(factory.as_ref()) - ); + assert_eq!(format!("{:?}", retrieved_factory.as_ref()), format!("{:?}", factory.as_ref())); } // Test CsvSource @@ -670,12 +663,7 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { assert!(source_with_adapter.schema_adapter_factory().is_some()); let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); - assert_eq!( - retrieved_factory - .as_any() - .downcast_ref::(), - Some(factory.as_ref()) - ); + assert_eq!(format!("{:?}", retrieved_factory.as_ref()), format!("{:?}", factory.as_ref())); } // Test JsonSource @@ -691,12 +679,7 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { assert!(source_with_adapter.schema_adapter_factory().is_some()); let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); - assert_eq!( - retrieved_factory - .as_any() - .downcast_ref::(), - Some(factory.as_ref()) - ); + assert_eq!(format!("{:?}", retrieved_factory.as_ref()), format!("{:?}", factory.as_ref())); } Ok(()) From 0e8f15fa2b4be833b9078dd40ee191c9e1e0f332 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 25 Jul 2025 10:49:52 +0800 Subject: [PATCH 34/41] Fix fmt errors --- .../core/tests/parquet/schema_adapter.rs | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs index 21f7b7bc0996..9dbd509cab4b 100644 --- a/datafusion/core/tests/parquet/schema_adapter.rs +++ b/datafusion/core/tests/parquet/schema_adapter.rs @@ -630,7 +630,10 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { assert!(source_with_adapter.schema_adapter_factory().is_some()); let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); - assert_eq!(format!("{:?}", retrieved_factory.as_ref()), format!("{:?}", factory.as_ref())); + assert_eq!( + format!("{:?}", retrieved_factory.as_ref()), + format!("{:?}", factory.as_ref()) + ); } // Test ParquetSource @@ -647,7 +650,10 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { assert!(source_with_adapter.schema_adapter_factory().is_some()); let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); - assert_eq!(format!("{:?}", retrieved_factory.as_ref()), format!("{:?}", factory.as_ref())); + assert_eq!( + format!("{:?}", retrieved_factory.as_ref()), + format!("{:?}", factory.as_ref()) + ); } // Test CsvSource @@ -663,7 +669,10 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { assert!(source_with_adapter.schema_adapter_factory().is_some()); let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); - assert_eq!(format!("{:?}", retrieved_factory.as_ref()), format!("{:?}", factory.as_ref())); + assert_eq!( + format!("{:?}", retrieved_factory.as_ref()), + format!("{:?}", factory.as_ref()) + ); } // Test JsonSource @@ -679,7 +688,10 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { assert!(source_with_adapter.schema_adapter_factory().is_some()); let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); - assert_eq!(format!("{:?}", retrieved_factory.as_ref()), format!("{:?}", factory.as_ref())); + assert_eq!( + format!("{:?}", retrieved_factory.as_ref()), + format!("{:?}", factory.as_ref()) + ); } Ok(()) From e34b2bc86b03910f845fb60092db5ae7874862ba Mon Sep 17 00:00:00 2001 From: kosiew 
Date: Sun, 27 Jul 2025 15:49:18 +0800 Subject: [PATCH 35/41] refactor: move schema adapter integration tests move integration tests from parquet/schema_adapter.rs add new integration_tests/schema_adapter module add root driver schema_adapter_integration.rs --- .../integration_tests/schema_adapter/mod.rs | 1 + .../schema_adapter_integration_tests.rs | 317 +++++++++++++++++ .../core/tests/parquet/schema_adapter.rs | 318 ------------------ .../core/tests/schema_adapter_integration.rs | 21 ++ 4 files changed, 339 insertions(+), 318 deletions(-) create mode 100644 datafusion/core/tests/integration_tests/schema_adapter/mod.rs create mode 100644 datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs create mode 100644 datafusion/core/tests/schema_adapter_integration.rs diff --git a/datafusion/core/tests/integration_tests/schema_adapter/mod.rs b/datafusion/core/tests/integration_tests/schema_adapter/mod.rs new file mode 100644 index 000000000000..68ea355b5d9d --- /dev/null +++ b/datafusion/core/tests/integration_tests/schema_adapter/mod.rs @@ -0,0 +1 @@ +mod schema_adapter_integration_tests; diff --git a/datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs b/datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs new file mode 100644 index 000000000000..4904e30aceb1 --- /dev/null +++ b/datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs @@ -0,0 +1,317 @@ +// ---------------------------------------------------------------------- +// Tests migrated from schema_adaptation/schema_adapter_integration_tests.rs +// ---------------------------------------------------------------------- + +/// A schema adapter factory that transforms column names to uppercase +#[derive(Debug, PartialEq)] +struct UppercaseAdapterFactory {} + +impl SchemaAdapterFactory for UppercaseAdapterFactory { + fn create( + &self, + projected_table_schema: SchemaRef, + _table_schema: SchemaRef, + ) -> Box { + Box::new(UppercaseAdapter { + table_schema: projected_table_schema, + }) + } +} + +/// Schema adapter that transforms column names to uppercase +#[derive(Debug)] +struct UppercaseAdapter { + table_schema: SchemaRef, +} + +impl SchemaAdapter for UppercaseAdapter { + fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option { + let field = self.table_schema.field(index); + let uppercase_name = field.name().to_uppercase(); + file_schema + .fields() + .iter() + .position(|f| f.name().to_uppercase() == uppercase_name) + } + + fn map_schema( + &self, + file_schema: &Schema, + ) -> Result<(Arc, Vec)> { + let mut projection = Vec::new(); + + // Map each field in the table schema to the corresponding field in the file schema + for table_field in self.table_schema.fields() { + let uppercase_name = table_field.name().to_uppercase(); + if let Some(pos) = file_schema + .fields() + .iter() + .position(|f| f.name().to_uppercase() == uppercase_name) + { + projection.push(pos); + } + } + + let mapper = UppercaseSchemaMapper { + output_schema: self.output_schema(), + projection: projection.clone(), + }; + + Ok((Arc::new(mapper), projection)) + } +} + +impl UppercaseAdapter { + fn output_schema(&self) -> SchemaRef { + let fields: Vec = self + .table_schema + .fields() + .iter() + .map(|f| { + Field::new( + f.name().to_uppercase().as_str(), + f.data_type().clone(), + f.is_nullable(), + ) + }) + .collect(); + + Arc::new(Schema::new(fields)) + } +} + +#[derive(Debug)] +struct 
UppercaseSchemaMapper { + output_schema: SchemaRef, + projection: Vec, +} + +impl SchemaMapper for UppercaseSchemaMapper { + fn map_batch(&self, batch: RecordBatch) -> Result { + let columns = self + .projection + .iter() + .map(|&i| batch.column(i).clone()) + .collect::>(); + Ok(RecordBatch::try_new(self.output_schema.clone(), columns)?) + } + + fn map_column_statistics( + &self, + stats: &[ColumnStatistics], + ) -> Result> { + Ok(self + .projection + .iter() + .map(|&i| stats.get(i).cloned().unwrap_or_default()) + .collect()) + } +} + +#[cfg(feature = "parquet")] +#[tokio::test] +async fn test_parquet_integration_with_schema_adapter() -> Result<()> { + // Create test data + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, true), + ])), + vec![ + Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])), + Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c"])), + ], + )?; + + let store = Arc::new(InMemory::new()) as Arc; + let store_url = ObjectStoreUrl::parse("memory://").unwrap(); + let path = "test.parquet"; + write_parquet(batch.clone(), store.clone(), path).await; + + // Get the actual file size from the object store + let object_meta = store.head(&Path::from(path)).await?; + let file_size = object_meta.size; + + // Create a session context and register the object store + let ctx = SessionContext::new(); + ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); + + // Create a ParquetSource with the adapter factory + let file_source = ParquetSource::default() + .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?; + + // Create a table schema with uppercase column names + let table_schema = Arc::new(Schema::new(vec![ + Field::new("ID", DataType::Int32, false), + Field::new("NAME", DataType::Utf8, true), + ])); + + let config = FileScanConfigBuilder::new(store_url, table_schema.clone(), file_source) + .with_file(PartitionedFile::new(path, file_size)) + .build(); + + // Create a data source executor + let exec = DataSourceExec::from_data_source(config); + + // Collect results + let task_ctx = ctx.task_ctx(); + let stream = exec.execute(0, task_ctx)?; + let batches = datafusion::physical_plan::common::collect(stream).await?; + + // There should be one batch + assert_eq!(batches.len(), 1); + + // Verify the schema has the uppercase column names + let result_schema = batches[0].schema(); + assert_eq!(result_schema.field(0).name(), "ID"); + assert_eq!(result_schema.field(1).name(), "NAME"); + + Ok(()) +} + +#[cfg(feature = "parquet")] +#[tokio::test] +async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter( +) -> Result<()> { + // Create test data + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, true), + ])), + vec![ + Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])), + Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c"])), + ], + )?; + + let store = Arc::new(InMemory::new()) as Arc; + let store_url = ObjectStoreUrl::parse("memory://").unwrap(); + let path = "test.parquet"; + write_parquet(batch.clone(), store.clone(), path).await; + + // Get the actual file size from the object store + let object_meta = store.head(&Path::from(path)).await?; + let file_size = object_meta.size; + + // Create a session context and register the object store + let ctx = SessionContext::new(); + ctx.register_object_store(store_url.as_ref(), 
Arc::clone(&store)); + + // Create a ParquetSource with the adapter factory + let file_source = ParquetSource::default() + .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?; + + let config = FileScanConfigBuilder::new(store_url, batch.schema(), file_source) + .with_file(PartitionedFile::new(path, file_size)) + .build(); + + // Create a data source executor + let exec = DataSourceExec::from_data_source(config); + + // Collect results + let task_ctx = ctx.task_ctx(); + let stream = exec.execute(0, task_ctx)?; + let batches = datafusion::physical_plan::common::collect(stream).await?; + + // There should be one batch + assert_eq!(batches.len(), 1); + + // Verify the schema has the original column names (schema adapter not applied in DataSourceExec) + let result_schema = batches[0].schema(); + assert_eq!(result_schema.field(0).name(), "id"); + assert_eq!(result_schema.field(1).name(), "name"); + + Ok(()) +} + +#[tokio::test] +async fn test_multi_source_schema_adapter_reuse() -> Result<()> { + // This test verifies that the same schema adapter factory can be reused + // across different file source types. This is important for ensuring that: + // 1. The schema adapter factory interface works uniformly across all source types + // 2. The factory can be shared and cloned efficiently using Arc + // 3. Various data source implementations correctly implement the schema adapter factory pattern + + // Create a test factory + let factory = Arc::new(UppercaseAdapterFactory {}); + + // Test ArrowSource + { + let source = ArrowSource::default(); + let source_with_adapter = source + .clone() + .with_schema_adapter_factory(factory.clone()) + .unwrap(); + + let base_source: Arc = source.into(); + assert!(base_source.schema_adapter_factory().is_none()); + assert!(source_with_adapter.schema_adapter_factory().is_some()); + + let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); + assert_eq!( + format!("{:?}", retrieved_factory.as_ref()), + format!("{:?}", factory.as_ref()) + ); + } + + // Test ParquetSource + #[cfg(feature = "parquet")] + { + let source = ParquetSource::default(); + let source_with_adapter = source + .clone() + .with_schema_adapter_factory(factory.clone()) + .unwrap(); + + let base_source: Arc = source.into(); + assert!(base_source.schema_adapter_factory().is_none()); + assert!(source_with_adapter.schema_adapter_factory().is_some()); + + let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); + assert_eq!( + format!("{:?}", retrieved_factory.as_ref()), + format!("{:?}", factory.as_ref()) + ); + } + + // Test CsvSource + { + let source = CsvSource::default(); + let source_with_adapter = source + .clone() + .with_schema_adapter_factory(factory.clone()) + .unwrap(); + + let base_source: Arc = source.into(); + assert!(base_source.schema_adapter_factory().is_none()); + assert!(source_with_adapter.schema_adapter_factory().is_some()); + + let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); + assert_eq!( + format!("{:?}", retrieved_factory.as_ref()), + format!("{:?}", factory.as_ref()) + ); + } + + // Test JsonSource + { + let source = JsonSource::default(); + let source_with_adapter = source + .clone() + .with_schema_adapter_factory(factory.clone()) + .unwrap(); + + let base_source: Arc = source.into(); + assert!(base_source.schema_adapter_factory().is_none()); + assert!(source_with_adapter.schema_adapter_factory().is_some()); + + let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); + 
assert_eq!( + format!("{:?}", retrieved_factory.as_ref()), + format!("{:?}", factory.as_ref()) + ); + } + + Ok(()) +} diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs index 9dbd509cab4b..4a30e1b812ce 100644 --- a/datafusion/core/tests/parquet/schema_adapter.rs +++ b/datafusion/core/tests/parquet/schema_adapter.rs @@ -378,321 +378,3 @@ async fn test_custom_schema_adapter_and_custom_expression_adapter() { ]; assert_batches_eq!(expected, &batches); } - -// ---------------------------------------------------------------------- -// Tests migrated from schema_adaptation/schema_adapter_integration_tests.rs -// ---------------------------------------------------------------------- - -/// A schema adapter factory that transforms column names to uppercase -#[derive(Debug, PartialEq)] -struct UppercaseAdapterFactory {} - -impl SchemaAdapterFactory for UppercaseAdapterFactory { - fn create( - &self, - projected_table_schema: SchemaRef, - _table_schema: SchemaRef, - ) -> Box { - Box::new(UppercaseAdapter { - table_schema: projected_table_schema, - }) - } -} - -/// Schema adapter that transforms column names to uppercase -#[derive(Debug)] -struct UppercaseAdapter { - table_schema: SchemaRef, -} - -impl SchemaAdapter for UppercaseAdapter { - fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option { - let field = self.table_schema.field(index); - let uppercase_name = field.name().to_uppercase(); - file_schema - .fields() - .iter() - .position(|f| f.name().to_uppercase() == uppercase_name) - } - - fn map_schema( - &self, - file_schema: &Schema, - ) -> Result<(Arc, Vec)> { - let mut projection = Vec::new(); - - // Map each field in the table schema to the corresponding field in the file schema - for table_field in self.table_schema.fields() { - let uppercase_name = table_field.name().to_uppercase(); - if let Some(pos) = file_schema - .fields() - .iter() - .position(|f| f.name().to_uppercase() == uppercase_name) - { - projection.push(pos); - } - } - - let mapper = UppercaseSchemaMapper { - output_schema: self.output_schema(), - projection: projection.clone(), - }; - - Ok((Arc::new(mapper), projection)) - } -} - -impl UppercaseAdapter { - fn output_schema(&self) -> SchemaRef { - let fields: Vec = self - .table_schema - .fields() - .iter() - .map(|f| { - Field::new( - f.name().to_uppercase().as_str(), - f.data_type().clone(), - f.is_nullable(), - ) - }) - .collect(); - - Arc::new(Schema::new(fields)) - } -} - -#[derive(Debug)] -struct UppercaseSchemaMapper { - output_schema: SchemaRef, - projection: Vec, -} - -impl SchemaMapper for UppercaseSchemaMapper { - fn map_batch(&self, batch: RecordBatch) -> Result { - let columns = self - .projection - .iter() - .map(|&i| batch.column(i).clone()) - .collect::>(); - Ok(RecordBatch::try_new(self.output_schema.clone(), columns)?) 
-    }
-
-    fn map_column_statistics(
-        &self,
-        stats: &[ColumnStatistics],
-    ) -> Result<Vec<ColumnStatistics>> {
-        Ok(self
-            .projection
-            .iter()
-            .map(|&i| stats.get(i).cloned().unwrap_or_default())
-            .collect())
-    }
-}
-
-#[cfg(feature = "parquet")]
-#[tokio::test]
-async fn test_parquet_integration_with_schema_adapter() -> Result<()> {
-    // Create test data
-    let batch = RecordBatch::try_new(
-        Arc::new(Schema::new(vec![
-            Field::new("id", DataType::Int32, false),
-            Field::new("name", DataType::Utf8, true),
-        ])),
-        vec![
-            Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])),
-            Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c"])),
-        ],
-    )?;
-
-    let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
-    let store_url = ObjectStoreUrl::parse("memory://").unwrap();
-    let path = "test.parquet";
-    write_parquet(batch.clone(), store.clone(), path).await;
-
-    // Get the actual file size from the object store
-    let object_meta = store.head(&Path::from(path)).await?;
-    let file_size = object_meta.size;
-
-    // Create a session context and register the object store
-    let ctx = SessionContext::new();
-    ctx.register_object_store(store_url.as_ref(), Arc::clone(&store));
-
-    // Create a ParquetSource with the adapter factory
-    let file_source = ParquetSource::default()
-        .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?;
-
-    // Create a table schema with uppercase column names
-    let table_schema = Arc::new(Schema::new(vec![
-        Field::new("ID", DataType::Int32, false),
-        Field::new("NAME", DataType::Utf8, true),
-    ]));
-
-    let config = FileScanConfigBuilder::new(store_url, table_schema.clone(), file_source)
-        .with_file(PartitionedFile::new(path, file_size))
-        .build();
-
-    // Create a data source executor
-    let exec = DataSourceExec::from_data_source(config);
-
-    // Collect results
-    let task_ctx = ctx.task_ctx();
-    let stream = exec.execute(0, task_ctx)?;
-    let batches = datafusion::physical_plan::common::collect(stream).await?;
-
-    // There should be one batch
-    assert_eq!(batches.len(), 1);
-
-    // Verify the schema has the uppercase column names
-    let result_schema = batches[0].schema();
-    assert_eq!(result_schema.field(0).name(), "ID");
-    assert_eq!(result_schema.field(1).name(), "NAME");
-
-    Ok(())
-}
-
-#[cfg(feature = "parquet")]
-#[tokio::test]
-async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter(
-) -> Result<()> {
-    // Create test data
-    let batch = RecordBatch::try_new(
-        Arc::new(Schema::new(vec![
-            Field::new("id", DataType::Int32, false),
-            Field::new("name", DataType::Utf8, true),
-        ])),
-        vec![
-            Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])),
-            Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c"])),
-        ],
-    )?;
-
-    let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
-    let store_url = ObjectStoreUrl::parse("memory://").unwrap();
-    let path = "test.parquet";
-    write_parquet(batch.clone(), store.clone(), path).await;
-
-    // Get the actual file size from the object store
-    let object_meta = store.head(&Path::from(path)).await?;
-    let file_size = object_meta.size;
-
-    // Create a session context and register the object store
-    let ctx = SessionContext::new();
-    ctx.register_object_store(store_url.as_ref(), Arc::clone(&store));
-
-    // Create a ParquetSource with the adapter factory
-    let file_source = ParquetSource::default()
-        .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?;
-
-    let config = FileScanConfigBuilder::new(store_url, batch.schema(), file_source)
-        .with_file(PartitionedFile::new(path, file_size))
-        .build();
-
-    // Create a data source executor
-    let exec = DataSourceExec::from_data_source(config);
-
-    // Collect results
-    let task_ctx = ctx.task_ctx();
-    let stream = exec.execute(0, task_ctx)?;
-    let batches = datafusion::physical_plan::common::collect(stream).await?;
-
-    // There should be one batch
-    assert_eq!(batches.len(), 1);
-
-    // Verify the schema has the original column names (schema adapter not applied in DataSourceExec)
-    let result_schema = batches[0].schema();
-    assert_eq!(result_schema.field(0).name(), "id");
-    assert_eq!(result_schema.field(1).name(), "name");
-
-    Ok(())
-}
-
-#[tokio::test]
-async fn test_multi_source_schema_adapter_reuse() -> Result<()> {
-    // This test verifies that the same schema adapter factory can be reused
-    // across different file source types. This is important for ensuring that:
-    // 1. The schema adapter factory interface works uniformly across all source types
-    // 2. The factory can be shared and cloned efficiently using Arc
-    // 3. Various data source implementations correctly implement the schema adapter factory pattern
-
-    // Create a test factory
-    let factory = Arc::new(UppercaseAdapterFactory {});
-
-    // Test ArrowSource
-    {
-        let source = ArrowSource::default();
-        let source_with_adapter = source
-            .clone()
-            .with_schema_adapter_factory(factory.clone())
-            .unwrap();
-
-        let base_source: Arc<dyn FileSource> = source.into();
-        assert!(base_source.schema_adapter_factory().is_none());
-        assert!(source_with_adapter.schema_adapter_factory().is_some());
-
-        let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap();
-        assert_eq!(
-            format!("{:?}", retrieved_factory.as_ref()),
-            format!("{:?}", factory.as_ref())
-        );
-    }
-
-    // Test ParquetSource
-    #[cfg(feature = "parquet")]
-    {
-        let source = ParquetSource::default();
-        let source_with_adapter = source
-            .clone()
-            .with_schema_adapter_factory(factory.clone())
-            .unwrap();
-
-        let base_source: Arc<dyn FileSource> = source.into();
-        assert!(base_source.schema_adapter_factory().is_none());
-        assert!(source_with_adapter.schema_adapter_factory().is_some());
-
-        let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap();
-        assert_eq!(
-            format!("{:?}", retrieved_factory.as_ref()),
-            format!("{:?}", factory.as_ref())
-        );
-    }
-
-    // Test CsvSource
-    {
-        let source = CsvSource::default();
-        let source_with_adapter = source
-            .clone()
-            .with_schema_adapter_factory(factory.clone())
-            .unwrap();
-
-        let base_source: Arc<dyn FileSource> = source.into();
-        assert!(base_source.schema_adapter_factory().is_none());
-        assert!(source_with_adapter.schema_adapter_factory().is_some());
-
-        let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap();
-        assert_eq!(
-            format!("{:?}", retrieved_factory.as_ref()),
-            format!("{:?}", factory.as_ref())
-        );
-    }
-
-    // Test JsonSource
-    {
-        let source = JsonSource::default();
-        let source_with_adapter = source
-            .clone()
-            .with_schema_adapter_factory(factory.clone())
-            .unwrap();
-
-        let base_source: Arc<dyn FileSource> = source.into();
-        assert!(base_source.schema_adapter_factory().is_none());
-        assert!(source_with_adapter.schema_adapter_factory().is_some());
-
-        let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap();
-        assert_eq!(
-            format!("{:?}", retrieved_factory.as_ref()),
-            format!("{:?}", factory.as_ref())
-        );
-    }
-
-    Ok(())
-}
diff --git a/datafusion/core/tests/schema_adapter_integration.rs b/datafusion/core/tests/schema_adapter_integration.rs
new file mode 100644
index 000000000000..0ddfa059eb9a
--- /dev/null
+++ b/datafusion/core/tests/schema_adapter_integration.rs
@@ -0,0 +1,21 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// Run all tests that are found in the `integration_tests/schema_adapter` directory
+mod integration_tests {
+    pub mod schema_adapter;
+}

From ad5e92b5a31aab5560a5236b68565d21d4de384a Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Sun, 27 Jul 2025 16:17:37 +0800
Subject: [PATCH 36/41] chore: update license header in schema adapter integration tests

---
 .../schema_adapter_integration_tests.rs       | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs b/datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs
index 4904e30aceb1..abe0d46bb4c4 100644
--- a/datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs
+++ b/datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs
@@ -1,6 +1,19 @@
-// ----------------------------------------------------------------------
-// Tests migrated from schema_adaptation/schema_adapter_integration_tests.rs
-// ----------------------------------------------------------------------
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
 
 /// A schema adapter factory that transforms column names to uppercase
 #[derive(Debug, PartialEq)]

From 1fa04c240c44924d13ae8f2651786d575cc2d244 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Sun, 27 Jul 2025 17:03:26 +0800
Subject: [PATCH 37/41] Add integration tests for schema adapter

- Moved existing schema adapter integration tests from
  `schema_adaptation/schema_adapter_integration_tests.rs` to a new module in
  `datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs`.
- Created a new file `schema_adapter.rs` in the integration tests folder to
  run and organize the tests under the schema adapter directory.
- The tests validate a schema adapter that uppercases column names,
  exercising it across the different file source types.
- Organized the tests for future maintainability and a clearer directory
  structure.

---
 .../schema_adapter_integration_tests.rs       | 52 +++++++++++++------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs b/datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs
index abe0d46bb4c4..ce5363b0e16f 100644
--- a/datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs
+++ b/datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs
@@ -1,19 +1,39 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
+// ----------------------------------------------------------------------
+// Tests migrated from schema_adaptation/schema_adapter_integration_tests.rs
+// ----------------------------------------------------------------------
+
+use std::sync::Arc;
+
+use arrow::array::RecordBatch;
+use arrow_schema::{DataType, Field, Schema, SchemaRef};
+use bytes::{BufMut, BytesMut};
+use datafusion::common::Result;
+use datafusion::datasource::listing::PartitionedFile;
+use datafusion::datasource::physical_plan::{
+    ArrowSource, CsvSource, FileSource, JsonSource, ParquetSource,
+};
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion::prelude::SessionContext;
+use datafusion_common::ColumnStatistics;
+use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
+use datafusion_datasource::schema_adapter::{
+    SchemaAdapter, SchemaAdapterFactory, SchemaMapper,
+};
+use datafusion_datasource::source::DataSourceExec;
+use datafusion_execution::object_store::ObjectStoreUrl;
+use object_store::{memory::InMemory, path::Path, ObjectStore};
+use parquet::arrow::ArrowWriter;
+
+async fn write_parquet(batch: RecordBatch, store: Arc<dyn ObjectStore>, path: &str) {
+    let mut out = BytesMut::new().writer();
+    {
+        let mut writer = ArrowWriter::try_new(&mut out, batch.schema(), None).unwrap();
+        writer.write(&batch).unwrap();
+        writer.finish().unwrap();
+    }
+    let data = out.into_inner().freeze();
+    store.put(&Path::from(path), data.into()).await.unwrap();
+}
 
 /// A schema adapter factory that transforms column names to uppercase
 #[derive(Debug, PartialEq)]

From 85e29beae3c1a2db3e9fc1eefa0e3c9b2dc26f76 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Sun, 27 Jul 2025 17:11:23 +0800
Subject: [PATCH 38/41] Add Apache License header to schema adapter integration tests file

---
 .../schema_adapter_integration_tests.rs       | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs b/datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs
index ce5363b0e16f..c3c92a9028d6 100644
--- a/datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs
+++ b/datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs
@@ -1,6 +1,19 @@
-// ----------------------------------------------------------------------
-// Tests migrated from schema_adaptation/schema_adapter_integration_tests.rs
-// ----------------------------------------------------------------------
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
 
 use std::sync::Arc;

From 67213f4882a982efd7afee11ee910619320a1e32 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Sun, 27 Jul 2025 17:16:05 +0800
Subject: [PATCH 39/41] chore: add Apache License header to schema adapter integration tests file

---
 .../integration_tests/schema_adapter/mod.rs   | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/datafusion/core/tests/integration_tests/schema_adapter/mod.rs b/datafusion/core/tests/integration_tests/schema_adapter/mod.rs
index 68ea355b5d9d..2f81a43f4736 100644
--- a/datafusion/core/tests/integration_tests/schema_adapter/mod.rs
+++ b/datafusion/core/tests/integration_tests/schema_adapter/mod.rs
@@ -1 +1,18 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
 mod schema_adapter_integration_tests;

From 112f8b6418abbbe32e4d9869c51c717a0835fabc Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Sun, 27 Jul 2025 06:22:15 -0400
Subject: [PATCH 40/41] Clippy

---
 datafusion/core/tests/parquet/schema_adapter.rs | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs
index 4a30e1b812ce..f9a46f2e240f 100644
--- a/datafusion/core/tests/parquet/schema_adapter.rs
+++ b/datafusion/core/tests/parquet/schema_adapter.rs
@@ -23,20 +23,14 @@ use arrow_schema::{DataType, Field, FieldRef, Schema, SchemaRef};
 use bytes::{BufMut, BytesMut};
 use datafusion::assert_batches_eq;
 use datafusion::common::Result;
-use datafusion::datasource::listing::PartitionedFile;
 use datafusion::datasource::listing::{ListingTable, ListingTableConfig};
-use datafusion::datasource::physical_plan::{
-    ArrowSource, CsvSource, FileSource, JsonSource, ParquetSource,
-};
 use datafusion::prelude::{SessionConfig, SessionContext};
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
 use datafusion_common::DataFusionError;
 use datafusion_common::{ColumnStatistics, ScalarValue};
-use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
 use datafusion_datasource::schema_adapter::{
     DefaultSchemaAdapterFactory, SchemaAdapter, SchemaAdapterFactory, SchemaMapper,
 };
-use datafusion_datasource::source::DataSourceExec;
 use datafusion_datasource::ListingTableUrl;
 use datafusion_execution::object_store::ObjectStoreUrl;
 use datafusion_physical_expr::expressions::{self, Column};
@@ -44,7 +38,6 @@ use datafusion_physical_expr::schema_rewriter::{
     DefaultPhysicalExprAdapterFactory, PhysicalExprAdapter, PhysicalExprAdapterFactory,
 };
 use datafusion_physical_expr::{DefaultPhysicalExprAdapter, PhysicalExpr};
-use datafusion_physical_plan::ExecutionPlan;
 use itertools::Itertools;
 use object_store::{memory::InMemory, path::Path, ObjectStore};
 use parquet::arrow::ArrowWriter;

From 74d8a6d548005e8d00b74a43d512298b3dbbc7c9 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Sun, 27 Jul 2025 06:45:15 -0400
Subject: [PATCH 41/41] Move schema adapter tests to the `core_integration` binary

---
 datafusion/core/tests/core_integration.rs     |  3 +++
 .../schema_adapter/mod.rs                     |  0
 .../schema_adapter_integration_tests.rs       |  0
 .../core/tests/schema_adapter_integration.rs  | 21 -------------------
 4 files changed, 3 insertions(+), 21 deletions(-)
 rename datafusion/core/tests/{integration_tests => }/schema_adapter/mod.rs (100%)
 rename datafusion/core/tests/{integration_tests => }/schema_adapter/schema_adapter_integration_tests.rs (100%)
 delete mode 100644 datafusion/core/tests/schema_adapter_integration.rs

diff --git a/datafusion/core/tests/core_integration.rs b/datafusion/core/tests/core_integration.rs
index 250538b13370..e37a368f0771 100644
--- a/datafusion/core/tests/core_integration.rs
+++ b/datafusion/core/tests/core_integration.rs
@@ -45,6 +45,9 @@ mod optimizer;
 /// Run all tests that are found in the `physical_optimizer` directory
 mod physical_optimizer;
 
+/// Run all tests that are found in the `schema_adapter` directory
+mod schema_adapter;
+
 /// Run all tests that are found in the `serde` directory
 mod serde;

diff --git a/datafusion/core/tests/integration_tests/schema_adapter/mod.rs b/datafusion/core/tests/schema_adapter/mod.rs
similarity index 100%
rename from datafusion/core/tests/integration_tests/schema_adapter/mod.rs
rename to datafusion/core/tests/schema_adapter/mod.rs
diff --git a/datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs b/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs
similarity index 100%
rename from datafusion/core/tests/integration_tests/schema_adapter/schema_adapter_integration_tests.rs
rename to datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs
diff --git a/datafusion/core/tests/schema_adapter_integration.rs b/datafusion/core/tests/schema_adapter_integration.rs
deleted file mode 100644
index 0ddfa059eb9a..000000000000
--- a/datafusion/core/tests/schema_adapter_integration.rs
+++ /dev/null
@@ -1,21 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-/// Run all tests that are found in the `integration_tests/schema_adapter` directory
-mod integration_tests {
-    pub mod schema_adapter;
-}
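
Reviewer note: the mapping these tests exercise reduces to a small amount of arrow code. Below is a minimal standalone sketch, using only the `arrow` crate; the helper names `find_file_column` and `uppercase_batch` and the `main` harness are illustrative assumptions, not part of this patch series or of DataFusion's API. It mirrors what `UppercaseAdapter::map_column_index` and `UppercaseSchemaMapper::map_batch` do: resolve table columns against the file schema case-insensitively, then rebuild the batch under a schema with uppercased field names.

```rust
// Illustrative sketch only; `find_file_column`, `uppercase_batch`, and this
// harness are hypothetical names, not DataFusion APIs.
use std::sync::Arc;

use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::error::ArrowError;

/// Case-insensitive column lookup, mirroring `map_column_index` above.
fn find_file_column(table_field_name: &str, file_schema: &Schema) -> Option<usize> {
    file_schema
        .fields()
        .iter()
        .position(|f| f.name().eq_ignore_ascii_case(table_field_name))
}

/// Project the given columns and rename them to uppercase, mirroring
/// `map_batch`: only the schema changes; the column arrays are reused.
fn uppercase_batch(batch: &RecordBatch, projection: &[usize]) -> Result<RecordBatch, ArrowError> {
    let schema = batch.schema();
    let fields: Vec<Field> = projection
        .iter()
        .map(|&i| {
            let f = schema.field(i);
            Field::new(f.name().to_uppercase(), f.data_type().clone(), f.is_nullable())
        })
        .collect();
    let columns: Vec<ArrayRef> = projection.iter().map(|&i| batch.column(i).clone()).collect();
    RecordBatch::try_new(Arc::new(Schema::new(fields)), columns)
}

fn main() -> Result<(), ArrowError> {
    // Lowercase "file" schema, as written by the tests' write_parquet helper
    let batch = RecordBatch::try_new(
        Arc::new(Schema::new(vec![
            Field::new("id", DataType::Int32, false),
            Field::new("name", DataType::Utf8, true),
        ])),
        vec![
            Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef,
            Arc::new(StringArray::from(vec!["a", "b", "c"])),
        ],
    )?;

    // Resolve the uppercase table column "ID" against the lowercase file schema
    assert_eq!(find_file_column("ID", batch.schema().as_ref()), Some(0));

    let mapped = uppercase_batch(&batch, &[0, 1])?;
    assert_eq!(mapped.schema().field(0).name(), "ID");
    assert_eq!(mapped.schema().field(1).name(), "NAME");
    Ok(())
}
```

In the tests themselves this logic sits behind DataFusion's `SchemaAdapter`/`SchemaMapper` traits, so the scan applies it rather than the caller; the sketch just isolates the schema/projection arithmetic being verified.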