16
16
// under the License.
17
17
18
18
use std:: collections:: HashSet ;
19
+ use std:: num:: NonZeroUsize ;
19
20
use std:: sync:: Arc ;
20
21
21
22
use datafusion:: error:: Result as DFResult ;
22
23
use datafusion:: physical_plan:: expressions:: Column ;
23
24
use datafusion:: physical_plan:: repartition:: RepartitionExec ;
24
25
use datafusion:: physical_plan:: { ExecutionPlan , Partitioning } ;
25
- use iceberg:: spec:: { SchemaRef , TableMetadata , TableMetadataRef , Transform } ;
26
+ use iceberg:: spec:: { TableMetadata , TableMetadataRef , Transform } ;
26
27
27
28
/// Creates an Iceberg-aware repartition execution plan that optimizes data distribution
28
29
/// for parallel processing while respecting Iceberg table partitioning semantics.
@@ -57,8 +58,7 @@ use iceberg::spec::{SchemaRef, TableMetadata, TableMetadataRef, Transform};
57
58
///
58
59
/// # Arguments
59
60
///
60
- /// * `input` - The input execution plan providing data to be repartitioned
61
- /// * `table_schema` - The Iceberg table schema used to resolve column references
61
+ /// * `input` - The input execution plan providing data to be repartitioned (should already be projected to match table schema)
62
62
/// * `table_metadata` - The Iceberg table metadata containing partition spec and sort order
63
63
/// * `target_partitions` - Target number of partitions for parallel processing (must be > 0)
64
64
///
@@ -72,25 +72,17 @@ use iceberg::spec::{SchemaRef, TableMetadata, TableMetadataRef, Transform};
72
72
/// ```ignore
73
73
/// let repartitioned_plan = repartition(
74
74
/// input_plan,
75
- /// table.schema_ref(),
76
75
/// table.metadata_ref(),
77
76
/// 4, // Explicit partition count
78
77
/// )?;
79
78
/// ```
80
- pub fn repartition (
79
+ pub ( crate ) fn repartition (
81
80
input : Arc < dyn ExecutionPlan > ,
82
- table_schema : SchemaRef ,
83
81
table_metadata : TableMetadataRef ,
84
- target_partitions : usize ,
82
+ target_partitions : NonZeroUsize ,
85
83
) -> DFResult < Arc < dyn ExecutionPlan > > {
86
- if target_partitions == 0 {
87
- return Err ( datafusion:: error:: DataFusionError :: Plan (
88
- "repartition requires target_partitions > 0" . to_string ( ) ,
89
- ) ) ;
90
- }
91
-
92
84
let partitioning_strategy =
93
- determine_partitioning_strategy ( & input, & table_schema , & table_metadata, target_partitions) ?;
85
+ determine_partitioning_strategy ( & input, & table_metadata, target_partitions) ?;
94
86
95
87
if !needs_repartitioning ( & input, & partitioning_strategy) {
96
88
return Ok ( input) ;
@@ -166,12 +158,11 @@ fn same_columns(
166
158
/// falls back to round-robin batch partitioning for even load distribution.
167
159
fn determine_partitioning_strategy (
168
160
input : & Arc < dyn ExecutionPlan > ,
169
- table_schema : & SchemaRef ,
170
161
table_metadata : & TableMetadata ,
171
- target_partitions : usize ,
162
+ target_partitions : NonZeroUsize ,
172
163
) -> DFResult < Partitioning > {
173
164
let partition_spec = table_metadata. default_partition_spec ( ) ;
174
- let sort_order = table_metadata. default_sort_order ( ) ;
165
+ let table_schema = table_metadata. current_schema ( ) ;
175
166
176
167
let names_iter: Box < dyn Iterator < Item = & str > > = {
177
168
// Partition identity columns
@@ -194,40 +185,35 @@ fn determine_partitioning_strategy(
194
185
None
195
186
}
196
187
} ) ;
197
- // Bucket columns from sort order
198
- let bucket_names_sort = sort_order. fields . iter ( ) . filter_map ( |sf| {
199
- if let Transform :: Bucket ( _) = sf. transform {
200
- table_schema
201
- . field_by_id ( sf. source_id )
202
- . map ( |field| field. name . as_str ( ) )
203
- } else {
204
- None
205
- }
206
- } ) ;
207
- Box :: new ( part_names. chain ( bucket_names_part) . chain ( bucket_names_sort) )
188
+ Box :: new ( part_names. chain ( bucket_names_part) )
208
189
} ;
209
190
210
191
// Order: partitions first, then buckets
211
192
// Deduplicate while preserving order
212
193
let input_schema = input. schema ( ) ;
213
- let mut seen: HashSet < & str > = HashSet :: new ( ) ;
194
+ let mut seen = HashSet :: new ( ) ;
214
195
let hash_exprs: Vec < Arc < dyn datafusion:: physical_expr:: PhysicalExpr > > = names_iter
215
196
. filter ( |name| seen. insert ( * name) )
216
197
. map ( |name| {
217
198
let idx = input_schema
218
199
. index_of ( name)
219
- . map_err ( |e| datafusion:: error:: DataFusionError :: Plan ( e. to_string ( ) ) ) ?;
200
+ . map_err ( |e| {
201
+ datafusion:: error:: DataFusionError :: Plan ( format ! (
202
+ "Column '{}' not found in input schema. Ensure projection happens before repartitioning. Error: {}" ,
203
+ name, e
204
+ ) )
205
+ } ) ?;
220
206
Ok ( Arc :: new ( Column :: new ( name, idx) )
221
207
as Arc < dyn datafusion:: physical_expr:: PhysicalExpr > )
222
208
} )
223
209
. collect :: < DFResult < _ > > ( ) ?;
224
210
225
211
if !hash_exprs. is_empty ( ) {
226
- return Ok ( Partitioning :: Hash ( hash_exprs, target_partitions) ) ;
212
+ return Ok ( Partitioning :: Hash ( hash_exprs, target_partitions. get ( ) ) ) ;
227
213
}
228
214
229
215
// Fallback to round-robin for unpartitioned, non-bucketed tables, and range-only partitions
230
- Ok ( Partitioning :: RoundRobinBatch ( target_partitions) )
216
+ Ok ( Partitioning :: RoundRobinBatch ( target_partitions. get ( ) ) )
231
217
}
232
218
233
219
#[ cfg( test) ]
0 commit comments