Skip to content

Commit 60b8577

Browse files
committed
Update for new APIs
1 parent cdeac40 commit 60b8577

File tree

24 files changed

+826
-281
lines changed

24 files changed

+826
-281
lines changed

datafusion/common/src/config.rs

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -601,13 +601,6 @@ config_namespace! {
601601
/// default parquet writer setting
602602
pub statistics_enabled: Option<String>, transform = str::to_lowercase, default = Some("page".into())
603603

604-
/// (writing) Sets max statistics size for any column. If NULL, uses
605-
/// default parquet writer setting
606-
/// max_statistics_size is deprecated, currently it is not being used
607-
// TODO: remove once deprecated
608-
#[deprecated(since = "45.0.0", note = "Setting does not do anything")]
609-
pub max_statistics_size: Option<usize>, default = Some(4096)
610-
611604
/// (writing) Target maximum number of rows in each row group (defaults to 1M
612605
/// rows). Writing larger row groups requires more memory to write, but
613606
/// can get better compression and be faster to read.
@@ -619,9 +612,9 @@ config_namespace! {
619612
/// (writing) Sets column index truncate length
620613
pub column_index_truncate_length: Option<usize>, default = Some(64)
621614

622-
/// (writing) Sets statictics truncate length. If NULL, uses
615+
/// (writing) Sets statistics truncate length. If NULL, uses
623616
/// default parquet writer setting
624-
pub statistics_truncate_length: Option<usize>, default = None
617+
pub statistics_truncate_length: Option<usize>, default = Some(64)
625618

626619
/// (writing) Sets best effort maximum number of rows in data page
627620
pub data_page_row_count_limit: usize, default = 20_000
@@ -2058,13 +2051,6 @@ config_namespace_with_hashmap! {
20582051
/// Sets bloom filter number of distinct values. If NULL, uses
20592052
/// default parquet options
20602053
pub bloom_filter_ndv: Option<u64>, default = None
2061-
2062-
/// Sets max statistics size for the column path. If NULL, uses
2063-
/// default parquet options
2064-
/// max_statistics_size is deprecated, currently it is not being used
2065-
// TODO: remove once deprecated
2066-
#[deprecated(since = "45.0.0", note = "Setting does not do anything")]
2067-
pub max_statistics_size: Option<usize>, default = None
20682054
}
20692055
}
20702056

datafusion/common/src/file_options/parquet_writer.rs

Lines changed: 1 addition & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ use parquet::{
3636
metadata::KeyValue,
3737
properties::{
3838
EnabledStatistics, WriterProperties, WriterPropertiesBuilder, WriterVersion,
39-
DEFAULT_MAX_STATISTICS_SIZE, DEFAULT_STATISTICS_ENABLED,
39+
DEFAULT_STATISTICS_ENABLED,
4040
},
4141
},
4242
schema::types::ColumnPath,
@@ -161,16 +161,6 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
161161
builder =
162162
builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv);
163163
}
164-
165-
// max_statistics_size is deprecated, currently it is not being used
166-
// TODO: remove once deprecated
167-
#[allow(deprecated)]
168-
if let Some(max_statistics_size) = options.max_statistics_size {
169-
builder = {
170-
#[allow(deprecated)]
171-
builder.set_column_max_statistics_size(path, max_statistics_size)
172-
}
173-
}
174164
}
175165

176166
Ok(builder)
@@ -219,7 +209,6 @@ impl ParquetOptions {
219209
dictionary_enabled,
220210
dictionary_page_size_limit,
221211
statistics_enabled,
222-
max_statistics_size,
223212
max_row_group_size,
224213
created_by,
225214
column_index_truncate_length,
@@ -265,13 +254,6 @@ impl ParquetOptions {
265254
.set_data_page_row_count_limit(*data_page_row_count_limit)
266255
.set_bloom_filter_enabled(*bloom_filter_on_write);
267256

268-
builder = {
269-
#[allow(deprecated)]
270-
builder.set_max_statistics_size(
271-
max_statistics_size.unwrap_or(DEFAULT_MAX_STATISTICS_SIZE),
272-
)
273-
};
274-
275257
if let Some(bloom_filter_fpp) = bloom_filter_fpp {
276258
builder = builder.set_bloom_filter_fpp(*bloom_filter_fpp);
277259
};
@@ -464,12 +446,10 @@ mod tests {
464446
fn column_options_with_non_defaults(
465447
src_col_defaults: &ParquetOptions,
466448
) -> ParquetColumnOptions {
467-
#[allow(deprecated)] // max_statistics_size
468449
ParquetColumnOptions {
469450
compression: Some("zstd(22)".into()),
470451
dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v),
471452
statistics_enabled: Some("none".into()),
472-
max_statistics_size: Some(72),
473453
encoding: Some("RLE".into()),
474454
bloom_filter_enabled: Some(true),
475455
bloom_filter_fpp: Some(0.72),
@@ -494,7 +474,6 @@ mod tests {
494474
dictionary_enabled: Some(!defaults.dictionary_enabled.unwrap_or(false)),
495475
dictionary_page_size_limit: 42,
496476
statistics_enabled: Some("chunk".into()),
497-
max_statistics_size: Some(42),
498477
max_row_group_size: 42,
499478
created_by: "wordy".into(),
500479
column_index_truncate_length: Some(42),
@@ -552,7 +531,6 @@ mod tests {
552531
),
553532
bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp),
554533
bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv),
555-
max_statistics_size: Some(props.max_statistics_size(&col)),
556534
}
557535
}
558536

@@ -609,7 +587,6 @@ mod tests {
609587
compression: default_col_props.compression,
610588
dictionary_enabled: default_col_props.dictionary_enabled,
611589
statistics_enabled: default_col_props.statistics_enabled,
612-
max_statistics_size: default_col_props.max_statistics_size,
613590
bloom_filter_on_write: default_col_props
614591
.bloom_filter_enabled
615592
.unwrap_or_default(),

datafusion/common/src/scalar/mod.rs

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -891,11 +891,10 @@ fn dict_from_values<K: ArrowDictionaryKeyType>(
891891
.map(|index| {
892892
if values_array.is_valid(index) {
893893
let native_index = K::Native::from_usize(index).ok_or_else(|| {
894-
DataFusionError::Internal(format!(
895-
"Can not create index of type {} from value {}",
896-
K::DATA_TYPE,
897-
index
898-
))
894+
_internal_datafusion_err!(
895+
"Can not create index of type {} from value {index}",
896+
K::DATA_TYPE
897+
)
899898
})?;
900899
Ok(Some(native_index))
901900
} else {
@@ -2192,6 +2191,16 @@ impl ScalarValue {
21922191
}
21932192

21942193
let array: ArrayRef = match &data_type {
2194+
DataType::Decimal32(_precision, _scale) => {
2195+
return _not_impl_err!(
2196+
"Decimal32 not supported in ScalarValue::iter_to_array"
2197+
);
2198+
}
2199+
DataType::Decimal64(_precision, _scale) => {
2200+
return _not_impl_err!(
2201+
"Decimal64 not supported in ScalarValue::iter_to_array"
2202+
);
2203+
}
21952204
DataType::Decimal128(precision, scale) => {
21962205
let decimal_array =
21972206
ScalarValue::iter_to_decimal_array(scalars, *precision, *scale)?;

datafusion/common/src/types/native.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,10 @@ impl From<DataType> for NativeType {
407407
DataType::Union(union_fields, _) => {
408408
Union(LogicalUnionFields::from(&union_fields))
409409
}
410-
DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => Decimal(p, s),
410+
DataType::Decimal32(p, s)
411+
| DataType::Decimal64(p, s)
412+
| DataType::Decimal128(p, s)
413+
| DataType::Decimal256(p, s) => Decimal(p, s),
411414
DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())),
412415
DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(),
413416
DataType::RunEndEncoded(_, field) => field.data_type().clone().into(),

datafusion/core/tests/fuzz_cases/pruning.rs

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -319,14 +319,9 @@ async fn write_parquet_file(
319319
row_groups: Vec<Vec<String>>,
320320
) -> Bytes {
321321
let mut buf = BytesMut::new().writer();
322-
let mut props = WriterProperties::builder();
323-
if let Some(truncation_length) = truncation_length {
324-
props = {
325-
#[allow(deprecated)]
326-
props.set_max_statistics_size(truncation_length)
327-
}
328-
}
329-
props = props.set_statistics_enabled(EnabledStatistics::Chunk); // row group level
322+
let props = WriterProperties::builder()
323+
.set_statistics_enabled(EnabledStatistics::Chunk) // row group level
324+
.set_statistics_truncate_length(truncation_length);
330325
let props = props.build();
331326
{
332327
let mut writer =

datafusion/core/tests/parquet/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,11 +109,11 @@ struct ContextWithParquet {
109109

110110
/// The output of running one of the test cases
111111
struct TestOutput {
112-
/// The input string
112+
/// The input query SQL
113113
sql: String,
114114
/// Execution metrics for the Parquet Scan
115115
parquet_metrics: MetricsSet,
116-
/// number of rows in results
116+
/// number of actual rows in results
117117
result_rows: usize,
118118
/// the contents of the input, as a string
119119
pretty_input: String,

datafusion/core/tests/parquet/row_group_pruning.rs

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ struct RowGroupPruningTest {
3434
expected_files_pruned_by_statistics: Option<usize>,
3535
expected_row_group_matched_by_bloom_filter: Option<usize>,
3636
expected_row_group_pruned_by_bloom_filter: Option<usize>,
37-
expected_results: usize,
37+
expected_rows: usize,
3838
}
3939
impl RowGroupPruningTest {
4040
// Start building the test configuration
@@ -48,7 +48,7 @@ impl RowGroupPruningTest {
4848
expected_files_pruned_by_statistics: None,
4949
expected_row_group_matched_by_bloom_filter: None,
5050
expected_row_group_pruned_by_bloom_filter: None,
51-
expected_results: 0,
51+
expected_rows: 0,
5252
}
5353
}
5454

@@ -99,9 +99,9 @@ impl RowGroupPruningTest {
9999
self
100100
}
101101

102-
// Set the expected rows for the test
102+
/// Set the number of expected rows from the output of this test
103103
fn with_expected_rows(mut self, rows: usize) -> Self {
104-
self.expected_results = rows;
104+
self.expected_rows = rows;
105105
self
106106
}
107107

@@ -145,8 +145,10 @@ impl RowGroupPruningTest {
145145
);
146146
assert_eq!(
147147
output.result_rows,
148-
self.expected_results,
149-
"mismatched expected rows: {}",
148+
self.expected_rows,
149+
"Expected {} rows, got {}: {}",
150+
output.result_rows,
151+
self.expected_rows,
150152
output.description(),
151153
);
152154
}

datafusion/datasource-avro/src/avro_to_arrow/schema.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,8 @@ fn default_field_name(dt: &DataType) -> &str {
235235
| DataType::LargeListView(_) => {
236236
unimplemented!("View support not implemented")
237237
}
238+
DataType::Decimal32(_, _) => "decimal",
239+
DataType::Decimal64(_, _) => "decimal",
238240
DataType::Decimal128(_, _) => "decimal",
239241
DataType::Decimal256(_, _) => "decimal",
240242
}

datafusion/expr/src/utils.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -815,6 +815,8 @@ pub fn can_hash(data_type: &DataType) -> bool {
815815
DataType::Float16 => true,
816816
DataType::Float32 => true,
817817
DataType::Float64 => true,
818+
DataType::Decimal32(_, _) => true,
819+
DataType::Decimal64(_, _) => true,
818820
DataType::Decimal128(_, _) => true,
819821
DataType::Decimal256(_, _) => true,
820822
DataType::Timestamp(_, _) => true,

datafusion/proto-common/proto/datafusion_common.proto

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,19 @@ enum IntervalUnit{
136136
MonthDayNano = 2;
137137
}
138138

139-
message Decimal{
139+
message Decimal32Type {
140+
reserved 1, 2;
141+
uint32 precision = 3;
142+
int32 scale = 4;
143+
}
144+
145+
message Decimal64Type {
146+
reserved 1, 2;
147+
uint32 precision = 3;
148+
int32 scale = 4;
149+
}
150+
151+
message Decimal128Type {
140152
reserved 1, 2;
141153
uint32 precision = 3;
142154
int32 scale = 4;
@@ -286,6 +298,8 @@ message ScalarValue{
286298
ScalarNestedValue struct_value = 32;
287299
ScalarNestedValue map_value = 41;
288300

301+
Decimal32 decimal32_value = 43;
302+
Decimal64 decimal64_value = 44;
289303
Decimal128 decimal128_value = 20;
290304
Decimal256 decimal256_value = 39;
291305

@@ -310,6 +324,18 @@ message ScalarValue{
310324
}
311325
}
312326

327+
message Decimal32{
328+
bytes value = 1;
329+
int64 p = 2;
330+
int64 s = 3;
331+
}
332+
333+
message Decimal64{
334+
bytes value = 1;
335+
int64 p = 2;
336+
int64 s = 3;
337+
}
338+
313339
message Decimal128{
314340
bytes value = 1;
315341
int64 p = 2;
@@ -352,7 +378,9 @@ message ArrowType{
352378
TimeUnit TIME32 = 21 ;
353379
TimeUnit TIME64 = 22 ;
354380
IntervalUnit INTERVAL = 23 ;
355-
Decimal DECIMAL = 24 ;
381+
Decimal32Type DECIMAL32 = 40;
382+
Decimal64Type DECIMAL64 = 41;
383+
Decimal128Type DECIMAL128 = 24;
356384
Decimal256Type DECIMAL256 = 36;
357385
List LIST = 25;
358386
List LARGE_LIST = 26;
@@ -480,9 +508,7 @@ message ParquetColumnOptions {
480508
uint64 bloom_filter_ndv = 7;
481509
}
482510

483-
oneof max_statistics_size_opt {
484-
uint32 max_statistics_size = 8;
485-
}
511+
reserved 8; // used to be uint32 max_statistics_size = 8;
486512
}
487513

488514
message ParquetOptions {
@@ -521,9 +547,7 @@ message ParquetOptions {
521547
string statistics_enabled = 13;
522548
}
523549

524-
oneof max_statistics_size_opt {
525-
uint64 max_statistics_size = 14;
526-
}
550+
reserved 14; // used to be uint32 max_statistics_size = 20;
527551

528552
oneof column_index_truncate_length_opt {
529553
uint64 column_index_truncate_length = 17;

0 commit comments

Comments
 (0)