-
Notifications
You must be signed in to change notification settings - Fork 299
fix: Add support for unsigned Arrow datatypes in schema conversion #1617
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -378,7 +378,24 @@ impl ArrowSchemaVisitor for ArrowSchemaConverter { | |
DataType::Int8 | DataType::Int16 | DataType::Int32 => { | ||
Ok(Type::Primitive(PrimitiveType::Int)) | ||
} | ||
// Cast unsigned types based on bit width (following Python implementation) | ||
DataType::UInt8 | DataType::UInt16 | DataType::UInt32 => { | ||
// Cast to next larger signed type to prevent overflow | ||
let bit_width = p.primitive_width().unwrap_or(0) * 8; // Convert bytes to bits | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems superflous, can't you just match on the data types and directly map them?
|
||
match bit_width { | ||
8 | 16 => Ok(Type::Primitive(PrimitiveType::Int)), // uint8/16 → int32 | ||
32 => Ok(Type::Primitive(PrimitiveType::Long)), // uint32 → int64 | ||
_ => Ok(Type::Primitive(PrimitiveType::Int)), // fallback | ||
} | ||
} | ||
DataType::Int64 => Ok(Type::Primitive(PrimitiveType::Long)), | ||
DataType::UInt64 => { | ||
// Block uint64 - no safe casting option | ||
Err(Error::new( | ||
ErrorKind::DataInvalid, | ||
"UInt64 is not supported. Use Int64 for values ≤ 9,223,372,036,854,775,807 or Decimal(20,0) for full uint64 range.", | ||
)) | ||
} | ||
DataType::Float32 => Ok(Type::Primitive(PrimitiveType::Float)), | ||
DataType::Float64 => Ok(Type::Primitive(PrimitiveType::Double)), | ||
DataType::Decimal128(p, s) => Type::decimal(*p as u32, *s as u32).map_err(|e| { | ||
|
@@ -1717,6 +1734,60 @@ mod tests { | |
} | ||
} | ||
|
||
#[test] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this probably isn't the right module, but it would probably be nice to have a test that actually exercises writing these types and then reading them back again? |
||
fn test_unsigned_type_casting() { | ||
// Test UInt32 → Int64 casting | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is it possible to maybe parameterize the non-error cases as least with expected input/output to avoid boiler plate? |
||
{ | ||
let arrow_field = Field::new("test", DataType::UInt32, false).with_metadata( | ||
HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "1".to_string())]), | ||
); | ||
let arrow_schema = ArrowSchema::new(vec![arrow_field]); | ||
|
||
let iceberg_schema = arrow_schema_to_schema(&arrow_schema).unwrap(); | ||
|
||
// Verify UInt32 was cast to Long (int64) | ||
let iceberg_field = iceberg_schema.as_struct().fields().first().unwrap(); | ||
assert!(matches!( | ||
iceberg_field.field_type.as_ref(), | ||
Type::Primitive(PrimitiveType::Long) | ||
)); | ||
} | ||
|
||
// Test UInt8/UInt16 → Int32 casting | ||
{ | ||
let arrow_field = Field::new("test", DataType::UInt8, false).with_metadata( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: this doesn't look like this is testing UInt16? |
||
HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "1".to_string())]), | ||
); | ||
let arrow_schema = ArrowSchema::new(vec![arrow_field]); | ||
|
||
let iceberg_schema = arrow_schema_to_schema(&arrow_schema).unwrap(); | ||
|
||
// Verify UInt8 was cast to Int (int32) | ||
let iceberg_field = iceberg_schema.as_struct().fields().first().unwrap(); | ||
assert!(matches!( | ||
iceberg_field.field_type.as_ref(), | ||
Type::Primitive(PrimitiveType::Int) | ||
)); | ||
} | ||
|
||
// Test UInt64 blocking | ||
{ | ||
let arrow_field = Field::new("test", DataType::UInt64, false).with_metadata( | ||
HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "1".to_string())]), | ||
); | ||
let arrow_schema = ArrowSchema::new(vec![arrow_field]); | ||
|
||
let result = arrow_schema_to_schema(&arrow_schema); | ||
assert!(result.is_err()); | ||
assert!( | ||
result | ||
.unwrap_err() | ||
.to_string() | ||
.contains("UInt64 is not supported") | ||
); | ||
} | ||
} | ||
|
||
#[test] | ||
fn test_datum_conversion() { | ||
{ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm sure python compatibility is a direct goal here?