Skip to content

Commit dda30ea

Browse files
committed
[FIX] fix issues with the spec
1 parent 397c717 commit dda30ea

File tree

1 file changed

+70
-62
lines changed

1 file changed

+70
-62
lines changed

parquet-variant-compute/src/variant_parser.rs

Lines changed: 70 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -167,9 +167,10 @@ impl VariantParser {
167167
pub fn parse_short_string_header(header_byte: u8) -> Result<ShortStringHeader, ArrowError> {
168168
let length = (header_byte >> 2) as usize;
169169

170-
if length > 13 {
170+
// Short strings can be up to 64 bytes (6-bit value: 0-63)
171+
if length > 63 {
171172
return Err(ArrowError::InvalidArgumentError(format!(
172-
"Short string length {} exceeds maximum of 13",
173+
"Short string length {} exceeds maximum of 63",
173174
length
174175
)));
175176
}
@@ -307,22 +308,23 @@ impl VariantParser {
307308
}
308309

309310
/// Get the data length for a primitive type
310-
pub fn get_primitive_data_length(primitive_type: &PrimitiveType) -> usize {
311+
/// Returns Some(len) for fixed-length types, None for variable-length types
312+
pub fn get_primitive_data_length(primitive_type: &PrimitiveType) -> Option<usize> {
311313
match primitive_type {
312-
PrimitiveType::Null | PrimitiveType::True | PrimitiveType::False => 0,
313-
PrimitiveType::Int8 => 1,
314-
PrimitiveType::Int16 => 2,
314+
PrimitiveType::Null | PrimitiveType::True | PrimitiveType::False => Some(0),
315+
PrimitiveType::Int8 => Some(1),
316+
PrimitiveType::Int16 => Some(2),
315317
PrimitiveType::Int32
316318
| PrimitiveType::Float
317319
| PrimitiveType::Decimal4
318-
| PrimitiveType::Date => 4,
320+
| PrimitiveType::Date => Some(4),
319321
PrimitiveType::Int64
320322
| PrimitiveType::Double
321323
| PrimitiveType::Decimal8
322324
| PrimitiveType::TimestampNtz
323-
| PrimitiveType::TimestampLtz => 8,
324-
PrimitiveType::Decimal16 => 16,
325-
PrimitiveType::Binary | PrimitiveType::String => 0, // Variable length, need to read from data
325+
| PrimitiveType::TimestampLtz => Some(8),
326+
PrimitiveType::Decimal16 => Some(16),
327+
PrimitiveType::Binary | PrimitiveType::String => None, // Variable length, need to read from data
326328
}
327329
}
328330

@@ -357,43 +359,41 @@ impl VariantParser {
357359
let primitive_type = Self::parse_primitive_header(value_bytes[0])?;
358360
let data_length = Self::get_primitive_data_length(&primitive_type);
359361

360-
if data_length == 0 {
361-
// Handle variable length types and null/boolean
362-
match primitive_type {
363-
PrimitiveType::Null | PrimitiveType::True | PrimitiveType::False => Ok(&[]),
364-
PrimitiveType::Binary | PrimitiveType::String => {
365-
// These require reading length from the data
366-
if value_bytes.len() < 5 {
367-
return Err(ArrowError::InvalidArgumentError(
368-
"Not enough bytes for variable length primitive".to_string(),
369-
));
370-
}
371-
let length = u32::from_le_bytes([
372-
value_bytes[1],
373-
value_bytes[2],
374-
value_bytes[3],
375-
value_bytes[4],
376-
]) as usize;
377-
if value_bytes.len() < 5 + length {
378-
return Err(ArrowError::InvalidArgumentError(
379-
"Variable length primitive data exceeds available bytes".to_string(),
380-
));
381-
}
382-
Ok(&value_bytes[5..5 + length])
362+
match data_length {
363+
Some(0) => {
364+
// Fixed-length 0-byte types (null/true/false)
365+
Ok(&[])
366+
}
367+
Some(len) => {
368+
// Fixed-length types with len bytes
369+
if value_bytes.len() < 1 + len {
370+
return Err(ArrowError::InvalidArgumentError(format!(
371+
"Fixed length primitive data length {} exceeds available bytes",
372+
len
373+
)));
383374
}
384-
_ => Err(ArrowError::InvalidArgumentError(format!(
385-
"Unhandled primitive type: {:?}",
386-
primitive_type
387-
))),
375+
Ok(&value_bytes[1..1 + len])
388376
}
389-
} else {
390-
if value_bytes.len() < 1 + data_length {
391-
return Err(ArrowError::InvalidArgumentError(format!(
392-
"Primitive data length {} exceeds available bytes",
393-
data_length
394-
)));
377+
None => {
378+
// Variable-length types (binary/string) - read length from data
379+
if value_bytes.len() < 5 {
380+
return Err(ArrowError::InvalidArgumentError(
381+
"Not enough bytes for variable length primitive".to_string(),
382+
));
383+
}
384+
let length = u32::from_le_bytes([
385+
value_bytes[1],
386+
value_bytes[2],
387+
value_bytes[3],
388+
value_bytes[4],
389+
]) as usize;
390+
if value_bytes.len() < 5 + length {
391+
return Err(ArrowError::InvalidArgumentError(
392+
"Variable length primitive data exceeds available bytes".to_string(),
393+
));
394+
}
395+
Ok(&value_bytes[5..5 + length])
395396
}
396-
Ok(&value_bytes[1..1 + data_length])
397397
}
398398
}
399399

@@ -500,14 +500,17 @@ mod tests {
500500
ShortStringHeader { length: 5 }
501501
);
502502

503-
// Test 13-length short string (maximum)
503+
// Test 63-length short string (maximum for 6-bit value)
504504
assert_eq!(
505-
VariantParser::parse_short_string_header(0b00110101).unwrap(),
506-
ShortStringHeader { length: 13 }
505+
VariantParser::parse_short_string_header(0b11111101).unwrap(),
506+
ShortStringHeader { length: 63 }
507507
);
508508

509-
// Test invalid length > 13
510-
assert!(VariantParser::parse_short_string_header(0b00111001).is_err());
509+
// Test that all values 0-63 are valid
510+
for length in 0..=63 {
511+
let header_byte = (length << 2) | 1; // short string type
512+
assert!(VariantParser::parse_short_string_header(header_byte as u8).is_ok());
513+
}
511514
}
512515

513516
#[test]
@@ -564,50 +567,55 @@ mod tests {
564567

565568
#[test]
566569
fn test_get_primitive_data_length() {
570+
// Test fixed-length 0-byte types
567571
assert_eq!(
568572
VariantParser::get_primitive_data_length(&PrimitiveType::Null),
569-
0
573+
Some(0)
570574
);
571575
assert_eq!(
572576
VariantParser::get_primitive_data_length(&PrimitiveType::True),
573-
0
577+
Some(0)
574578
);
575579
assert_eq!(
576580
VariantParser::get_primitive_data_length(&PrimitiveType::False),
577-
0
581+
Some(0)
578582
);
583+
584+
// Test fixed-length types with specific byte counts
579585
assert_eq!(
580586
VariantParser::get_primitive_data_length(&PrimitiveType::Int8),
581-
1
587+
Some(1)
582588
);
583589
assert_eq!(
584590
VariantParser::get_primitive_data_length(&PrimitiveType::Int16),
585-
2
591+
Some(2)
586592
);
587593
assert_eq!(
588594
VariantParser::get_primitive_data_length(&PrimitiveType::Int32),
589-
4
595+
Some(4)
590596
);
591597
assert_eq!(
592598
VariantParser::get_primitive_data_length(&PrimitiveType::Int64),
593-
8
599+
Some(8)
594600
);
595601
assert_eq!(
596602
VariantParser::get_primitive_data_length(&PrimitiveType::Double),
597-
8
603+
Some(8)
598604
);
599605
assert_eq!(
600606
VariantParser::get_primitive_data_length(&PrimitiveType::Decimal16),
601-
16
607+
Some(16)
602608
);
609+
610+
// Test variable-length types (should return None)
603611
assert_eq!(
604612
VariantParser::get_primitive_data_length(&PrimitiveType::Binary),
605-
0
606-
); // Variable length
613+
None
614+
);
607615
assert_eq!(
608616
VariantParser::get_primitive_data_length(&PrimitiveType::String),
609-
0
610-
); // Variable length
617+
None
618+
);
611619
}
612620

613621
#[test]

0 commit comments

Comments
 (0)