Skip to content

Commit de9d386

Browse files
committed
[FIX] fix variant_array implementation
1 parent 314a599 commit de9d386

File tree

3 files changed

+183
-79
lines changed

3 files changed

+183
-79
lines changed

parquet-variant-compute/examples/field_removal.rs

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,4 @@ fn main() {
107107
}
108108
}
109109

110-
println!("\n=== Performance Features ===");
111-
println!("✓ Efficient field removal at byte level");
112-
println!("✓ Support for nested field removal");
113-
println!("✓ Batch operations for cleaning multiple fields");
114-
println!("✓ Maintains data integrity during field removal");
115-
println!("✓ Foundation for data governance and privacy compliance");
116110
}

parquet-variant-compute/src/from_json.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<VariantArray, Ar
5454
#[cfg(test)]
5555
mod test {
5656
use crate::batch_json_string_to_variant;
57-
use arrow::array::{Array, ArrayRef, StringArray};
57+
use arrow::array::{Array, ArrayRef, AsArray, StringArray};
5858
use arrow_schema::ArrowError;
5959
use std::sync::Arc;
6060

@@ -103,10 +103,10 @@ mod test {
103103
assert!(!value_array.is_null(4));
104104

105105
// Null rows should have 0-length metadata and value
106-
assert_eq!(metadata_array.value(1).len(), 0);
107-
assert_eq!(value_array.value(1).len(), 0);
108-
assert_eq!(metadata_array.value(4).len(), 0);
109-
assert_eq!(value_array.value(4).len(), 0);
106+
assert_eq!(metadata_array.as_binary_view().value(1).len(), 0);
107+
assert_eq!(value_array.as_binary_view().value(1).len(), 0);
108+
assert_eq!(metadata_array.as_binary_view().value(4).len(), 0);
109+
assert_eq!(value_array.as_binary_view().value(4).len(), 0);
110110
Ok(())
111111
}
112112
}

parquet-variant-compute/src/variant_array.rs

Lines changed: 178 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,34 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
//! [`VariantArray`] implementation with hybrid byte-level and high-level APIs
18+
//! [`VariantArray`] implementation
1919
2020
use crate::field_operations::FieldOperations;
21-
use arrow::array::{Array, ArrayData, BinaryViewArray, StructArray};
21+
use arrow::array::{Array, ArrayData, ArrayRef, AsArray, StructArray};
2222
use arrow::buffer::NullBuffer;
23-
use arrow::datatypes::DataType;
24-
use arrow::error::ArrowError;
25-
use parquet_variant::{Variant, VariantMetadata};
23+
use arrow_schema::{ArrowError, DataType};
24+
use parquet_variant::Variant;
2625
use std::any::Any;
2726
use std::sync::Arc;
2827

29-
/// Array implementation for variant data with hybrid byte-level and high-level APIs
28+
/// An array of Parquet [`Variant`] values
29+
///
30+
/// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying
31+
/// `metadata` and `value` fields, and adds convenience methods to access
32+
/// the `Variant`s
33+
///
34+
/// See [`VariantArrayBuilder`] for constructing a `VariantArray`.
35+
///
36+
/// [`VariantArrayBuilder`]: crate::VariantArrayBuilder
37+
///
38+
/// # Specification
39+
///
40+
/// 1. This code follows the conventions for storing variants in Arrow `StructArray`
41+
/// defined by [Extension Type for Parquet Variant arrow] and this [document].
42+
/// At the time of this writing, this is not yet a standardized Arrow extension type.
43+
///
44+
/// [Extension Type for Parquet Variant arrow]: https://github.com/apache/arrow/issues/46908
45+
/// [document]: https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?usp=sharing
3046
#[derive(Debug)]
3147
pub struct VariantArray {
3248
/// StructArray of up to three fields:
@@ -53,12 +69,29 @@ pub struct VariantArray {
5369
}
5470

5571
impl VariantArray {
56-
/// Create a new VariantArray from a StructArray
57-
pub fn try_new(inner: Arc<StructArray>) -> Result<Self, ArrowError> {
58-
// Validate that the struct has the expected format
59-
if inner.num_columns() != 2 {
72+
/// Creates a new `VariantArray` from a [`StructArray`].
73+
///
74+
/// # Arguments
75+
/// - `inner` - The underlying [`StructArray`] that contains the variant data.
76+
///
77+
/// # Returns
78+
/// - A new instance of `VariantArray`.
79+
///
80+
/// # Errors:
81+
/// - If the `StructArray` does not contain the required fields
82+
///
83+
/// # Current support
84+
/// This structure does not (yet) support the full Arrow Variant Array specification.
85+
///
86+
/// Only `StructArrays` with `metadata` and `value` fields that are
87+
/// [`BinaryViewArray`] are supported. Shredded values are not currently supported
88+
/// nor are using types other than `BinaryViewArray`
89+
///
90+
/// [`BinaryViewArray`]: arrow::array::BinaryViewArray
91+
pub fn try_new(inner: ArrayRef) -> Result<Self, ArrowError> {
92+
let Some(inner) = inner.as_struct_opt() else {
6093
return Err(ArrowError::InvalidArgumentError(
61-
"Expected struct with exactly 2 columns (metadata, value)".to_string(),
94+
"Invalid VariantArray: requires StructArray as input".to_string(),
6295
));
6396
};
6497
// Ensure the StructArray has a metadata field of BinaryView
@@ -160,50 +193,49 @@ impl VariantArray {
160193

161194
Ok(Self { inner })
162195
}
163-
164-
/// Get the metadata field as a BinaryViewArray
165-
pub fn metadata_field(&self) -> &BinaryViewArray {
166-
self.inner.column(0)
167-
.as_any()
168-
.downcast_ref::<BinaryViewArray>()
169-
.expect("Expected metadata field to be BinaryViewArray")
196+
197+
/// Returns a reference to the underlying [`StructArray`].
198+
pub fn inner(&self) -> &StructArray {
199+
&self.inner
170200
}
171-
172-
/// Get the value field as a BinaryViewArray
173-
pub fn value_field(&self) -> &BinaryViewArray {
174-
self.inner.column(1)
175-
.as_any()
176-
.downcast_ref::<BinaryViewArray>()
177-
.expect("Expected value field to be BinaryViewArray")
201+
202+
/// Returns the inner [`StructArray`], consuming self
203+
pub fn into_inner(self) -> StructArray {
204+
self.inner
178205
}
179-
206+
207+
/// Return the [`Variant`] instance stored at the given row
208+
///
209+
/// Panics if the index is out of bounds.
210+
///
211+
/// Note: Does not do deep validation of the [`Variant`], so it is up to the
212+
/// caller to ensure that the metadata and value were constructed correctly.
213+
pub fn value(&self, index: usize) -> Variant {
214+
let metadata = self.metadata_field().as_binary_view().value(index);
215+
let value = self.value_field().as_binary_view().value(index);
216+
Variant::new(metadata, value)
217+
}
218+
219+
/// Return a reference to the metadata field of the [`StructArray`]
220+
pub fn metadata_field(&self) -> &ArrayRef {
221+
// spec says fields order is not guaranteed, so we search by name
222+
self.inner.column_by_name("metadata").unwrap()
223+
}
224+
225+
/// Return a reference to the value field of the `StructArray`
226+
pub fn value_field(&self) -> &ArrayRef {
227+
// spec says fields order is not guaranteed, so we search by name
228+
self.inner.column_by_name("value").unwrap()
229+
}
230+
180231
/// Get the metadata bytes for a specific index
181232
pub fn metadata(&self, index: usize) -> &[u8] {
182-
self.metadata_field().value(index).as_ref()
233+
self.metadata_field().as_binary_view().value(index).as_ref()
183234
}
184235

185236
/// Get the value bytes for a specific index
186237
pub fn value_bytes(&self, index: usize) -> &[u8] {
187-
self.value_field().value(index).as_ref()
188-
}
189-
190-
/// Get the parsed variant at a specific index
191-
pub fn value(&self, index: usize) -> Variant {
192-
if index >= self.len() {
193-
panic!("Index {} out of bounds for array of length {}", index, self.len());
194-
}
195-
196-
if self.is_null(index) {
197-
return Variant::Null;
198-
}
199-
200-
let metadata = self.metadata(index);
201-
let value = self.value_bytes(index);
202-
203-
let variant_metadata = VariantMetadata::try_new(metadata)
204-
.expect("Failed to parse variant metadata");
205-
Variant::try_new_with_metadata(variant_metadata, value)
206-
.expect("Failed to create variant from metadata and value")
238+
self.value_field().as_binary_view().value(index).as_ref()
207239
}
208240

209241
/// Get value at a specific path for the variant at the given index
@@ -342,18 +374,15 @@ impl Array for VariantArray {
342374
fn as_any(&self) -> &dyn Any {
343375
self
344376
}
345-
377+
346378
fn to_data(&self) -> ArrayData {
347379
self.inner.to_data()
348380
}
349-
381+
350382
fn into_data(self) -> ArrayData {
351-
match Arc::try_unwrap(self.inner) {
352-
Ok(inner) => inner.into_data(),
353-
Err(inner) => inner.to_data(),
354-
}
383+
self.inner.into_data()
355384
}
356-
385+
357386
fn data_type(&self) -> &DataType {
358387
self.inner.data_type()
359388
}
@@ -368,38 +397,111 @@ impl Array for VariantArray {
368397
value_ref: val,
369398
})
370399
}
371-
400+
372401
fn len(&self) -> usize {
373402
self.inner.len()
374403
}
375-
404+
376405
fn is_empty(&self) -> bool {
377406
self.inner.is_empty()
378407
}
379-
380-
fn nulls(&self) -> Option<&NullBuffer> {
381-
self.inner.nulls()
382-
}
383-
408+
384409
fn offset(&self) -> usize {
385410
self.inner.offset()
386411
}
387-
412+
413+
fn nulls(&self) -> Option<&NullBuffer> {
414+
self.inner.nulls()
415+
}
416+
388417
fn get_buffer_memory_size(&self) -> usize {
389418
self.inner.get_buffer_memory_size()
390419
}
391-
420+
392421
fn get_array_memory_size(&self) -> usize {
393422
self.inner.get_array_memory_size()
394423
}
395424
}
396425

397426
#[cfg(test)]
398-
mod tests {
427+
mod test {
399428
use super::*;
400429
use crate::variant_array_builder::VariantArrayBuilder;
430+
use arrow::array::{BinaryArray, BinaryViewArray};
431+
use arrow_schema::{Field, Fields};
401432
use parquet_variant::VariantBuilder;
402433

434+
#[test]
435+
fn invalid_not_a_struct_array() {
436+
let array = make_binary_view_array();
437+
// Should fail because the input is not a StructArray
438+
let err = VariantArray::try_new(array);
439+
assert_eq!(
440+
err.unwrap_err().to_string(),
441+
"Invalid argument error: Invalid VariantArray: requires StructArray as input"
442+
);
443+
}
444+
445+
#[test]
446+
fn invalid_missing_metadata() {
447+
let fields = Fields::from(vec![Field::new("value", DataType::BinaryView, true)]);
448+
let array = StructArray::new(fields, vec![make_binary_view_array()], None);
449+
// Should fail because the StructArray does not contain a 'metadata' field
450+
let err = VariantArray::try_new(Arc::new(array));
451+
assert_eq!(
452+
err.unwrap_err().to_string(),
453+
"Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field"
454+
);
455+
}
456+
457+
#[test]
458+
fn invalid_missing_value() {
459+
let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
460+
let array = StructArray::new(fields, vec![make_binary_view_array()], None);
461+
// Should fail because the StructArray does not contain a 'value' field
462+
let err = VariantArray::try_new(Arc::new(array));
463+
assert_eq!(
464+
err.unwrap_err().to_string(),
465+
"Invalid argument error: Invalid VariantArray: StructArray must contain a 'value' field"
466+
);
467+
}
468+
469+
#[test]
470+
fn invalid_metadata_field_type() {
471+
let fields = Fields::from(vec![
472+
Field::new("metadata", DataType::Binary, true), // Not yet supported
473+
Field::new("value", DataType::BinaryView, true),
474+
]);
475+
let array = StructArray::new(
476+
fields,
477+
vec![make_binary_array(), make_binary_view_array()],
478+
None,
479+
);
480+
let err = VariantArray::try_new(Arc::new(array));
481+
assert_eq!(
482+
err.unwrap_err().to_string(),
483+
"Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Binary"
484+
);
485+
}
486+
487+
#[test]
488+
fn invalid_value_field_type() {
489+
let fields = Fields::from(vec![
490+
Field::new("metadata", DataType::BinaryView, true),
491+
Field::new("value", DataType::Binary, true), // Not yet supported
492+
]);
493+
let array = StructArray::new(
494+
fields,
495+
vec![make_binary_view_array(), make_binary_array()],
496+
None,
497+
);
498+
let err = VariantArray::try_new(Arc::new(array));
499+
assert_eq!(
500+
err.unwrap_err().to_string(),
501+
"Not yet implemented: VariantArray 'value' field must be BinaryView, got Binary"
502+
);
503+
}
504+
403505
fn create_test_variant_array() -> VariantArray {
404506
let mut builder = VariantArrayBuilder::new(2);
405507

@@ -510,10 +612,18 @@ mod tests {
510612
assert_eq!(value_field.len(), 2);
511613

512614
// Check that metadata and value bytes are non-empty
513-
assert!(!metadata_field.value(0).is_empty());
514-
assert!(!value_field.value(0).is_empty());
515-
assert!(!metadata_field.value(1).is_empty());
516-
assert!(!value_field.value(1).is_empty());
615+
assert!(!metadata_field.as_binary_view().value(0).is_empty());
616+
assert!(!value_field.as_binary_view().value(0).is_empty());
617+
assert!(!metadata_field.as_binary_view().value(1).is_empty());
618+
assert!(!value_field.as_binary_view().value(1).is_empty());
619+
}
620+
621+
fn make_binary_view_array() -> ArrayRef {
622+
Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]]))
623+
}
624+
625+
fn make_binary_array() -> ArrayRef {
626+
Arc::new(BinaryArray::from(vec![b"test" as &[u8]]))
517627
}
518628
}
519629

0 commit comments

Comments
 (0)