diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index 6bc405c27b06..d9b58eee708a 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -146,7 +146,7 @@ impl VariantArrayBuilder { } /// Append the [`Variant`] to the builder as the next row - pub fn append_variant(&mut self, variant: Variant) { + pub fn append_variant<'m, 'v>(&mut self, variant: impl Into>) { // TODO make this more efficient by avoiding the intermediate buffers let mut variant_builder = VariantBuilder::new(); variant_builder.append_value(variant); diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index b3a3d9e41f13..7ce233ed40ce 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -62,7 +62,7 @@ pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result { } /// Controls the action of the variant_get kernel. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub struct GetOptions<'a> { /// What path to extract pub path: VariantPath<'a>, @@ -75,6 +75,11 @@ pub struct GetOptions<'a> { } impl<'a> GetOptions<'a> { + /// Construct default options to get the specified path as a variant. + pub fn new() -> Self { + Default::default() + } + /// Construct options to get the specified path as a variant. pub fn new_with_path(path: VariantPath<'a>) -> Self { Self { @@ -83,17 +88,27 @@ impl<'a> GetOptions<'a> { cast_options: Default::default(), } } + + /// Specify the type to return. + pub fn with_as_type(mut self, as_type: Option) -> Self { + self.as_type = as_type; + self + } } #[cfg(test)] mod test { use std::sync::Arc; - use arrow::array::{Array, ArrayRef, StringArray}; - use parquet_variant::VariantPath; + use arrow::array::{ + Array, ArrayRef, AsArray, BinaryViewArray, Int32Array, StringArray, StructArray, + }; + use arrow::buffer::NullBuffer; + use arrow_schema::{DataType, Field, FieldRef, Fields}; + use parquet_variant::{Variant, VariantPath}; - use crate::batch_json_string_to_variant; use crate::VariantArray; + use crate::{batch_json_string_to_variant, VariantArrayBuilder}; use super::{variant_get, GetOptions}; @@ -177,4 +192,209 @@ mod test { r#"{"inner_field": 1234}"#, ); } + + /// Shredding: extract a value as a VariantArray + #[test] + fn get_variant_shredded_int32_as_variant() { + let array = shredded_int32_variant_array(); + let options = GetOptions::new(); + let result = variant_get(&array, options).unwrap(); + + // expect the result is a VariantArray + let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + assert_eq!(result.len(), 4); + + // Expect the values are the same as the original values + assert_eq!(result.value(0), Variant::Int32(34)); + assert!(!result.is_valid(1)); + assert_eq!(result.value(2), Variant::from("N/A")); + assert_eq!(result.value(3), Variant::Int32(100)); + } + + /// Shredding: extract a value as an Int32Array + #[test] + fn get_variant_shredded_int32_as_int32() { + // Extract the typed value as Int32Array + let array = shredded_int32_variant_array(); + let options = GetOptions::new() + // specify we want the typed value as Int32 + .with_as_type(Some(Field::new("typed_value", DataType::Int32, true))); + let result = variant_get(&array, options).unwrap(); + let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(34), None, None, Some(100)])); + assert_eq!(&result, &expected) + } + + /// Perfect Shredding: extract the typed value as a VariantArray + #[test] + fn get_variant_perfectly_shredded_int32_as_variant() { + let array = perfectly_shredded_int32_variant_array(); + let options = GetOptions::new(); + let result = variant_get(&array, options).unwrap(); + + // expect the result is a VariantArray + let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + assert_eq!(result.len(), 3); + + // Expect the values are the same as the original values + assert_eq!(result.value(0), Variant::Int32(1)); + assert_eq!(result.value(1), Variant::Int32(2)); + assert_eq!(result.value(2), Variant::Int32(3)); + } + + /// Shredding: Extract the typed value as Int32Array + #[test] + fn get_variant_perfectly_shredded_int32_as_int32() { + // Extract the typed value as Int32Array + let array = perfectly_shredded_int32_variant_array(); + let options = GetOptions::new() + // specify we want the typed value as Int32 + .with_as_type(Some(Field::new("typed_value", DataType::Int32, true))); + let result = variant_get(&array, options).unwrap(); + let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])); + assert_eq!(&result, &expected) + } + + /// Return a VariantArray that represents a perfectly "shredded" variant + /// for the following example (3 Variant::Int32 values): + /// + /// ```text + /// 1 + /// 2 + /// 3 + /// ``` + /// + /// The schema of the corresponding `StructArray` would look like this: + /// + /// ```text + /// StructArray { + /// metadata: BinaryViewArray, + /// typed_value: Int32Array, + /// } + /// ``` + fn perfectly_shredded_int32_variant_array() -> ArrayRef { + // At the time of writing, the `VariantArrayBuilder` does not support shredding. + // so we must construct the array manually. see https://github.com/apache/arrow-rs/issues/7895 + let (metadata, _value) = { parquet_variant::VariantBuilder::new().finish() }; + + let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 3)); + let typed_value = Int32Array::from(vec![Some(1), Some(2), Some(3)]); + + let struct_array = StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata)) + .with_field("typed_value", Arc::new(typed_value)) + .build(); + + Arc::new( + VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), + ) + } + + /// Return a VariantArray that represents a normal "shredded" variant + /// for the following example + /// + /// Based on the example from [the doc] + /// + /// [the doc]: https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?tab=t.0 + /// + /// ```text + /// 34 + /// null (an Arrow NULL, not a Variant::Null) + /// "n/a" (a string) + /// 100 + /// ``` + /// + /// The schema of the corresponding `StructArray` would look like this: + /// + /// ```text + /// StructArray { + /// metadata: BinaryViewArray, + /// value: BinaryViewArray, + /// typed_value: Int32Array, + /// } + /// ``` + fn shredded_int32_variant_array() -> ArrayRef { + // At the time of writing, the `VariantArrayBuilder` does not support shredding. + // so we must construct the array manually. see https://github.com/apache/arrow-rs/issues/7895 + let (metadata, string_value) = { + let mut builder = parquet_variant::VariantBuilder::new(); + builder.append_value("n/a"); + builder.finish() + }; + + let nulls = NullBuffer::from(vec![ + true, // row 0 non null + false, // row 1 is null + true, // row 2 non null + true, // row 3 non null + ]); + + // metadata is the same for all rows + let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); + + // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY + // about why row1 is an empty but non null, value. + let values = BinaryViewArray::from(vec![ + None, // row 0 is shredded, so no value + Some(b"" as &[u8]), // row 1 is null, so empty value (why?) + Some(&string_value), // copy the string value "N/A" + None, // row 3 is shredded, so no value + ]); + + let typed_value = Int32Array::from(vec![ + Some(34), // row 0 is shredded, so it has a value + None, // row 1 is null, so no value + None, // row 2 is a string, so no typed value + Some(100), // row 3 is shredded, so it has a value + ]); + + let struct_array = StructArrayBuilder::new() + .with_field("metadata", metadata) + .with_field("typed_value", Arc::new(typed_value)) + .with_field("value", Arc::new(values)) + .with_nulls(nulls) + .build(); + + Arc::new( + VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), + ) + } + + /// Builds struct arrays from component fields + /// + /// TODO: move to arrow crate + #[derive(Debug, Default, Clone)] + struct StructArrayBuilder { + fields: Vec, + arrays: Vec, + nulls: Option, + } + + impl StructArrayBuilder { + fn new() -> Self { + Default::default() + } + + /// Add an array to this struct array as a field with the specified name. + fn with_field(mut self, field_name: &str, array: ArrayRef) -> Self { + let field = Field::new(field_name, array.data_type().clone(), true); + self.fields.push(Arc::new(field)); + self.arrays.push(array); + self + } + + /// Set the null buffer for this struct array. + fn with_nulls(mut self, nulls: NullBuffer) -> Self { + self.nulls = Some(nulls); + self + } + + pub fn build(self) -> StructArray { + let Self { + fields, + arrays, + nulls, + } = self; + StructArray::new(Fields::from(fields), arrays, nulls) + } + } } diff --git a/parquet-variant/src/path.rs b/parquet-variant/src/path.rs index 42dbdb3abc2d..073433393a7c 100644 --- a/parquet-variant/src/path.rs +++ b/parquet-variant/src/path.rs @@ -63,7 +63,7 @@ use std::{borrow::Cow, ops::Deref}; /// .join("baz"); /// assert_eq!(path[1], VariantPathElement::field("bar")); /// ``` -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Default)] pub struct VariantPath<'a>(Vec>); impl<'a> VariantPath<'a> {