Skip to content

[Variant] WIP Tests for variant_get of shredded variants #7965

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion parquet-variant-compute/src/variant_array_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ impl VariantArrayBuilder {
}

/// Append the [`Variant`] to the builder as the next row
pub fn append_variant(&mut self, variant: Variant) {
pub fn append_variant<'m, 'v>(&mut self, variant: impl Into<Variant<'m, 'v>>) {
// TODO make this more efficient by avoiding the intermediate buffers
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we implement the approach @scovich suggested about invoking primitive append_xxx methods directly instead of using intermediate VariantBuilder?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

something like

pub fn append_variant<'m, 'v>(&mut self, variant: impl Into<Variant<'m, 'v>>) {
    let variant = variant.into();
    match variant {
        Variant::Int32(v) => self.append_int32(v),
        Variant::String(v) => self.append_string(v),
        Variant::Object(obj) => {
            // Build object directly into our buffers without intermediate VariantBuilder
            self.append_object_start();
            for field in obj.iter() {
                self.append_variant(field.value()); // recursive
            }
            self.append_object_end();
        }
        // ... other variants
    }
}

Copy link
Contributor Author

@alamb alamb Jul 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you are concerned about the intermediate buffers, I think after the following two PRs are merged:

There will be a way to append variant values to the final output buffer directly without copying to intermediate buffers

Is there another potential part of your question?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it, will follow these PRs then

let mut variant_builder = VariantBuilder::new();
variant_builder.append_value(variant);
Expand Down
228 changes: 224 additions & 4 deletions parquet-variant-compute/src/variant_get.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result<ArrayRef> {
}

/// Controls the action of the variant_get kernel.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Default)]
pub struct GetOptions<'a> {
/// What path to extract
pub path: VariantPath<'a>,
Expand All @@ -75,6 +75,11 @@ pub struct GetOptions<'a> {
}

impl<'a> GetOptions<'a> {
/// Construct default options to get the specified path as a variant.
pub fn new() -> Self {
Default::default()
}

/// Construct options to get the specified path as a variant.
pub fn new_with_path(path: VariantPath<'a>) -> Self {
Self {
Expand All @@ -83,17 +88,27 @@ impl<'a> GetOptions<'a> {
cast_options: Default::default(),
}
}

/// Specify the type to return.
pub fn with_as_type(mut self, as_type: Option<Field>) -> Self {
self.as_type = as_type;
self
}
}

#[cfg(test)]
mod test {
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, StringArray};
use parquet_variant::VariantPath;
use arrow::array::{
Array, ArrayRef, AsArray, BinaryViewArray, Int32Array, StringArray, StructArray,
};
use arrow::buffer::NullBuffer;
use arrow_schema::{DataType, Field, FieldRef, Fields};
use parquet_variant::{Variant, VariantPath};

use crate::batch_json_string_to_variant;
use crate::VariantArray;
use crate::{batch_json_string_to_variant, VariantArrayBuilder};

use super::{variant_get, GetOptions};

Expand Down Expand Up @@ -177,4 +192,209 @@ mod test {
r#"{"inner_field": 1234}"#,
);
}

/// Shredding: extract a value as a VariantArray
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR just adds tests here -- to show what I think the output of variant_get should be for different variant cases.

#[test]
fn get_variant_shredded_int32_as_variant() {
let array = shredded_int32_variant_array();
let options = GetOptions::new();
let result = variant_get(&array, options).unwrap();

// expect the result is a VariantArray
let result: &VariantArray = result.as_any().downcast_ref().unwrap();
assert_eq!(result.len(), 4);

// Expect the values are the same as the original values
assert_eq!(result.value(0), Variant::Int32(34));
assert!(!result.is_valid(1));
assert_eq!(result.value(2), Variant::from("N/A"));
assert_eq!(result.value(3), Variant::Int32(100));
}

/// Shredding: extract a value as an Int32Array
#[test]
fn get_variant_shredded_int32_as_int32() {
// Extract the typed value as Int32Array
let array = shredded_int32_variant_array();
let options = GetOptions::new()
// specify we want the typed value as Int32
.with_as_type(Some(Field::new("typed_value", DataType::Int32, true)));
let result = variant_get(&array, options).unwrap();
let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(34), None, None, Some(100)]));
assert_eq!(&result, &expected)
}

/// Perfect Shredding: extract the typed value as a VariantArray
#[test]
fn get_variant_perfectly_shredded_int32_as_variant() {
let array = perfectly_shredded_int32_variant_array();
let options = GetOptions::new();
let result = variant_get(&array, options).unwrap();

// expect the result is a VariantArray
let result: &VariantArray = result.as_any().downcast_ref().unwrap();
assert_eq!(result.len(), 3);

// Expect the values are the same as the original values
assert_eq!(result.value(0), Variant::Int32(1));
Copy link
Member

@klion26 klion26 Jul 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the GetOptions points to the leaf node, will the result here be equal to the result in test get_variant_perfectly_shredded_int32_as_int32

assert_eq!(result.value(1), Variant::Int32(2));
assert_eq!(result.value(2), Variant::Int32(3));
}

/// Shredding: Extract the typed value as Int32Array
#[test]
fn get_variant_perfectly_shredded_int32_as_int32() {
// Extract the typed value as Int32Array
let array = perfectly_shredded_int32_variant_array();
let options = GetOptions::new()
// specify we want the typed value as Int32
.with_as_type(Some(Field::new("typed_value", DataType::Int32, true)));
let result = variant_get(&array, options).unwrap();
let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)]));
assert_eq!(&result, &expected)
}

/// Return a VariantArray that represents a perfectly "shredded" variant
/// for the following example (3 Variant::Int32 values):
///
/// ```text
/// 1
/// 2
/// 3
/// ```
///
/// The schema of the corresponding `StructArray` would look like this:
///
/// ```text
/// StructArray {
/// metadata: BinaryViewArray,
/// typed_value: Int32Array,
/// }
/// ```
fn perfectly_shredded_int32_variant_array() -> ArrayRef {
// At the time of writing, the `VariantArrayBuilder` does not support shredding.
// so we must construct the array manually. see https://github.com/apache/arrow-rs/issues/7895
let (metadata, _value) = { parquet_variant::VariantBuilder::new().finish() };

let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 3));
let typed_value = Int32Array::from(vec![Some(1), Some(2), Some(3)]);

let struct_array = StructArrayBuilder::new()
.with_field("metadata", Arc::new(metadata))
.with_field("typed_value", Arc::new(typed_value))
.build();

Arc::new(
VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"),
)
Comment on lines +282 to +289
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From what I understood of the spec, the value field needs to be present but it can have nulls. Did I understand it incorrectly? If not, this struct would be an invalid variant

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My reading of https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/ was that value was optional in the case the variant shredded "perfectly"

Screenshot 2025-07-18 at 3 04 18 PM

I may also be misreading it.

Maybe @zeroshade has some time to coment

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The shredding spec definitely says both columns are optional:

Both value and typed_value are optional fields used together to encode a single value.

Additionally:

Values in the two fields must be interpreted according to the following table:

value typed_value Meaning
null null The value is missing; only valid for shredded object fields
non-null null The value is present and may be any type, including null
null non-null The value is present and is the shredded type
non-null non-null The value is present and is a partially shredded object

As always in parquet, a physically missing column is interpreted as all-null.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CC @cashmand in case I totally misunderstood something

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, the intent is that a missing value column should be treated as equivalent to a present-but-all-null value column.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct, both are optional fields, but it's required to have at least one of them.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it. Thanks all!

}

/// Return a VariantArray that represents a normal "shredded" variant
/// for the following example
///
/// Based on the example from [the doc]
///
/// [the doc]: https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?tab=t.0
///
/// ```text
/// 34
/// null (an Arrow NULL, not a Variant::Null)
/// "n/a" (a string)
/// 100
/// ```
///
/// The schema of the corresponding `StructArray` would look like this:
///
/// ```text
/// StructArray {
/// metadata: BinaryViewArray,
/// value: BinaryViewArray,
/// typed_value: Int32Array,
/// }
/// ```
fn shredded_int32_variant_array() -> ArrayRef {
// At the time of writing, the `VariantArrayBuilder` does not support shredding.
// so we must construct the array manually. see https://github.com/apache/arrow-rs/issues/7895
let (metadata, string_value) = {
let mut builder = parquet_variant::VariantBuilder::new();
builder.append_value("n/a");
builder.finish()
};

let nulls = NullBuffer::from(vec![
true, // row 0 non null
false, // row 1 is null
true, // row 2 non null
true, // row 3 non null
]);

// metadata is the same for all rows
let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4));

// See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY
// about why row1 is an empty but non null, value.
let values = BinaryViewArray::from(vec![
None, // row 0 is shredded, so no value
Some(b"" as &[u8]), // row 1 is null, so empty value (why?)
Some(&string_value), // copy the string value "N/A"
None, // row 3 is shredded, so no value
]);

let typed_value = Int32Array::from(vec![
Some(34), // row 0 is shredded, so it has a value
None, // row 1 is null, so no value
None, // row 2 is a string, so no typed value
Some(100), // row 3 is shredded, so it has a value
]);

let struct_array = StructArrayBuilder::new()
.with_field("metadata", metadata)
.with_field("typed_value", Arc::new(typed_value))
.with_field("value", Arc::new(values))
.with_nulls(nulls)
.build();

Arc::new(
VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"),
)
}

/// Builds struct arrays from component fields
///
/// TODO: move to arrow crate
#[derive(Debug, Default, Clone)]
struct StructArrayBuilder {
fields: Vec<FieldRef>,
arrays: Vec<ArrayRef>,
nulls: Option<NullBuffer>,
}

impl StructArrayBuilder {
fn new() -> Self {
Default::default()
}

/// Add an array to this struct array as a field with the specified name.
fn with_field(mut self, field_name: &str, array: ArrayRef) -> Self {
let field = Field::new(field_name, array.data_type().clone(), true);
self.fields.push(Arc::new(field));
self.arrays.push(array);
self
}

/// Set the null buffer for this struct array.
fn with_nulls(mut self, nulls: NullBuffer) -> Self {
self.nulls = Some(nulls);
self
}

pub fn build(self) -> StructArray {
let Self {
fields,
arrays,
nulls,
} = self;
StructArray::new(Fields::from(fields), arrays, nulls)
}
}
}
2 changes: 1 addition & 1 deletion parquet-variant/src/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ use std::{borrow::Cow, ops::Deref};
/// .join("baz");
/// assert_eq!(path[1], VariantPathElement::field("bar"));
/// ```
#[derive(Debug, Clone, PartialEq)]
#[derive(Debug, Clone, PartialEq, Default)]
pub struct VariantPath<'a>(Vec<VariantPathElement<'a>>);

impl<'a> VariantPath<'a> {
Expand Down
Loading