Skip to content

Commit cce2453

Browse files
Initial plumbing into variant_array and builder
1 parent ef79fe0 commit cce2453

File tree

6 files changed

+71
-54
lines changed

6 files changed

+71
-54
lines changed

parquet-variant-compute/benches/variant_kernels.rs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
use arrow::array::{Array, ArrayRef, StringArray};
1919
use arrow::util::test_util::seedable_rng;
20+
use arrow_schema::{DataType, Field, Fields};
2021
use criterion::{criterion_group, criterion_main, Criterion};
2122
use parquet_variant::{Variant, VariantBuilder};
2223
use parquet_variant_compute::variant_get::{variant_get, GetOptions};
@@ -27,6 +28,14 @@ use rand::Rng;
2728
use rand::SeedableRng;
2829
use std::fmt::Write;
2930
use std::sync::Arc;
31+
32+
fn unshredded_schema_fields() -> Fields {
33+
let metadata_field = Field::new("metadata", DataType::BinaryView, false);
34+
let value_field = Field::new("value", DataType::BinaryView, false);
35+
36+
Fields::from(vec![metadata_field, value_field])
37+
}
38+
3039
fn benchmark_batch_json_string_to_variant(c: &mut Criterion) {
3140
let input_array = StringArray::from_iter_values(json_repeated_struct(8000));
3241
let array_ref: ArrayRef = Arc::new(input_array);
@@ -93,7 +102,7 @@ pub fn variant_get_bench(c: &mut Criterion) {
93102
};
94103

95104
c.bench_function("variant_get_primitive", |b| {
96-
b.iter(|| variant_get(&input.clone(), options.clone()))
105+
b.iter(|| variant_get(&input.clone(), options.clone(), unshredded_schema_fields()))
97106
});
98107
}
99108

@@ -108,7 +117,7 @@ criterion_main!(benches);
108117
fn create_primitive_variant_array(size: usize) -> VariantArray {
109118
let mut rng = StdRng::seed_from_u64(42);
110119

111-
let mut variant_builder = VariantArrayBuilder::new(1);
120+
let mut variant_builder = VariantArrayBuilder::try_new(1, unshredded_schema_fields()).unwrap();
112121

113122
for _ in 0..size {
114123
let mut builder = VariantBuilder::new();

parquet-variant-compute/src/from_json.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ mod test {
7878
let variant_array = batch_json_string_to_variant(&array_ref).unwrap();
7979

8080
let metadata_array = variant_array.metadata_field().as_binary_view();
81-
let value_array = variant_array.value_field().as_binary_view();
81+
let value_array = variant_array.value_field().unwrap().as_binary_view();
8282

8383
// Compare row 0
8484
assert!(!variant_array.is_null(0));

parquet-variant-compute/src/shredding.rs

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ pub const METADATA: &str = "metadata";
2222
pub const VALUE: &str = "value";
2323
pub const TYPED_VALUE: &str = "typed_value";
2424

25-
#[derive(Debug, PartialEq)]
25+
#[derive(Debug, PartialEq, Clone)]
2626
pub enum ValueSchema {
2727
MissingValue,
2828
Value(usize),
@@ -33,7 +33,7 @@ pub enum ValueSchema {
3333
},
3434
}
3535

36-
#[derive(Debug)]
36+
#[derive(Debug, Clone)]
3737
pub struct VariantSchema {
3838
inner: Fields,
3939

@@ -54,7 +54,7 @@ impl VariantSchema {
5454
let (metadata_idx, metadata_field) = fields
5555
.iter()
5656
.enumerate()
57-
.find(|(i, f)| f.name() == METADATA)
57+
.find(|(_, f)| f.name() == METADATA)
5858
.ok_or_else(|| {
5959
ArrowError::InvalidArgumentError(
6060
"Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(),
@@ -165,33 +165,50 @@ impl VariantSchema {
165165
})
166166
}
167167

168-
pub fn inner(self) -> Fields {
168+
pub fn inner(&self) -> &Fields {
169+
&self.inner
170+
}
171+
172+
pub fn into_inner(self) -> Fields {
169173
self.inner
170174
}
171175

176+
pub fn metadata_idx(&self) -> usize {
177+
self.metadata_idx
178+
}
179+
172180
pub fn metadata(&self) -> &FieldRef {
173181
self.inner.get(self.metadata_idx).unwrap()
174182
}
175183

176-
pub fn value(&self) -> Option<&FieldRef> {
184+
pub fn value_idx(&self) -> Option<usize> {
177185
match self.value_schema {
178186
ValueSchema::MissingValue => None,
179187
ValueSchema::ShreddedValue(_) => None,
180-
ValueSchema::Value(value_idx) => self.inner.get(value_idx),
181-
ValueSchema::PartiallyShredded { value_idx, .. } => self.inner.get(value_idx),
188+
ValueSchema::Value(value_idx) => Some(value_idx),
189+
ValueSchema::PartiallyShredded { value_idx, .. } => Some(value_idx),
182190
}
183191
}
184192

185-
pub fn shredded_value(&self) -> Option<&FieldRef> {
193+
pub fn value(&self) -> Option<&FieldRef> {
194+
self.value_idx().map(|i| self.inner.get(i).unwrap())
195+
}
196+
197+
pub fn shredded_value_idx(&self) -> Option<usize> {
186198
match self.value_schema {
187199
ValueSchema::MissingValue => None,
188200
ValueSchema::Value(_) => None,
189-
ValueSchema::ShreddedValue(shredded_idx) => self.inner.get(shredded_idx),
201+
ValueSchema::ShreddedValue(shredded_idx) => Some(shredded_idx),
190202
ValueSchema::PartiallyShredded {
191203
shredded_value_idx, ..
192-
} => self.inner.get(shredded_value_idx),
204+
} => Some(shredded_value_idx),
193205
}
194206
}
207+
208+
pub fn shredded_value(&self) -> Option<&FieldRef> {
209+
self.shredded_value_idx()
210+
.map(|i| self.inner.get(i).unwrap())
211+
}
195212
}
196213

197214
#[cfg(test)]

parquet-variant-compute/src/variant_array.rs

Lines changed: 28 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ use parquet_variant::Variant;
2424
use std::any::Any;
2525
use std::sync::Arc;
2626

27+
use crate::shredding::VariantSchema;
28+
2729
/// An array of Parquet [`Variant`] values
2830
///
2931
/// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying
@@ -60,11 +62,7 @@ pub struct VariantArray {
6062
/// int8.
6163
inner: StructArray,
6264

63-
/// Reference to the metadata column of inner
64-
metadata_ref: ArrayRef,
65-
66-
/// Reference to the value column of inner
67-
value_ref: ArrayRef,
65+
variant_schema: VariantSchema,
6866
}
6967

7068
impl VariantArray {
@@ -94,23 +92,11 @@ impl VariantArray {
9492
));
9593
};
9694

97-
// todo, remove this since we already do it in validate_shredded_schema
98-
let Some(metadata_field) = VariantArray::find_metadata_field(inner) else {
99-
return Err(ArrowError::InvalidArgumentError(
100-
"Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(),
101-
));
102-
};
103-
104-
let Some(value_field) = VariantArray::find_value_field(inner) else {
105-
return Err(ArrowError::InvalidArgumentError(
106-
"Invalid VariantArray: StructArray must contain a 'value' field".to_string(),
107-
));
108-
};
95+
let variant_schema = VariantSchema::try_new(inner.fields().clone())?;
10996

11097
Ok(Self {
11198
inner: inner.clone(),
112-
metadata_ref: metadata_field,
113-
value_ref: value_field,
99+
variant_schema,
114100
})
115101
}
116102

@@ -126,34 +112,41 @@ impl VariantArray {
126112

127113
/// Return the [`Variant`] instance stored at the given row
128114
///
129-
/// Panics if the index is out of bounds.
115+
/// Panics if the index is out of bounds or value array does not exist.
130116
///
131117
/// Note: Does not do deep validation of the [`Variant`], so it is up to the
132118
/// caller to ensure that the metadata and value were constructed correctly.
119+
///
120+
/// Todo: reconstruct partially shredded or shredded variants
133121
pub fn value(&self, index: usize) -> Variant {
134122
let metadata = self.metadata_field().as_binary_view().value(index);
135-
let value = self.value_field().as_binary_view().value(index);
123+
let value = self
124+
.value_field()
125+
.expect("value field does not exist")
126+
.as_binary_view()
127+
.value(index);
136128
Variant::new(metadata, value)
137129
}
138130

139-
fn find_metadata_field(array: &StructArray) -> Option<ArrayRef> {
140-
array.column_by_name("metadata").cloned()
141-
}
142-
143-
fn find_value_field(array: &StructArray) -> Option<ArrayRef> {
144-
array.column_by_name("value").cloned()
145-
}
146-
147131
/// Return a reference to the metadata field of the [`StructArray`]
148132
pub fn metadata_field(&self) -> &ArrayRef {
149-
// spec says fields order is not guaranteed, so we search by name
150-
&self.metadata_ref
133+
let metadata_idx = self.variant_schema.metadata_idx();
134+
135+
self.inner.column(metadata_idx)
151136
}
152137

153138
/// Return a reference to the value field of the `StructArray`
154-
pub fn value_field(&self) -> &ArrayRef {
155-
// spec says fields order is not guaranteed, so we search by name
156-
&self.value_ref
139+
pub fn value_field(&self) -> Option<&ArrayRef> {
140+
self.variant_schema
141+
.value_idx()
142+
.map(|i| self.inner.column(i))
143+
}
144+
145+
/// Return a reference to the shredded value field of the `StructArray`
146+
pub fn shredded_value_field(&self) -> Option<&ArrayRef> {
147+
self.variant_schema
148+
.shredded_value_idx()
149+
.map(|i| self.inner.column(i))
157150
}
158151
}
159152

@@ -176,12 +169,9 @@ impl Array for VariantArray {
176169

177170
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
178171
let slice = self.inner.slice(offset, length);
179-
let met = self.metadata_ref.slice(offset, length);
180-
let val = self.value_ref.slice(offset, length);
181172
Arc::new(Self {
182173
inner: slice,
183-
metadata_ref: met,
184-
value_ref: val,
174+
variant_schema: self.variant_schema.clone(),
185175
})
186176
}
187177

parquet-variant-compute/src/variant_array_builder.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
2020
use crate::{shredding::VariantSchema, VariantArray};
2121
use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuilder, StructArray};
22-
use arrow_schema::{ArrowError, DataType, Field, Fields};
22+
use arrow_schema::{ArrowError, Fields};
2323
use parquet_variant::{Variant, VariantBuilder};
2424
use std::sync::Arc;
2525

@@ -191,6 +191,7 @@ fn binary_view_array_from_buffers(
191191
mod test {
192192
use super::*;
193193
use arrow::array::Array;
194+
use arrow_schema::{DataType, Field};
194195

195196
/// Test that both the metadata and value buffers are non nullable
196197
#[test]
@@ -212,7 +213,7 @@ mod test {
212213

213214
// the metadata and value fields of non shredded variants should not be null
214215
assert!(variant_array.metadata_field().nulls().is_none());
215-
assert!(variant_array.value_field().nulls().is_none());
216+
assert!(variant_array.value_field().unwrap().nulls().is_none());
216217
let DataType::Struct(fields) = variant_array.data_type() else {
217218
panic!("Expected VariantArray to have Struct data type");
218219
};

parquet-variant-compute/src/variant_get.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ pub fn variant_get(
4949
)));
5050
}
5151

52-
let mut builder = VariantArrayBuilder::try_new(variant_array.len(), shredded_schema).unwrap();
52+
let mut builder = VariantArrayBuilder::try_new(variant_array.len(), shredded_schema)?;
5353
for i in 0..variant_array.len() {
5454
let new_variant = variant_array.value(i);
5555
// TODO: perf?

0 commit comments

Comments
 (0)