Skip to content

Commit d0fa24e

Browse files
[Variant] Impl PartialEq for VariantObject (apache#7943)
# Rationale for this change - Closes apache#7948. This PR introduces a custom implementation of `PartialEq` for variant objects. According to the spec, field values are not required to be in the same order as the field IDs, which allows flexibility when constructing Variant values. Instead of comparing the raw bytes of two variant objects, this implementation recursively checks whether the field values are equal, regardless of their order.
1 parent dfe907f commit d0fa24e

File tree

3 files changed

+325
-34
lines changed

3 files changed

+325
-34
lines changed

parquet-variant/src/builder.rs

Lines changed: 79 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,12 @@
1616
// under the License.
1717
use crate::decoder::{VariantBasicType, VariantPrimitiveType};
1818
use crate::{
19-
ShortString, Variant, VariantDecimal16, VariantDecimal4, VariantDecimal8, VariantMetadata,
19+
ShortString, Variant, VariantDecimal16, VariantDecimal4, VariantDecimal8, VariantList,
20+
VariantMetadata, VariantObject,
2021
};
2122
use arrow_schema::ArrowError;
2223
use indexmap::{IndexMap, IndexSet};
23-
use std::collections::{HashMap, HashSet};
24+
use std::collections::HashSet;
2425

2526
const BASIC_TYPE_BITS: u8 = 2;
2627
const UNIX_EPOCH_DATE: chrono::NaiveDate = chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();
@@ -216,6 +217,57 @@ impl ValueBuffer {
216217
self.append_slice(value.as_bytes());
217218
}
218219

220+
/// Copies every field of `obj` into a freshly started object builder that
/// writes into this buffer, then finishes the builder.
///
/// # Panics
///
/// Panics if finishing the object builder returns an error (the result of
/// `finish()` is unwrapped). Use `try_append_object` to get the error back
/// instead.
fn append_object(&mut self, metadata_builder: &mut MetadataBuilder, obj: VariantObject) {
    let mut builder = self.new_object(metadata_builder);

    // Re-insert each (name, value) pair in the order the source object yields them.
    obj.iter().for_each(|(name, field_value)| {
        builder.insert(name, field_value);
    });

    builder.finish().unwrap();
}
229+
230+
fn try_append_object(
231+
&mut self,
232+
metadata_builder: &mut MetadataBuilder,
233+
obj: VariantObject,
234+
) -> Result<(), ArrowError> {
235+
let mut object_builder = self.new_object(metadata_builder);
236+
237+
for res in obj.iter_try() {
238+
let (field_name, value) = res?;
239+
object_builder.try_insert(field_name, value)?;
240+
}
241+
242+
object_builder.finish()?;
243+
244+
Ok(())
245+
}
246+
247+
/// Copies every element of `list` into a freshly started list builder that
/// writes into this buffer, then finishes the builder.
fn append_list(&mut self, metadata_builder: &mut MetadataBuilder, list: VariantList) {
    let mut builder = self.new_list(metadata_builder);

    // Elements are appended in the order the source list yields them.
    list.iter().for_each(|element| {
        builder.append_value(element);
    });

    builder.finish();
}
254+
255+
fn try_append_list(
256+
&mut self,
257+
metadata_builder: &mut MetadataBuilder,
258+
list: VariantList,
259+
) -> Result<(), ArrowError> {
260+
let mut list_builder = self.new_list(metadata_builder);
261+
for res in list.iter_try() {
262+
let value = res?;
263+
list_builder.try_append_value(value)?;
264+
}
265+
266+
list_builder.finish();
267+
268+
Ok(())
269+
}
270+
219271
/// Returns the current length of the wrapped buffer (`self.0`).
// NOTE(review): presumably used as the write position for the next appended
// value -- confirm against callers, which are not visible in this hunk.
fn offset(&self) -> usize {
220272
self.0.len()
221273
}
@@ -252,9 +304,31 @@ impl ValueBuffer {
252304
variant: Variant<'m, 'd>,
253305
metadata_builder: &mut MetadataBuilder,
254306
) {
255-
self.try_append_variant(variant, metadata_builder).unwrap();
307+
match variant {
308+
Variant::Null => self.append_null(),
309+
Variant::BooleanTrue => self.append_bool(true),
310+
Variant::BooleanFalse => self.append_bool(false),
311+
Variant::Int8(v) => self.append_int8(v),
312+
Variant::Int16(v) => self.append_int16(v),
313+
Variant::Int32(v) => self.append_int32(v),
314+
Variant::Int64(v) => self.append_int64(v),
315+
Variant::Date(v) => self.append_date(v),
316+
Variant::TimestampMicros(v) => self.append_timestamp_micros(v),
317+
Variant::TimestampNtzMicros(v) => self.append_timestamp_ntz_micros(v),
318+
Variant::Decimal4(decimal4) => self.append_decimal4(decimal4),
319+
Variant::Decimal8(decimal8) => self.append_decimal8(decimal8),
320+
Variant::Decimal16(decimal16) => self.append_decimal16(decimal16),
321+
Variant::Float(v) => self.append_float(v),
322+
Variant::Double(v) => self.append_double(v),
323+
Variant::Binary(v) => self.append_binary(v),
324+
Variant::String(s) => self.append_string(s),
325+
Variant::ShortString(s) => self.append_short_string(s),
326+
Variant::Object(obj) => self.append_object(metadata_builder, obj),
327+
Variant::List(list) => self.append_list(metadata_builder, list),
328+
}
256329
}
257330

331+
/// Appends a variant to the buffer
258332
fn try_append_variant<'m, 'd>(
259333
&mut self,
260334
variant: Variant<'m, 'd>,
@@ -279,35 +353,8 @@ impl ValueBuffer {
279353
Variant::Binary(v) => self.append_binary(v),
280354
Variant::String(s) => self.append_string(s),
281355
Variant::ShortString(s) => self.append_short_string(s),
282-
Variant::Object(obj) => {
283-
let metadata_field_names = metadata_builder
284-
.field_names
285-
.iter()
286-
.enumerate()
287-
.map(|(i, f)| (f.clone(), i))
288-
.collect::<HashMap<_, _>>();
289-
290-
let mut object_builder = self.new_object(metadata_builder);
291-
292-
// first add all object fields that exist in metadata builder
293-
let mut object_fields = obj.iter().collect::<Vec<_>>();
294-
295-
object_fields
296-
.sort_by_key(|(field_name, _)| metadata_field_names.get(field_name as &str));
297-
298-
for (field_name, value) in object_fields {
299-
object_builder.insert(field_name, value);
300-
}
301-
302-
object_builder.finish()?;
303-
}
304-
Variant::List(list) => {
305-
let mut list_builder = self.new_list(metadata_builder);
306-
for value in list.iter() {
307-
list_builder.append_value(value);
308-
}
309-
list_builder.finish();
310-
}
356+
Variant::Object(obj) => self.try_append_object(metadata_builder, obj)?,
357+
Variant::List(list) => self.try_append_list(metadata_builder, list)?,
311358
}
312359

313360
Ok(())

parquet-variant/src/variant/metadata.rs

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
use std::collections::HashSet;
19+
1820
use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
1921
use crate::utils::{first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice};
2022

@@ -125,7 +127,7 @@ impl VariantMetadataHeader {
125127
///
126128
/// [`Variant`]: crate::Variant
127129
/// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
128-
#[derive(Debug, Clone, PartialEq)]
130+
#[derive(Debug, Clone)]
129131
pub struct VariantMetadata<'m> {
130132
pub(crate) bytes: &'m [u8],
131133
header: VariantMetadataHeader,
@@ -346,6 +348,30 @@ impl<'m> VariantMetadata<'m> {
346348
}
347349
}
348350

351+
// According to the spec, metadata dictionaries are not required to be in a specific order,
352+
// to enable flexibility when constructing Variant values
353+
//
354+
// Instead of comparing the raw bytes of 2 variant metadata instances, this implementation
355+
// checks whether the dictionary entries are equal -- regardless of their sorting order
356+
impl<'m> PartialEq for VariantMetadata<'m> {
357+
fn eq(&self, other: &Self) -> bool {
358+
let is_equal = self.is_empty() == other.is_empty()
359+
&& self.is_fully_validated() == other.is_fully_validated()
360+
&& self.first_value_byte == other.first_value_byte
361+
&& self.validated == other.validated;
362+
363+
let other_field_names: HashSet<&'m str> = HashSet::from_iter(other.iter());
364+
365+
for field_name in self.iter() {
366+
if !other_field_names.contains(field_name) {
367+
return false;
368+
}
369+
}
370+
371+
is_equal
372+
}
373+
}
374+
349375
/// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing
350376
/// [unvalidated] input could also panic if the underlying bytes are invalid.
351377
///
@@ -360,6 +386,7 @@ impl std::ops::Index<usize> for VariantMetadata<'_> {
360386

361387
#[cfg(test)]
362388
mod tests {
389+
363390
use super::*;
364391

365392
/// `"cat"`, `"dog"` – valid metadata

0 commit comments

Comments
 (0)