Skip to content

Commit f39461c

Browse files
[Variant] Revisit VariantMetadata and Object equality (#7961)
# Rationale for this change If a variant has an unsorted dictionary, you can't assume fields are unique nor ordered by name. This PR updates the logical equality check among `VariantMetadata` to properly handle this case. - Closes #7952 It also fixes a bug in #7934 where we do a uniqueness check when probing an unsorted dictionary --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent dff67c9 commit f39461c

File tree

2 files changed

+113
-51
lines changed

2 files changed

+113
-51
lines changed

parquet-variant/src/variant/metadata.rs

Lines changed: 57 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use std::collections::HashSet;
19-
2018
use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
2119
use crate::utils::{first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice};
2220

@@ -127,7 +125,7 @@ impl VariantMetadataHeader {
127125
///
128126
/// [`Variant`]: crate::Variant
129127
/// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
130-
#[derive(Debug, Clone)]
128+
#[derive(Debug, Clone, PartialEq)]
131129
pub struct VariantMetadata<'m> {
132130
pub(crate) bytes: &'m [u8],
133131
header: VariantMetadataHeader,
@@ -335,30 +333,6 @@ impl<'m> VariantMetadata<'m> {
335333
}
336334
}
337335

338-
// According to the spec, metadata dictionaries are not required to be in a specific order,
339-
// to enable flexibility when constructing Variant values
340-
//
341-
// Instead of comparing the raw bytes of 2 variant metadata instances, this implementation
342-
// checks whether the dictionary entries are equal -- regardless of their sorting order
343-
impl<'m> PartialEq for VariantMetadata<'m> {
344-
fn eq(&self, other: &Self) -> bool {
345-
let is_equal = self.is_empty() == other.is_empty()
346-
&& self.is_fully_validated() == other.is_fully_validated()
347-
&& self.first_value_byte == other.first_value_byte
348-
&& self.validated == other.validated;
349-
350-
let other_field_names: HashSet<&'m str> = HashSet::from_iter(other.iter());
351-
352-
for field_name in self.iter() {
353-
if !other_field_names.contains(field_name) {
354-
return false;
355-
}
356-
}
357-
358-
is_equal
359-
}
360-
}
361-
362336
/// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing
363337
/// [unvalidated] input could also panic if the underlying bytes are invalid.
364338
///
@@ -374,6 +348,8 @@ impl std::ops::Index<usize> for VariantMetadata<'_> {
374348
#[cfg(test)]
375349
mod tests {
376350

351+
use crate::VariantBuilder;
352+
377353
use super::*;
378354

379355
/// `"cat"`, `"dog"` – valid metadata
@@ -558,4 +534,58 @@ mod tests {
558534
"unexpected error: {err:?}"
559535
);
560536
}
537+
538+
#[test]
539+
fn test_compare_sorted_dictionary_with_unsorted_dictionary() {
540+
// create a sorted object
541+
let mut b = VariantBuilder::new();
542+
let mut o = b.new_object();
543+
544+
o.insert("a", false);
545+
o.insert("b", false);
546+
547+
o.finish().unwrap();
548+
549+
let (m, _) = b.finish();
550+
551+
let m1 = VariantMetadata::new(&m);
552+
assert!(m1.is_sorted());
553+
554+
// Create metadata with an unsorted dictionary (field names are "a", "a", "b")
555+
// Since field names are not unique, it is considered not sorted.
556+
let metadata_bytes = vec![
557+
0b0000_0001,
558+
3, // dictionary size
559+
0, // "a"
560+
1, // "a"
561+
2, // "b"
562+
3,
563+
b'a',
564+
b'a',
565+
b'b',
566+
];
567+
let m2 = VariantMetadata::try_new(&metadata_bytes).unwrap();
568+
assert!(!m2.is_sorted());
569+
570+
assert_ne!(m1, m2);
571+
}
572+
573+
#[test]
574+
fn test_compare_sorted_dictionary_with_sorted_dictionary() {
575+
// create a sorted object
576+
let mut b = VariantBuilder::new();
577+
let mut o = b.new_object();
578+
579+
o.insert("a", false);
580+
o.insert("b", false);
581+
582+
o.finish().unwrap();
583+
584+
let (m, _) = b.finish();
585+
586+
let m1 = VariantMetadata::new(&m);
587+
let m2 = VariantMetadata::new(&m);
588+
589+
assert_eq!(m1, m2);
590+
}
561591
}

parquet-variant/src/variant/object.rs

Lines changed: 56 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ use crate::utils::{
2020
first_byte_from_slice, overflow_error, slice_from_slice, try_binary_search_range_by,
2121
};
2222
use crate::variant::{Variant, VariantMetadata};
23-
use std::collections::HashMap;
2423

2524
use arrow_schema::ArrowError;
2625

@@ -221,6 +220,7 @@ impl<'m, 'v> VariantObject<'m, 'v> {
221220

222221
let mut field_ids_iter =
223222
map_bytes_to_offsets(field_id_buffer, self.header.field_id_size);
223+
224224
// Validate all field ids exist in the metadata dictionary and the corresponding field names are lexicographically sorted
225225
if self.metadata.is_sorted() {
226226
// Since the metadata dictionary has unique and sorted field names, we can also guarantee this object's field names
@@ -263,7 +263,7 @@ impl<'m, 'v> VariantObject<'m, 'v> {
263263
let next_field_name = self.metadata.get(field_id)?;
264264

265265
if let Some(current_name) = current_field_name {
266-
if next_field_name <= current_name {
266+
if next_field_name < current_name {
267267
return Err(ArrowError::InvalidArgumentError(
268268
"field names not sorted".to_string(),
269269
));
@@ -412,26 +412,20 @@ impl<'m, 'v> VariantObject<'m, 'v> {
412412
// checks whether the field values are equal -- regardless of their order
413413
impl<'m, 'v> PartialEq for VariantObject<'m, 'v> {
414414
fn eq(&self, other: &Self) -> bool {
415-
let mut is_equal = self.metadata == other.metadata
416-
&& self.header == other.header
417-
&& self.num_elements == other.num_elements
418-
&& self.first_field_offset_byte == other.first_field_offset_byte
419-
&& self.first_value_byte == other.first_value_byte
420-
&& self.validated == other.validated;
421-
422-
// value validation
423-
let other_fields: HashMap<&str, Variant> = HashMap::from_iter(other.iter());
424-
425-
for (field_name, variant) in self.iter() {
426-
match other_fields.get(field_name as &str) {
427-
Some(other_variant) => {
428-
is_equal = is_equal && variant == *other_variant;
429-
}
430-
None => return false,
415+
if self.num_elements != other.num_elements {
416+
return false;
417+
}
418+
419+
// IFF two objects are valid and logically equal, they will have the same
420+
// field names in the same order, because the spec requires the object
421+
// fields to be sorted lexicographically.
422+
for ((name_a, value_a), (name_b, value_b)) in self.iter().zip(other.iter()) {
423+
if name_a != name_b || value_a != value_b {
424+
return false;
431425
}
432426
}
433427

434-
is_equal
428+
true
435429
}
436430
}
437431

@@ -938,28 +932,66 @@ mod tests {
938932

939933
o.finish().unwrap();
940934

941-
let (m, v) = b.finish();
935+
let (meta1, value1) = b.finish();
942936

943-
let v1 = Variant::try_new(&m, &v).unwrap();
937+
let v1 = Variant::try_new(&meta1, &value1).unwrap();
944938
// v1 is sorted
945939
assert!(v1.metadata().unwrap().is_sorted());
946940

947941
// create a second object with different insertion order
948-
let mut b = VariantBuilder::new();
942+
let mut b = VariantBuilder::new().with_field_names(["d", "c", "b", "a"].into_iter());
949943
let mut o = b.new_object();
950944

951945
o.insert("b", 4.3);
952946
o.insert("a", ());
953947

954948
o.finish().unwrap();
955949

956-
let (m, v) = b.finish();
950+
let (meta2, value2) = b.finish();
957951

958-
let v2 = Variant::try_new(&m, &v).unwrap();
952+
let v2 = Variant::try_new(&meta2, &value2).unwrap();
959953
// v2 is not sorted
960954
assert!(!v2.metadata().unwrap().is_sorted());
961955

956+
// object metadata are not the same
957+
assert_ne!(v1.metadata(), v2.metadata());
958+
962959
// objects are still logically equal
963960
assert_eq!(v1, v2);
964961
}
962+
963+
#[test]
964+
fn test_compare_object_with_unsorted_dictionary_vs_sorted_dictionary() {
965+
// create a sorted object
966+
let mut b = VariantBuilder::new();
967+
let mut o = b.new_object();
968+
969+
o.insert("a", false);
970+
o.insert("b", false);
971+
972+
o.finish().unwrap();
973+
974+
let (m, v) = b.finish();
975+
976+
let v1 = Variant::try_new(&m, &v).unwrap();
977+
978+
// Create metadata with an unsorted dictionary (field names are "a", "a", "b")
979+
// Since field names are not unique, it is considered not sorted.
980+
let metadata_bytes = vec![
981+
0b0000_0001,
982+
3, // dictionary size
983+
0, // "a"
984+
1, // "b"
985+
2, // "a"
986+
3,
987+
b'a',
988+
b'b',
989+
b'a',
990+
];
991+
let m = VariantMetadata::try_new(&metadata_bytes).unwrap();
992+
assert!(!m.is_sorted());
993+
994+
let v2 = Variant::new_with_metadata(m, &v);
995+
assert_eq!(v1, v2);
996+
}
965997
}

0 commit comments

Comments
 (0)