alamb commented on code in PR #7961: URL: https://github.com/apache/arrow-rs/pull/7961#discussion_r2216003575
########## parquet-variant/src/variant/object.rs: ########## @@ -949,4 +949,48 @@ mod tests { // objects are still logically equal assert_eq!(v1, v2); } + + #[test] + fn test_compare_object_with_unsorted_dictionary_vs_sorted_dictionary() { + // create a sorted object + let mut b = VariantBuilder::new(); + let mut o = b.new_object(); + + o.insert("a", false); + o.insert("b", false); + + o.finish().unwrap(); + + let (m, v) = b.finish(); + + let variant_metadata = VariantMetadata::new(&m); + + dbg!(variant_metadata.iter().collect::<Vec<_>>()); + + let v1 = Variant::try_new(&m, &v).unwrap(); + + // Create metadata with an unsorted dictionary (field names are "a", "a", "b") + // Since field names are not unique, it is considered not sorted. + let metadata_bytes = vec![ + 0b0000_0001, + 3, // dictionary size + 0, // "a" + 1, // "b" + 2, // "a" + 3, + b'a', + b'b', + b'a', + ]; + let m = VariantMetadata::try_new(&metadata_bytes).unwrap(); + assert!(!m.is_sorted()); + + dbg!(m.iter().collect::<Vec<_>>()); + let v2 = Variant::new_with_metadata(m, &v); + + dbg!(v1.as_object().unwrap().iter().collect::<Vec<_>>()); Review Comment: do we still need the `dbg!`? ########## parquet-variant/src/variant/metadata.rs: ########## @@ -348,27 +348,20 @@ impl<'m> VariantMetadata<'m> { } } -// According to the spec, metadata dictionaries are not required to be in a specific order, -// to enable flexibility when constructing Variant values -// -// Instead of comparing the raw bytes of 2 variant metadata instances, this implementation -// checks whether the dictionary entries are equal -- regardless of their sorting order +// Metadata dictionaries can be in any order per the spec to allow flexible Variant construction +// This comparison checks for field name presence in both metadata instances rather than raw byte equality impl<'m> PartialEq for VariantMetadata<'m> { fn eq(&self, other: &Self) -> bool { - let is_equal = self.is_empty() == other.is_empty() - && self.is_fully_validated() == other.is_fully_validated() - && self.first_value_byte == other.first_value_byte - && self.validated == other.validated; - - let other_field_names: HashSet<&'m str> = HashSet::from_iter(other.iter()); + self.is_empty() == other.is_empty() Review Comment: This is likely to be quite slow for nested variants as it will continually re-compare the same metadata. It is probably ok for now (and was not made worse by this PR) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org