alamb commented on code in PR #7943:
URL: https://github.com/apache/arrow-rs/pull/7943#discussion_r2213275443


##########
parquet-variant/src/variant/metadata.rs:
##########
@@ -332,6 +334,30 @@ impl<'m> VariantMetadata<'m> {
     }
 }
 
+// According to the spec, metadata dictionaries are not required to be in a 
specific order,
+// to enable flexibility when constructing Variant values
+//
+// Instead of comparing the raw bytes of 2 variant metadata instances, this 
implementation
+// checks whether the dictionary entries are equal -- regardless of their 
sorting order
+impl<'m> PartialEq for VariantMetadata<'m> {
+    fn eq(&self, other: &Self) -> bool {
+        let is_equal = self.is_empty() == other.is_empty()
+            && self.is_fully_validated() == other.is_fully_validated()
+            && self.first_value_byte == other.first_value_byte
+            && self.validated == other.validated;

Review Comment:
   You could break early here if is_equal is false and not check the fields



##########
parquet-variant/src/variant/object.rs:
##########
@@ -387,6 +389,38 @@ impl<'m, 'v> VariantObject<'m, 'v> {
     }
 }
 
+// Custom implementation of PartialEq for variant objects
+//
+// According to the spec, field values are not required to be in the same 
order as the field IDs,
+// to enable flexibility when constructing Variant values
+//
+// Instead of comparing the raw bytes of 2 variant objects, this 
implementation recursively
+// checks whether the field values are equal -- regardless of their order
+impl<'m, 'v> PartialEq for VariantObject<'m, 'v> {
+    fn eq(&self, other: &Self) -> bool {
+        let mut is_equal = self.metadata == other.metadata
+            && self.header == other.header
+            && self.num_elements == other.num_elements
+            && self.first_field_offset_byte == other.first_field_offset_byte
+            && self.first_value_byte == other.first_value_byte
+            && self.validated == other.validated;

Review Comment:
   same comment here about validation



##########
parquet-variant/src/builder.rs:
##########
@@ -252,7 +304,28 @@ impl ValueBuffer {
         variant: Variant<'m, 'd>,
         metadata_builder: &mut MetadataBuilder,
     ) {
-        self.try_append_variant(variant, metadata_builder).unwrap();
+        match variant {
+            Variant::Null => self.append_null(),
+            Variant::BooleanTrue => self.append_bool(true),
+            Variant::BooleanFalse => self.append_bool(false),
+            Variant::Int8(v) => self.append_int8(v),
+            Variant::Int16(v) => self.append_int16(v),
+            Variant::Int32(v) => self.append_int32(v),
+            Variant::Int64(v) => self.append_int64(v),
+            Variant::Date(v) => self.append_date(v),
+            Variant::TimestampMicros(v) => self.append_timestamp_micros(v),
+            Variant::TimestampNtzMicros(v) => 
self.append_timestamp_ntz_micros(v),
+            Variant::Decimal4(decimal4) => self.append_decimal4(decimal4),
+            Variant::Decimal8(decimal8) => self.append_decimal8(decimal8),
+            Variant::Decimal16(decimal16) => self.append_decimal16(decimal16),
+            Variant::Float(v) => self.append_float(v),
+            Variant::Double(v) => self.append_double(v),
+            Variant::Binary(v) => self.append_binary(v),
+            Variant::String(s) => self.append_string(s),
+            Variant::ShortString(s) => self.append_short_string(s),
+            Variant::Object(obj) => self.append_object(metadata_builder, obj),
+            Variant::List(list) => self.append_list(metadata_builder, list),
+        }

Review Comment:
   I see -- I think it is the unfortunate consequence of having the distinction 
between the "validated" and "non validated" variant APIs
   
   Maybe we can add a few comments explaining the practical difference 
(performance vs extra error checking) 



##########
parquet-variant/src/variant/metadata.rs:
##########
@@ -332,6 +334,22 @@ impl<'m> VariantMetadata<'m> {
     }
 }
 
+impl<'m> PartialEq for VariantMetadata<'m> {
+    fn eq(&self, other: &Self) -> bool {
+        let mut is_equal = self.is_empty() == other.is_empty()
+            && self.is_fully_validated() == other.is_fully_validated()

Review Comment:
    I would expect the following to pass for all variants and metadata
   
   ```rust
   let variant1 = Variant::new(metadata, buffers);
   let variant2 = Variant::new(metadata, buffers).with_full_validation();
   assert_eq!(variant1, variant2)
   ```



##########
parquet-variant/src/variant/object.rs:
##########
@@ -718,4 +752,187 @@ mod tests {
         test_variant_object_with_large_data(16777216 + 1, 
OffsetSizeBytes::Four);
         // 2^24
     }
+
+    #[test]
+    fn test_objects_with_same_fields_are_equal() {
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("b", ());
+        o.insert("c", ());
+        o.insert("a", ());
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v1 = Variant::try_new(&m, &v).unwrap();
+        let v2 = Variant::try_new(&m, &v).unwrap();
+
+        assert_eq!(v1, v2);
+    }
+
+    #[test]
+    fn test_same_objects_with_different_builder_are_equal() {
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("a", ());
+        o.insert("b", false);
+
+        o.finish().unwrap();
+        let (m, v) = b.finish();
+
+        let v1 = Variant::try_new(&m, &v).unwrap();
+
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("a", ());
+        o.insert("b", false);
+
+        o.finish().unwrap();
+        let (m, v) = b.finish();
+
+        let v2 = Variant::try_new(&m, &v).unwrap();
+
+        assert_eq!(v1, v2);
+    }
+
+    #[test]
+    fn test_objects_with_different_values_are_not_equal() {
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("a", ());
+        o.insert("b", 4.3);
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v1 = Variant::try_new(&m, &v).unwrap();
+
+        // second object, same field name but different values
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("a", ());
+        let mut inner_o = o.new_object("b");
+        inner_o.insert("a", 3.3);
+        inner_o.finish().unwrap();
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v2 = Variant::try_new(&m, &v).unwrap();
+
+        let m1 = v1.metadata().unwrap();
+        let m2 = v2.metadata().unwrap();
+
+        // metadata would be equal since they contain the same keys
+        assert_eq!(m1, m2);
+
+        // but the objects are not equal
+        assert_ne!(v1, v2);
+    }
+
+    #[test]
+    fn test_objects_with_different_field_names_are_not_equal() {
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("a", ());
+        o.insert("b", 4.3);
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v1 = Variant::try_new(&m, &v).unwrap();
+
+        // second object, same field name but different values
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("aardvark", ());
+        o.insert("barracuda", 3.3);
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+        let v2 = Variant::try_new(&m, &v).unwrap();
+
+        assert_ne!(v1, v2);
+    }
+
+    #[test]
+    fn test_objects_with_different_insertion_order_are_equal() {
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("b", false);
+        o.insert("a", ());
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v1 = Variant::try_new(&m, &v).unwrap();
+        assert!(!v1.metadata().unwrap().is_sorted());
+
+        // create another object pre-filled with field names, b and a
+        // but insert the fields in the order of a, b
+        let mut b = VariantBuilder::new().with_field_names(["b", 
"a"].into_iter());

Review Comment:
   👍 



##########
parquet-variant/src/builder.rs:
##########
@@ -216,6 +217,57 @@ impl ValueBuffer {
         self.append_slice(value.as_bytes());
     }
 
+    fn append_object(&mut self, metadata_builder: &mut MetadataBuilder, obj: 
VariantObject) {
+        let mut object_builder = self.new_object(metadata_builder);
+
+        for (field_name, value) in obj.iter() {
+            object_builder.insert(field_name, value);
+        }
+
+        object_builder.finish().unwrap();
+    }
+
+    fn try_append_object(
+        &mut self,
+        metadata_builder: &mut MetadataBuilder,
+        obj: VariantObject,
+    ) -> Result<(), ArrowError> {
+        let mut object_builder = self.new_object(metadata_builder);
+
+        for res in obj.iter_try() {

Review Comment:
   You might be able to consolidate the apis if you checked the `is_validated` 
dynamically
   
   For example, have a single try_append_object and internally try
   
   ```rust
   /// if source variant is already validated, use faster APIs
   if obj.is_validated() {
           for (field_name, value) in obj.iter() {
               object_builder.insert(field_name, value);
           }
   } else {
           for res in obj.iter_try() {
                let (field_name, value) = res?;
               object_builder.try_insert(field_name, value)?;
           }
   }
   ```



##########
parquet-variant/src/variant/metadata.rs:
##########
@@ -332,6 +334,30 @@ impl<'m> VariantMetadata<'m> {
     }
 }
 
+// According to the spec, metadata dictionaries are not required to be in a 
specific order,

Review Comment:
   Do we need a special is_equal check for VariantMetadata? It seems like now 
since VariantObject doesn't include a check for the metadata being equal we 
could avoid a special equality 🤔 
   
   



##########
parquet-variant/src/builder.rs:
##########
@@ -252,7 +304,28 @@ impl ValueBuffer {
         variant: Variant<'m, 'd>,
         metadata_builder: &mut MetadataBuilder,
     ) {
-        self.try_append_variant(variant, metadata_builder).unwrap();
+        match variant {
+            Variant::Null => self.append_null(),
+            Variant::BooleanTrue => self.append_bool(true),
+            Variant::BooleanFalse => self.append_bool(false),
+            Variant::Int8(v) => self.append_int8(v),
+            Variant::Int16(v) => self.append_int16(v),
+            Variant::Int32(v) => self.append_int32(v),
+            Variant::Int64(v) => self.append_int64(v),
+            Variant::Date(v) => self.append_date(v),
+            Variant::TimestampMicros(v) => self.append_timestamp_micros(v),
+            Variant::TimestampNtzMicros(v) => 
self.append_timestamp_ntz_micros(v),
+            Variant::Decimal4(decimal4) => self.append_decimal4(decimal4),
+            Variant::Decimal8(decimal8) => self.append_decimal8(decimal8),
+            Variant::Decimal16(decimal16) => self.append_decimal16(decimal16),
+            Variant::Float(v) => self.append_float(v),
+            Variant::Double(v) => self.append_double(v),
+            Variant::Binary(v) => self.append_binary(v),
+            Variant::String(s) => self.append_string(s),
+            Variant::ShortString(s) => self.append_short_string(s),
+            Variant::Object(obj) => self.append_object(metadata_builder, obj),
+            Variant::List(list) => self.append_list(metadata_builder, list),
+        }

Review Comment:
   I also left a suggestion above about how to potentially unify the code paths



##########
parquet-variant/src/variant/object.rs:
##########
@@ -718,4 +752,187 @@ mod tests {
         test_variant_object_with_large_data(16777216 + 1, 
OffsetSizeBytes::Four);
         // 2^24
     }
+
+    #[test]
+    fn test_objects_with_same_fields_are_equal() {
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("b", ());
+        o.insert("c", ());
+        o.insert("a", ());
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v1 = Variant::try_new(&m, &v).unwrap();
+        let v2 = Variant::try_new(&m, &v).unwrap();
+
+        assert_eq!(v1, v2);
+    }
+
+    #[test]
+    fn test_same_objects_with_different_builder_are_equal() {
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("a", ());
+        o.insert("b", false);
+
+        o.finish().unwrap();
+        let (m, v) = b.finish();
+
+        let v1 = Variant::try_new(&m, &v).unwrap();
+
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("a", ());
+        o.insert("b", false);
+
+        o.finish().unwrap();
+        let (m, v) = b.finish();
+
+        let v2 = Variant::try_new(&m, &v).unwrap();
+
+        assert_eq!(v1, v2);
+    }
+
+    #[test]
+    fn test_objects_with_different_values_are_not_equal() {
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("a", ());
+        o.insert("b", 4.3);
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v1 = Variant::try_new(&m, &v).unwrap();
+
+        // second object, same field name but different values
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("a", ());
+        let mut inner_o = o.new_object("b");
+        inner_o.insert("a", 3.3);
+        inner_o.finish().unwrap();
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v2 = Variant::try_new(&m, &v).unwrap();
+
+        let m1 = v1.metadata().unwrap();
+        let m2 = v2.metadata().unwrap();
+
+        // metadata would be equal since they contain the same keys
+        assert_eq!(m1, m2);
+
+        // but the objects are not equal
+        assert_ne!(v1, v2);
+    }
+
+    #[test]
+    fn test_objects_with_different_field_names_are_not_equal() {
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("a", ());
+        o.insert("b", 4.3);
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v1 = Variant::try_new(&m, &v).unwrap();
+
+        // second object, same field name but different values
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("aardvark", ());
+        o.insert("barracuda", 3.3);
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+        let v2 = Variant::try_new(&m, &v).unwrap();
+
+        assert_ne!(v1, v2);
+    }
+
+    #[test]
+    fn test_objects_with_different_insertion_order_are_equal() {
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("b", false);
+        o.insert("a", ());
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v1 = Variant::try_new(&m, &v).unwrap();
+        assert!(!v1.metadata().unwrap().is_sorted());
+
+        // create another object pre-filled with field names, b and a
+        // but insert the fields in the order of a, b
+        let mut b = VariantBuilder::new().with_field_names(["b", 
"a"].into_iter());
+        let mut o = b.new_object();
+
+        o.insert("a", ());
+        o.insert("b", false);
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v2 = Variant::try_new(&m, &v).unwrap();
+
+        // v2 should also have a unsorted dictionary
+        assert!(!v2.metadata().unwrap().is_sorted());
+
+        assert_eq!(v1, v2);
+    }
+
+    #[test]
+    fn test_objects_with_differing_metadata_are_equal() {

Review Comment:
   For a follow on work, I think we should 



##########
parquet-variant/src/variant/object.rs:
##########
@@ -718,4 +752,187 @@ mod tests {
         test_variant_object_with_large_data(16777216 + 1, 
OffsetSizeBytes::Four);
         // 2^24
     }
+
+    #[test]
+    fn test_objects_with_same_fields_are_equal() {
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("b", ());
+        o.insert("c", ());
+        o.insert("a", ());
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v1 = Variant::try_new(&m, &v).unwrap();
+        let v2 = Variant::try_new(&m, &v).unwrap();
+
+        assert_eq!(v1, v2);
+    }
+
+    #[test]
+    fn test_same_objects_with_different_builder_are_equal() {
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("a", ());
+        o.insert("b", false);
+
+        o.finish().unwrap();
+        let (m, v) = b.finish();
+
+        let v1 = Variant::try_new(&m, &v).unwrap();
+
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("a", ());
+        o.insert("b", false);
+
+        o.finish().unwrap();
+        let (m, v) = b.finish();
+
+        let v2 = Variant::try_new(&m, &v).unwrap();
+
+        assert_eq!(v1, v2);
+    }
+
+    #[test]
+    fn test_objects_with_different_values_are_not_equal() {
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("a", ());
+        o.insert("b", 4.3);
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v1 = Variant::try_new(&m, &v).unwrap();
+
+        // second object, same field name but different values
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("a", ());
+        let mut inner_o = o.new_object("b");
+        inner_o.insert("a", 3.3);
+        inner_o.finish().unwrap();
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v2 = Variant::try_new(&m, &v).unwrap();
+
+        let m1 = v1.metadata().unwrap();
+        let m2 = v2.metadata().unwrap();
+
+        // metadata would be equal since they contain the same keys
+        assert_eq!(m1, m2);
+
+        // but the objects are not equal
+        assert_ne!(v1, v2);
+    }
+
+    #[test]
+    fn test_objects_with_different_field_names_are_not_equal() {
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("a", ());
+        o.insert("b", 4.3);
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v1 = Variant::try_new(&m, &v).unwrap();
+
+        // second object, same field name but different values
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("aardvark", ());
+        o.insert("barracuda", 3.3);
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+        let v2 = Variant::try_new(&m, &v).unwrap();
+
+        assert_ne!(v1, v2);
+    }
+
+    #[test]
+    fn test_objects_with_different_insertion_order_are_equal() {
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("b", false);
+        o.insert("a", ());
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v1 = Variant::try_new(&m, &v).unwrap();
+        assert!(!v1.metadata().unwrap().is_sorted());
+
+        // create another object pre-filled with field names, b and a
+        // but insert the fields in the order of a, b
+        let mut b = VariantBuilder::new().with_field_names(["b", 
"a"].into_iter());
+        let mut o = b.new_object();
+
+        o.insert("a", ());
+        o.insert("b", false);
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v2 = Variant::try_new(&m, &v).unwrap();
+
+        // v2 should also have a unsorted dictionary
+        assert!(!v2.metadata().unwrap().is_sorted());
+
+        assert_eq!(v1, v2);
+    }
+
+    #[test]
+    fn test_objects_with_differing_metadata_are_equal() {
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("a", ());
+        o.insert("b", 4.3);
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v1 = Variant::try_new(&m, &v).unwrap();
+        // v1 is sorted
+        assert!(v1.metadata().unwrap().is_sorted());
+
+        // create a second object with different insertion order
+        let mut b = VariantBuilder::new();
+        let mut o = b.new_object();
+
+        o.insert("b", 4.3);
+        o.insert("a", ());
+
+        o.finish().unwrap();
+
+        let (m, v) = b.finish();
+
+        let v2 = Variant::try_new(&m, &v).unwrap();
+        // v2 is not sorted
+        assert!(!v2.metadata().unwrap().is_sorted());
+
+        // objects are still logically equal
+        assert_eq!(v1, v2);
+    }

Review Comment:
   I think we should also add some other tests eventually too -- like lists and 
primitives



##########
parquet-variant/src/variant/metadata.rs:
##########
@@ -332,6 +334,22 @@ impl<'m> VariantMetadata<'m> {
     }
 }
 
+impl<'m> PartialEq for VariantMetadata<'m> {
+    fn eq(&self, other: &Self) -> bool {
+        let mut is_equal = self.is_empty() == other.is_empty()
+            && self.is_fully_validated() == other.is_fully_validated()

Review Comment:
   It seems like the `validated` and `is_fully_validated` flags  doesn't need 
to be part of a logical type check? Like two variants can be equal by value 
even if one is fully validated and one is not 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to