scovich commented on code in PR #7807:
URL: https://github.com/apache/arrow-rs/pull/7807#discussion_r2176146408


##########
parquet-variant/src/variant/object.rs:
##########
@@ -29,111 +29,196 @@ const NUM_HEADER_BYTES: usize = 1;
 /// Header structure for [`VariantObject`]
 #[derive(Clone, Debug, PartialEq)]
 pub(crate) struct VariantObjectHeader {
-    field_offset_size: OffsetSizeBytes,
+    num_elements_size: OffsetSizeBytes,
     field_id_size: OffsetSizeBytes,
-    is_large: bool,
+    field_offset_size: OffsetSizeBytes,
 }
 
 impl VariantObjectHeader {
+    // Hide the ugly casting
+    const fn num_elements_size(&self) -> usize {
+        self.num_elements_size as _
+    }
+    const fn field_id_size(&self) -> usize {
+        self.field_id_size as _
+    }
+    const fn field_offset_size(&self) -> usize {
+        self.field_offset_size as _
+    }
+
+    // Avoid materializing this offset, since it's cheaply and safely 
computable
+    const fn field_ids_start_byte(&self) -> usize {
+        NUM_HEADER_BYTES + self.num_elements_size()
+    }
+
     pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
         // Parse the header byte to get object parameters
         let value_header = header_byte >> 2;
         let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits
         let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 
bits
         let is_large = (value_header & 0x10) != 0; // 5th bit
-
+        let num_elements_size = match is_large {
+            true => OffsetSizeBytes::Four,
+            false => OffsetSizeBytes::One,
+        };
         Ok(Self {
-            field_offset_size: 
OffsetSizeBytes::try_new(field_offset_size_minus_one)?,
+            num_elements_size,
             field_id_size: OffsetSizeBytes::try_new(field_id_size_minus_one)?,
-            is_large,
+            field_offset_size: 
OffsetSizeBytes::try_new(field_offset_size_minus_one)?,
         })
     }
 }
 
 /// A [`Variant`] Object (struct with named fields).
+///
+/// See the [Variant spec] file for more information.
+///
+/// # Validation
+///
+/// Every instance of variant object is either _valid_ or _invalid_. depending 
on whether the
+/// underlying bytes are a valid encoding of a variant object subtype (see 
below).
+///
+/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully 
(and recursively)
+/// _validated_. They always contain _valid_ data, and infallible accesses 
such as iteration and
+/// indexing are panic-free. The validation cost is linear in the number of 
underlying bytes.
+///
+/// Instances produced by [`Self::new`] are _unvalidated_ and so they may 
contain either _valid_ or
+/// _invalid_ data. Infallible accesses such as iteration and indexing will 
panic if the underlying
+/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] 
and [`Self::get`] are
+/// provided as panic-free alternatives. [`Self::validate`] can also be used 
to _validate_ an
+/// _unvalidated_ instance, if desired.
+///
+/// _Unvalidated_ instances can be constructed in constant time. They can be 
useful if the caller
+/// knows the underlying bytes were already validated previously, or if the 
caller intends to
+/// perform a small number of (fallible) field accesses against a large object.
+///
+/// A _validated_ instance guarantees that:
+///
+/// - header byte is valid
+/// - num_elements is in bounds
+/// - field id array is in bounds
+/// - field offset array is in bounds
+/// - field value array is in bounds
+/// - all field ids are valid metadata dictionary entries (*)
+/// - field ids are lexically ordered according by their corresponding string 
values (*)
+/// - all field offsets are in bounds (*)
+/// - all field values are (recursively) _valid_ variant values (*)
+/// - the associated variant metadata is [valid] (*)
+///
+/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation 
checks (marked by `(*)`
+/// in the list above); it panics any of the other checks fails.
+///
+/// # Safety
+///
+/// Even an _invalid_ variant object instance is still _safe_ to use in the 
Rust sense. Accessing it
+/// with infallible methods may cause panics but will never lead to undefined 
behavior.
+///
+/// [valid]: VariantMetadata#Validation
+/// [Variant spec]: 
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-object-basic_type2
 #[derive(Clone, Debug, PartialEq)]
 pub struct VariantObject<'m, 'v> {
     pub metadata: VariantMetadata<'m>,
     pub value: &'v [u8],
     header: VariantObjectHeader,
     num_elements: usize,
-    field_ids_start_byte: usize,
-    field_offsets_start_byte: usize,
-    values_start_byte: usize,
+    first_field_offset_byte: usize,
+    first_value_byte: usize,
+    validated: bool,
 }
 
 impl<'m, 'v> VariantObject<'m, 'v> {
-    /// Attempts to interpret `value` as a variant object value.
+    pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self {
+        Self::try_new_impl(metadata, value).expect("Invalid variant object")
+    }
+
+    /// Attempts to interpet `metadata` and `value` as a variant object.
     ///
     /// # Validation
     ///
     /// This constructor verifies that `value` points to a valid variant 
object value. In
     /// particular, that all field ids exist in `metadata`, and all offsets 
are in-bounds and point
     /// to valid objects.
-    // TODO: How to make the validation non-recursive while still making 
iterators safely infallible??
-    // See https://github.com/apache/arrow-rs/issues/7711
     pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> 
Result<Self, ArrowError> {
+        let mut new_self = Self::try_new_impl(metadata, value)?.validate()?;
+        new_self.validated = true;

Review Comment:
   Fixed



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to