scovich commented on code in PR #7704:
URL: https://github.com/apache/arrow-rs/pull/7704#discussion_r2155210416


##########
parquet-variant/src/utils.rs:
##########
@@ -46,9 +46,10 @@ pub(crate) fn map_try_from_slice_error(e: TryFromSliceError) 
-> ArrowError {
     ArrowError::InvalidArgumentError(e.to_string())
 }
 
-pub(crate) fn first_byte_from_slice(slice: &[u8]) -> Result<&u8, ArrowError> {
+pub(crate) fn first_byte_from_slice(slice: &[u8]) -> Result<u8, ArrowError> {

Review Comment:
   Small ergonomic improvement



##########
parquet-variant/src/variant.rs:
##########
@@ -116,9 +116,7 @@ impl VariantMetadataHeader {
     /// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
     /// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
     /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
-    pub(crate) fn try_new(bytes: &[u8]) -> Result<Self, ArrowError> {
-        let header = first_byte_from_slice(bytes)?;
-
+    pub(crate) fn try_new(header: u8) -> Result<Self, ArrowError> {

Review Comment:
   Hoist the `first_byte_from_slice` call to the caller, to avoid the kinds of 
misuse that had been creeping in.



##########
parquet-variant/src/variant.rs:
##########
@@ -367,150 +296,129 @@ impl VariantObjectHeader {
                 value.len()
             )));
         }
-        Ok(Self {
-            field_offset_size,
-            field_id_size,
+
+        let s = Self {
+            metadata,
+            value,
+            header,
             num_elements,
             field_ids_start_byte,
             field_offsets_start_byte,
             values_start_byte,
-        })
+        };
+
+        // Verify that `iter` can safely `unwrap` the items produced by this 
iterator.
+        //
+        // This has the side effect of validating the field_id and 
field_offset arrays, and also
+        // proves the field values are all in bounds.
+        validate_fallible_iterator(s.iter_checked())?;
+        Ok(s)
     }
 
     /// Returns the number of key-value pairs in this object
-    pub(crate) fn num_elements(&self) -> usize {
+    pub fn len(&self) -> usize {
         self.num_elements
     }
-}
 
-#[derive(Clone, Debug, PartialEq)]
-pub struct VariantObject<'m, 'v> {
-    pub metadata: VariantMetadata<'m>,
-    pub value: &'v [u8],
-    header: VariantObjectHeader,
-}
+    /// Returns true if the object contains no key-value pairs
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
 
-impl<'m, 'v> VariantObject<'m, 'v> {
-    pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> 
Result<Self, ArrowError> {
-        Ok(Self {
-            metadata,
-            value,
-            header: VariantObjectHeader::try_new(value)?,
-        })
+    /// Get a field's value by index in `0..self.len()`
+    pub fn field(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> {
+        let start_offset = self.header.field_offset_size.unpack_usize(
+            self.value,
+            self.field_offsets_start_byte,
+            i,
+        )?;
+        let value_bytes = slice_from_slice(self.value, self.values_start_byte 
+ start_offset..)?;
+        Variant::try_new_with_metadata(self.metadata, value_bytes)
     }
 
-    /// Returns the number of key-value pairs in this object
-    pub fn len(&self) -> usize {
-        self.header.num_elements()
+    /// Get a field's name by index in `0..self.len()`
+    pub fn field_name(&self, i: usize) -> Result<&'m str, ArrowError> {
+        let field_id =
+            self.header
+                .field_id_size
+                .unpack_usize(self.value, self.field_ids_start_byte, i)?;
+        self.metadata.get(field_id)
     }
 
-    /// Returns true if the object contains no key-value pairs
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
+    /// Returns an iterator of (name, value) pairs over the fields of this 
object.
+    pub fn iter(&self) -> impl Iterator<Item = (&'m str, Variant<'m, 'v>)> + 
'_ {
+        self.iter_checked().map(Result::unwrap)
     }
 
-    pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 
'v>)>, ArrowError> {
-        let field_list = self.parse_field_list()?;
-        Ok(field_list.into_iter())
+    // Fallible iteration over the fields of this object. The constructor 
traverses the iterator to
+    // prove it has no errors, so that all other use sites can blindly 
`unwrap` the result.
+    fn iter_checked(
+        &self,
+    ) -> impl Iterator<Item = Result<(&'m str, Variant<'m, 'v>), ArrowError>> 
+ '_ {
+        (0..self.num_elements).map(move |i| Ok((self.field_name(i)?, 
self.field(i)?)))
     }
 
-    pub fn field(&self, name: &str) -> Result<Option<Variant<'m, 'v>>, 
ArrowError> {
+    /// Returns the value of the field with the specified name, if any.
+    ///
+    /// `Ok(None)` means the field does not exist; `Err` means the search 
encountered an error.
+    pub fn field_by_name(&self, name: &str) -> Result<Option<Variant<'m, 'v>>, 
ArrowError> {
         // Binary search through the field IDs of this object to find the 
requested field name.
         //
         // NOTE: This does not require a sorted metadata dictionary, because 
the variant spec
         // requires object field ids to be lexically sorted by their 
corresponding string values,
         // and probing the dictionary for a field id is always O(1) work.
-        let (field_ids, field_offsets) = self.parse_field_arrays()?;
-        let search_result = try_binary_search_by(&field_ids, &name, 
|&field_id| {
-            self.metadata.get_field_by(field_id)
-        })?;
+        let search_result =
+            try_binary_search_range_by(0..self.num_elements, &name, |i| 
self.field_name(i))?;
 
-        let Ok(index) = search_result else {
-            return Ok(None);
-        };
-        let start_offset = field_offsets[index];
-        let end_offset = field_offsets[index + 1];
-        let value_bytes = slice_from_slice(
-            self.value,
-            self.header.values_start_byte + start_offset
-                ..self.header.values_start_byte + end_offset,
-        )?;
-        let variant = Variant::try_new_with_metadata(self.metadata, 
value_bytes)?;

Review Comment:
   This method was highly redundant, so I rewrote it in terms of `field_name` 
and `field` methods.



##########
parquet-variant/src/utils.rs:
##########
@@ -74,28 +75,37 @@ pub(crate) fn string_from_slice(slice: &[u8], range: 
Range<usize>) -> Result<&st
 /// * `Ok(Ok(index))` - Element found at the given index
 /// * `Ok(Err(index))` - Element not found, but would be inserted at the given 
index
 /// * `Err(e)` - Key extraction failed with error `e`
-pub(crate) fn try_binary_search_by<T, K, E, F>(
-    slice: &[T],
+pub(crate) fn try_binary_search_range_by<K, E, F>(
+    range: Range<usize>,
     target: &K,
     mut key_extractor: F,
 ) -> Result<Result<usize, usize>, E>
 where
     K: Ord,
-    F: FnMut(&T) -> Result<K, E>,
+    F: FnMut(usize) -> Result<K, E>,
 {
-    let mut left = 0;
-    let mut right = slice.len();
+    let Range { mut start, mut end } = range;
 
-    while left < right {
-        let mid = (left + right) / 2;
-        let key = key_extractor(&slice[mid])?;
+    while start < end {
+        let mid = start + (end - start) / 2;

Review Comment:
   The original average calculation was vulnerable to integer overflow



##########
parquet-variant/src/variant.rs:
##########
@@ -223,113 +192,79 @@ impl<'m> VariantMetadata<'m> {
     pub fn dictionary_size(&self) -> usize {
         self.dict_size
     }
+
+    /// The variant protocol version
     pub fn version(&self) -> u8 {
         self.header.version
     }
 
-    /// Helper method to get the offset start and end range for a key by index.
-    fn get_offsets_for_key_by(&self, index: usize) -> Result<Range<usize>, 
ArrowError> {

Review Comment:
   All these extra methods were a lot of complexity without any obvious 
benefit, so I removed them.



##########
parquet-variant/src/variant.rs:
##########
@@ -223,113 +192,79 @@ impl<'m> VariantMetadata<'m> {
     pub fn dictionary_size(&self) -> usize {
         self.dict_size
     }
+
+    /// The variant protocol version
     pub fn version(&self) -> u8 {
         self.header.version
     }
 
-    /// Helper method to get the offset start and end range for a key by index.
-    fn get_offsets_for_key_by(&self, index: usize) -> Result<Range<usize>, 
ArrowError> {
-        if index >= self.dict_size {
-            return Err(ArrowError::InvalidArgumentError(format!(
-                "Index {} out of bounds for dictionary of length {}",
-                index, self.dict_size
-            )));
-        }
-
+    /// Gets an offset array entry by index.
+    fn get_offset(&self, i: usize) -> Result<usize, ArrowError> {
         // Skipping the header byte (setting byte_offset = 1) and the 
dictionary_size (setting offset_index +1)
-        let unpack = |i| self.header.offset_size.unpack_usize(self.bytes, 1, i 
+ 1);
-        Ok(unpack(index)?..unpack(index + 1)?)
+        let bytes = slice_from_slice(self.bytes, 
..self.dictionary_key_start_byte)?;
+        self.header.offset_size.unpack_usize(bytes, 1, i + 1)
     }
 
-    /// Get a single offset by index
-    pub fn get_offset_by(&self, index: usize) -> Result<usize, ArrowError> {
-        if index >= self.dict_size {
-            return Err(ArrowError::InvalidArgumentError(format!(
-                "Index {} out of bounds for dictionary of length {}",
-                index, self.dict_size
-            )));
-        }
-
-        // Skipping the header byte (setting byte_offset = 1) and the 
dictionary_size (setting offset_index +1)
-        let unpack = |i| self.header.offset_size.unpack_usize(self.bytes, 1, i 
+ 1);
-        unpack(index)
-    }
-
-    /// Get the key-name by index
-    pub fn get_field_by(&self, index: usize) -> Result<&'m str, ArrowError> {
-        let offset_range = self.get_offsets_for_key_by(index)?;
-        self.get_field_by_offset(offset_range)
+    /// Gets a dictionary entry by index
+    pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
+        let dictionary_keys_bytes = slice_from_slice(self.bytes, 
self.dictionary_key_start_byte..)?;
+        let byte_range = self.get_offset(i)?..self.get_offset(i + 1)?;
+        string_from_slice(dictionary_keys_bytes, byte_range)
     }
 
-    /// Gets the field using an offset (Range) - helper method to keep 
consistent API.
-    pub(crate) fn get_field_by_offset(&self, offset: Range<usize>) -> 
Result<&'m str, ArrowError> {
-        let dictionary_keys_bytes =
-            slice_from_slice(self.bytes, 
self.dictionary_key_start_byte..self.bytes.len())?;
-        let result = string_from_slice(dictionary_keys_bytes, offset)?;
-
-        Ok(result)
+    /// Get all key-names as an Iterator of strings
+    pub fn iter(&self) -> impl Iterator<Item = &'m str> + '_ {
+        self.iter_checked().map(Result::unwrap)
     }
 
-    #[allow(unused)]
-    pub(crate) fn header(&self) -> VariantMetadataHeader {
-        self.header
+    // Fallible iteration over the fields of this dictionary. The constructor 
traverses the iterator
+    // to prove it has no errors, so that all other use sites can blindly 
`unwrap` the result.
+    fn iter_checked(&self) -> impl Iterator<Item = Result<&'m str, 
ArrowError>> + '_ {
+        (0..self.dict_size).map(move |i| self.get(i))
     }
+}
 
-    /// Get the offsets as an iterator
-    pub fn offsets(&self) -> impl Iterator<Item = Result<Range<usize>, 
ArrowError>> + 'm {
-        let offset_size = self.header.offset_size; // `Copy`
-        let bytes = self.bytes;
+#[derive(Clone, Debug, PartialEq)]
+pub(crate) struct VariantObjectHeader {
+    field_offset_size: OffsetSizeBytes,
+    field_id_size: OffsetSizeBytes,
+    is_large: bool,
+}
 
-        (0..self.dict_size).map(move |i| {
-            // This wont be out of bounds as long as dict_size and offsets 
have been validated
-            // during construction via `try_new`, as it calls unpack_usize for 
the
-            // indices `1..dict_size+1` already.
-            let start = offset_size.unpack_usize(bytes, 1, i + 1);
-            let end = offset_size.unpack_usize(bytes, 1, i + 2);
+impl VariantObjectHeader {
+    pub(crate) fn try_new(header: u8) -> Result<Self, ArrowError> {
+        // Parse the header byte to get object parameters
+        let value_header = header >> 2;
+        let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits
+        let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 
bits
+        let is_large = (value_header & 0x10) != 0; // 5th bit
 
-            match (start, end) {
-                (Ok(s), Ok(e)) => Ok(s..e),
-                (Err(e), _) | (_, Err(e)) => Err(e),
-            }
+        Ok(Self {
+            field_offset_size: 
OffsetSizeBytes::try_new(field_offset_size_minus_one)?,
+            field_id_size: OffsetSizeBytes::try_new(field_id_size_minus_one)?,
+            is_large,
         })
     }
-
-    /// Get all key-names as an Iterator of strings
-    pub fn fields(
-        &'m self,
-    ) -> Result<impl Iterator<Item = Result<&'m str, ArrowError>>, ArrowError> 
{
-        let iterator = self
-            .offsets()
-            .map(move |offset_range| self.get_field_by_offset(offset_range?));
-        Ok(iterator)
-    }
 }
 
 #[derive(Clone, Debug, PartialEq)]
-pub(crate) struct VariantObjectHeader {
-    field_offset_size: OffsetSizeBytes,
-    field_id_size: OffsetSizeBytes,
+pub struct VariantObject<'m, 'v> {
+    pub metadata: VariantMetadata<'m>,
+    pub value: &'v [u8],
+    header: VariantObjectHeader,
     num_elements: usize,
     field_ids_start_byte: usize,
     field_offsets_start_byte: usize,
     values_start_byte: usize,

Review Comment:
   These fields moved from `VariantObjectHeader` to `VariantObject`, since 
they're not actually part of the header. The diff is just weird.



##########
parquet-variant/src/utils.rs:
##########
@@ -74,28 +75,37 @@ pub(crate) fn string_from_slice(slice: &[u8], range: 
Range<usize>) -> Result<&st
 /// * `Ok(Ok(index))` - Element found at the given index
 /// * `Ok(Err(index))` - Element not found, but would be inserted at the given 
index
 /// * `Err(e)` - Key extraction failed with error `e`
-pub(crate) fn try_binary_search_by<T, K, E, F>(
-    slice: &[T],
+pub(crate) fn try_binary_search_range_by<K, E, F>(
+    range: Range<usize>,

Review Comment:
   Turns out we don't _have_ a slice we can search over. 
   
   Ranges are anyway more flexible, because the key extraction can always index 
into a slice if desired.



##########
parquet-variant/src/variant.rs:
##########
@@ -154,64 +152,35 @@ impl<'m> VariantMetadata<'m> {
     }
 
     pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
-        let header = VariantMetadataHeader::try_new(bytes)?;
+        let header = first_byte_from_slice(bytes)?;
+        let header = VariantMetadataHeader::try_new(header)?;
+
         // Offset 1, index 0 because first element after header is dictionary 
size
         let dict_size = header.offset_size.unpack_usize(bytes, 1, 0)?;
 
-        // Check that we have the correct metadata length according to 
dictionary_size, or return
-        // error early.
-        // Minimum number of bytes the metadata buffer must contain:
-        // 1 byte header
-        // + offset_size-byte `dictionary_size` field
-        // + (dict_size + 1) offset entries, each `offset_size` bytes. (Table 
size, essentially)
-        // 1 + offset_size + (dict_size + 1) * offset_size
+        // Calculate the starting offset of the dictionary string bytes.
+        //
+        // Value header, dict_size (offset_size bytes), and dict_size+1 offsets
+        // = 1 + offset_size + (dict_size + 1) * offset_size
         // = (dict_size + 2) * offset_size + 1
-        let offset_size = header.offset_size as usize; // Cheap to copy
-
         let dictionary_key_start_byte = dict_size
             .checked_add(2)
-            .and_then(|n| n.checked_mul(offset_size))
+            .and_then(|n| n.checked_mul(header.offset_size as usize))
             .and_then(|n| n.checked_add(1))
             .ok_or_else(|| ArrowError::InvalidArgumentError("metadata length 
overflow".into()))?;
-
-        if bytes.len() < dictionary_key_start_byte {
-            return Err(ArrowError::InvalidArgumentError(
-                "Metadata shorter than dictionary_size implies".to_string(),
-            ));
-        }
-
-        // Check that all offsets are monotonically increasing

Review Comment:
   The iterator covers this check now



##########
parquet-variant/src/variant.rs:
##########
@@ -340,25 +275,19 @@ impl VariantObjectHeader {
 
         // Calculate byte offsets for different sections
         let field_ids_start_byte = 1 + num_elements_size as usize;
-        let field_offsets_start_byte = field_ids_start_byte + num_elements * 
field_id_size as usize;
+        let field_offsets_start_byte =
+            field_ids_start_byte + num_elements * header.field_id_size as 
usize;
         let values_start_byte =
-            field_offsets_start_byte + (num_elements + 1) * field_offset_size 
as usize;
+            field_offsets_start_byte + (num_elements + 1) * 
header.field_offset_size as usize;
 
-        // Verify that the last field offset array entry is inside the value 
slice
-        let last_field_offset_byte =
-            field_offsets_start_byte + (num_elements + 1) * field_offset_size 
as usize;

Review Comment:
   The iterator test covers this check.



##########
parquet-variant/src/utils.rs:
##########
@@ -74,28 +75,37 @@ pub(crate) fn string_from_slice(slice: &[u8], range: 
Range<usize>) -> Result<&st
 /// * `Ok(Ok(index))` - Element found at the given index
 /// * `Ok(Err(index))` - Element not found, but would be inserted at the given 
index
 /// * `Err(e)` - Key extraction failed with error `e`
-pub(crate) fn try_binary_search_by<T, K, E, F>(
-    slice: &[T],
+pub(crate) fn try_binary_search_range_by<K, E, F>(
+    range: Range<usize>,
     target: &K,
     mut key_extractor: F,
 ) -> Result<Result<usize, usize>, E>
 where
     K: Ord,
-    F: FnMut(&T) -> Result<K, E>,
+    F: FnMut(usize) -> Result<K, E>,
 {
-    let mut left = 0;
-    let mut right = slice.len();
+    let Range { mut start, mut end } = range;
 
-    while left < right {
-        let mid = (left + right) / 2;
-        let key = key_extractor(&slice[mid])?;
+    while start < end {
+        let mid = start + (end - start) / 2;
+        let key = key_extractor(mid)?;
 
         match key.cmp(target) {
             std::cmp::Ordering::Equal => return Ok(Ok(mid)),
-            std::cmp::Ordering::Greater => right = mid,
-            std::cmp::Ordering::Less => left = mid + 1,
+            std::cmp::Ordering::Greater => end = mid,
+            std::cmp::Ordering::Less => start = mid + 1,
         }
     }
 
-    Ok(Err(left))
+    Ok(Err(start))
+}
+
+/// Attempts to prove a fallible iterator is actually infallible in practice, 
by consuming every
+/// element and returning the first error (if any).
+pub(crate) fn validate_fallible_iterator<T, E>(
+    mut it: impl Iterator<Item = Result<T, E>>,
+) -> Result<(), E> {
+    // NOTE: It should really be `let None = ...`, but the compiler can't 
prove that.
+    let _ = it.find(Result::is_err).transpose()?;

Review Comment:
   Even though this is a one-liner, it's arcane enough to be worth wrapping in a 
named+documented helper function.



##########
parquet-variant/src/variant.rs:
##########
@@ -367,150 +296,129 @@ impl VariantObjectHeader {
                 value.len()
             )));
         }
-        Ok(Self {
-            field_offset_size,
-            field_id_size,
+
+        let s = Self {
+            metadata,
+            value,
+            header,
             num_elements,
             field_ids_start_byte,
             field_offsets_start_byte,
             values_start_byte,
-        })
+        };
+
+        // Verify that `iter` can safely `unwrap` the items produced by this 
iterator.
+        //
+        // This has the side effect of validating the field_id and 
field_offset arrays, and also
+        // proves the field values are all in bounds.
+        validate_fallible_iterator(s.iter_checked())?;
+        Ok(s)
     }
 
     /// Returns the number of key-value pairs in this object
-    pub(crate) fn num_elements(&self) -> usize {
+    pub fn len(&self) -> usize {
         self.num_elements
     }
-}
 
-#[derive(Clone, Debug, PartialEq)]
-pub struct VariantObject<'m, 'v> {
-    pub metadata: VariantMetadata<'m>,
-    pub value: &'v [u8],
-    header: VariantObjectHeader,
-}
+    /// Returns true if the object contains no key-value pairs
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
 
-impl<'m, 'v> VariantObject<'m, 'v> {
-    pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> 
Result<Self, ArrowError> {
-        Ok(Self {
-            metadata,
-            value,
-            header: VariantObjectHeader::try_new(value)?,
-        })
+    /// Get a field's value by index in `0..self.len()`
+    pub fn field(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> {
+        let start_offset = self.header.field_offset_size.unpack_usize(
+            self.value,
+            self.field_offsets_start_byte,
+            i,
+        )?;
+        let value_bytes = slice_from_slice(self.value, self.values_start_byte 
+ start_offset..)?;
+        Variant::try_new_with_metadata(self.metadata, value_bytes)
     }
 
-    /// Returns the number of key-value pairs in this object
-    pub fn len(&self) -> usize {
-        self.header.num_elements()
+    /// Get a field's name by index in `0..self.len()`
+    pub fn field_name(&self, i: usize) -> Result<&'m str, ArrowError> {
+        let field_id =
+            self.header
+                .field_id_size
+                .unpack_usize(self.value, self.field_ids_start_byte, i)?;
+        self.metadata.get(field_id)
     }
 
-    /// Returns true if the object contains no key-value pairs
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
+    /// Returns an iterator of (name, value) pairs over the fields of this 
object.
+    pub fn iter(&self) -> impl Iterator<Item = (&'m str, Variant<'m, 'v>)> + 
'_ {
+        self.iter_checked().map(Result::unwrap)
     }
 
-    pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 
'v>)>, ArrowError> {
-        let field_list = self.parse_field_list()?;
-        Ok(field_list.into_iter())
+    // Fallible iteration over the fields of this object. The constructor 
traverses the iterator to
+    // prove it has no errors, so that all other use sites can blindly 
`unwrap` the result.
+    fn iter_checked(
+        &self,
+    ) -> impl Iterator<Item = Result<(&'m str, Variant<'m, 'v>), ArrowError>> 
+ '_ {
+        (0..self.num_elements).map(move |i| Ok((self.field_name(i)?, 
self.field(i)?)))
     }
 
-    pub fn field(&self, name: &str) -> Result<Option<Variant<'m, 'v>>, 
ArrowError> {
+    /// Returns the value of the field with the specified name, if any.
+    ///
+    /// `Ok(None)` means the field does not exist; `Err` means the search 
encountered an error.
+    pub fn field_by_name(&self, name: &str) -> Result<Option<Variant<'m, 'v>>, 
ArrowError> {
         // Binary search through the field IDs of this object to find the 
requested field name.
         //
         // NOTE: This does not require a sorted metadata dictionary, because 
the variant spec
         // requires object field ids to be lexically sorted by their 
corresponding string values,
         // and probing the dictionary for a field id is always O(1) work.
-        let (field_ids, field_offsets) = self.parse_field_arrays()?;
-        let search_result = try_binary_search_by(&field_ids, &name, 
|&field_id| {
-            self.metadata.get_field_by(field_id)
-        })?;
+        let search_result =
+            try_binary_search_range_by(0..self.num_elements, &name, |i| 
self.field_name(i))?;
 
-        let Ok(index) = search_result else {
-            return Ok(None);
-        };
-        let start_offset = field_offsets[index];
-        let end_offset = field_offsets[index + 1];
-        let value_bytes = slice_from_slice(
-            self.value,
-            self.header.values_start_byte + start_offset
-                ..self.header.values_start_byte + end_offset,
-        )?;
-        let variant = Variant::try_new_with_metadata(self.metadata, 
value_bytes)?;
-        Ok(Some(variant))
-    }
-
-    /// Parse field IDs and field offsets arrays using the cached header
-    fn parse_field_arrays(&self) -> Result<(Vec<usize>, Vec<usize>), 
ArrowError> {
-        // Parse field IDs
-        let field_ids = (0..self.header.num_elements)
-            .map(|i| {
-                self.header.field_id_size.unpack_usize(
-                    self.value,
-                    self.header.field_ids_start_byte,
-                    i,
-                )
-            })
-            .collect::<Result<Vec<_>, _>>()?;
-        debug_assert_eq!(field_ids.len(), self.header.num_elements);
-
-        // Parse field offsets (num_elements + 1 entries)
-        let field_offsets = (0..=self.header.num_elements)
-            .map(|i| {
-                self.header.field_offset_size.unpack_usize(
-                    self.value,
-                    self.header.field_offsets_start_byte,
-                    i,
-                )
-            })
-            .collect::<Result<Vec<_>, _>>()?;
-        debug_assert_eq!(field_offsets.len(), self.header.num_elements + 1);
-
-        Ok((field_ids, field_offsets))
-    }
-
-    /// Parse all fields into a vector for iteration
-    fn parse_field_list(&self) -> Result<Vec<(&'m str, Variant<'m, 'v>)>, 
ArrowError> {
-        let (field_ids, field_offsets) = self.parse_field_arrays()?;
-
-        let mut fields = Vec::with_capacity(self.header.num_elements);
-
-        for i in 0..self.header.num_elements {
-            let field_id = field_ids[i];
-            let field_name = self.metadata.get_field_by(field_id)?;
-
-            let start_offset = field_offsets[i];
-            let value_bytes =
-                slice_from_slice(self.value, self.header.values_start_byte + 
start_offset..)?;
-            let variant = Variant::try_new_with_metadata(self.metadata, 
value_bytes)?;
-
-            fields.push((field_name, variant));
-        }
-
-        Ok(fields)
+        search_result.ok().map(|i| self.field(i)).transpose()
     }
 }
 
 #[derive(Clone, Debug, PartialEq)]
 pub(crate) struct VariantListHeader {
     offset_size: OffsetSizeBytes,
     is_large: bool,
-    num_elements: usize,
-    first_offset_byte: usize,
-    first_value_byte: usize,

Review Comment:
   Again, moved out of the header struct because it is not actually part of the 
variant value header.



##########
parquet-variant/src/variant.rs:
##########
@@ -367,150 +296,129 @@ impl VariantObjectHeader {
                 value.len()
             )));
         }
-        Ok(Self {
-            field_offset_size,
-            field_id_size,
+
+        let s = Self {
+            metadata,
+            value,
+            header,
             num_elements,
             field_ids_start_byte,
             field_offsets_start_byte,
             values_start_byte,
-        })
+        };
+
+        // Verify that `iter` can safely `unwrap` the items produced by this 
iterator.
+        //
+        // This has the side effect of validating the field_id and 
field_offset arrays, and also
+        // proves the field values are all in bounds.
+        validate_fallible_iterator(s.iter_checked())?;
+        Ok(s)
     }
 
     /// Returns the number of key-value pairs in this object
-    pub(crate) fn num_elements(&self) -> usize {
+    pub fn len(&self) -> usize {
         self.num_elements
     }
-}
 
-#[derive(Clone, Debug, PartialEq)]
-pub struct VariantObject<'m, 'v> {
-    pub metadata: VariantMetadata<'m>,
-    pub value: &'v [u8],
-    header: VariantObjectHeader,
-}
+    /// Returns true if the object contains no key-value pairs
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
 
-impl<'m, 'v> VariantObject<'m, 'v> {
-    pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> 
Result<Self, ArrowError> {
-        Ok(Self {
-            metadata,
-            value,
-            header: VariantObjectHeader::try_new(value)?,
-        })
+    /// Get a field's value by index in `0..self.len()`
+    pub fn field(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> {
+        let start_offset = self.header.field_offset_size.unpack_usize(
+            self.value,
+            self.field_offsets_start_byte,
+            i,
+        )?;
+        let value_bytes = slice_from_slice(self.value, self.values_start_byte 
+ start_offset..)?;
+        Variant::try_new_with_metadata(self.metadata, value_bytes)
     }
 
-    /// Returns the number of key-value pairs in this object
-    pub fn len(&self) -> usize {
-        self.header.num_elements()
+    /// Get a field's name by index in `0..self.len()`
+    pub fn field_name(&self, i: usize) -> Result<&'m str, ArrowError> {
+        let field_id =
+            self.header
+                .field_id_size
+                .unpack_usize(self.value, self.field_ids_start_byte, i)?;
+        self.metadata.get(field_id)
     }
 
-    /// Returns true if the object contains no key-value pairs
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
+    /// Returns an iterator of (name, value) pairs over the fields of this 
object.
+    pub fn iter(&self) -> impl Iterator<Item = (&'m str, Variant<'m, 'v>)> + 
'_ {
+        self.iter_checked().map(Result::unwrap)
     }
 
-    pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 
'v>)>, ArrowError> {
-        let field_list = self.parse_field_list()?;
-        Ok(field_list.into_iter())
+    // Fallible iteration over the fields of this object. The constructor 
traverses the iterator to
+    // prove it has no errors, so that all other use sites can blindly 
`unwrap` the result.
+    fn iter_checked(
+        &self,
+    ) -> impl Iterator<Item = Result<(&'m str, Variant<'m, 'v>), ArrowError>> 
+ '_ {
+        (0..self.num_elements).map(move |i| Ok((self.field_name(i)?, 
self.field(i)?)))
     }
 
-    pub fn field(&self, name: &str) -> Result<Option<Variant<'m, 'v>>, 
ArrowError> {
+    /// Returns the value of the field with the specified name, if any.
+    ///
+    /// `Ok(None)` means the field does not exist; `Err` means the search 
encountered an error.
+    pub fn field_by_name(&self, name: &str) -> Result<Option<Variant<'m, 'v>>, 
ArrowError> {
         // Binary search through the field IDs of this object to find the 
requested field name.
         //
         // NOTE: This does not require a sorted metadata dictionary, because 
the variant spec
         // requires object field ids to be lexically sorted by their 
corresponding string values,
         // and probing the dictionary for a field id is always O(1) work.
-        let (field_ids, field_offsets) = self.parse_field_arrays()?;
-        let search_result = try_binary_search_by(&field_ids, &name, 
|&field_id| {
-            self.metadata.get_field_by(field_id)
-        })?;
+        let search_result =
+            try_binary_search_range_by(0..self.num_elements, &name, |i| 
self.field_name(i))?;
 
-        let Ok(index) = search_result else {
-            return Ok(None);
-        };
-        let start_offset = field_offsets[index];
-        let end_offset = field_offsets[index + 1];
-        let value_bytes = slice_from_slice(
-            self.value,
-            self.header.values_start_byte + start_offset
-                ..self.header.values_start_byte + end_offset,
-        )?;
-        let variant = Variant::try_new_with_metadata(self.metadata, 
value_bytes)?;
-        Ok(Some(variant))
-    }
-
-    /// Parse field IDs and field offsets arrays using the cached header
-    fn parse_field_arrays(&self) -> Result<(Vec<usize>, Vec<usize>), 
ArrowError> {
-        // Parse field IDs
-        let field_ids = (0..self.header.num_elements)
-            .map(|i| {
-                self.header.field_id_size.unpack_usize(
-                    self.value,
-                    self.header.field_ids_start_byte,
-                    i,
-                )
-            })
-            .collect::<Result<Vec<_>, _>>()?;

Review Comment:
   No more `collect` calls!



##########
parquet-variant/src/variant.rs:
##########
@@ -1527,26 +1372,21 @@ mod tests {
         let md = VariantMetadata::try_new(bytes).expect("should parse");
         assert_eq!(md.dictionary_size(), 2);
         // Fields
-        assert_eq!(md.get_field_by(0).unwrap(), "cat");
-        assert_eq!(md.get_field_by(1).unwrap(), "dog");
+        assert_eq!(md.get(0).unwrap(), "cat");
+        assert_eq!(md.get(1).unwrap(), "dog");
 
         // Offsets
-        assert_eq!(md.get_offset_by(0).unwrap(), 0x00);
-        assert_eq!(md.get_offset_by(1).unwrap(), 0x03);
-        // We only have 2 keys, the final offset should not be accessible 
using this method.
-        let err = md.get_offset_by(2).unwrap_err();
+        assert_eq!(md.get_offset(0).unwrap(), 0x00);
+        assert_eq!(md.get_offset(1).unwrap(), 0x03);
+        assert_eq!(md.get_offset(2).unwrap(), 0x06);
 
+        let err = md.get_offset(3);
         assert!(
-            matches!(err, ArrowError::InvalidArgumentError(ref msg)
-                     if msg.contains("Index 2 out of bounds for dictionary of 
length 2")),
-            "unexpected error: {err:?}"
+            matches!(err, Err(ArrowError::InvalidArgumentError(_))),

Review Comment:
   It wasn't obvious to me that the specific error string matters much -- we 
just want to verify that the bad access does not succeed.



##########
parquet-variant/src/variant.rs:
##########
@@ -527,142 +435,79 @@ impl VariantListHeader {
 
         // 2.  (num_elements + 1) * offset_size
         let value_bytes = n_offsets
-            .checked_mul(offset_size as usize)
+            .checked_mul(header.offset_size as usize)
             .ok_or_else(overflow)?;
 
         // 3.  first_offset_byte + ...
         let first_value_byte = first_offset_byte
             .checked_add(value_bytes)
             .ok_or_else(overflow)?;
 
-        // Verify that the last offset array entry is inside the value slice

Review Comment:
   These checks are covered by the new iterator test



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to