This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 42b6c17930 [Variant] Reduce variant-related struct sizes (#7888)
42b6c17930 is described below

commit 42b6c1793000f8017c952966b0080927145bccff
Author: Ryan Johnson <[email protected]>
AuthorDate: Fri Jul 11 05:23:55 2025 -0700

    [Variant] Reduce variant-related struct sizes (#7888)
    
    # Which issue does this PR close?
    
    - Closes https://github.com/apache/arrow-rs/issues/7831
    
    # Rationale for this change
    
    Variants naturally work with `u32` offsets, field ids, etc. Widening
    them artificially to `usize` on 64-bit architectures causes several
    problems:
    1. A majority of developers will be using 64-bit architectures, and are
    unlikely to think about integer overflow issues when working with
    `usize`. But it's actually quite easy for malicious data or buggy code
    to overflow the `u32` values that variant actually relies on. Worse, it
    becomes difficult, if not impossible, to validate the code's resistance
    to 32-bit integer overflow, when manipulating `usize` values on 64-bit
    hardware.
    2. Related to 1/, casting from `usize` to `u32` can clip the value on
    64-bit hardware, which makes it harder to reason about the code's
    correctness (always wondering whether the value _might_ be larger than
    32 bits can hold). In contrast, casting from `u32` to `usize` is safe in
    spite of being fallible in Rust (assumes we do _not_ need to support
    16-bit architectures).
    3. The variant-related data structures occupy significantly more space
    than they need to, when storing (64-bit) `usize` offsets instead of
    `u32`.
    
    # What changes are included in this PR?
    
    Store all variant-related offsets as `u32` instead of `usize`. The
    `VariantMetadata`, `VariantObject` and `VariantList` structs shrink to
    32/64/64 bytes (previously 40/88/80 bytes).
    
    Also, rename `OffsetSizeBytes::unpack_usize[_at_offset]` methods to
    `unpack_u32[_at_offset]`, to more accurately reflect what they actually
    do now.
    
    # Are these changes tested?
    
    Existing unit tests cover the use of these values; new static assertions
    will catch any future size changes.
    
    # Are there any user-facing changes?
    
    `VariantMetadata` is no longer `Copy`, reflecting the fact that this PR
    still leaves it 2x larger than a fat pointer.
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 parquet-variant-json/src/from_json.rs   |  2 +-
 parquet-variant/src/builder.rs          |  1 -
 parquet-variant/src/decoder.rs          | 66 +++++++++++++--------------------
 parquet-variant/src/utils.rs            |  9 +++++
 parquet-variant/src/variant.rs          |  3 ++
 parquet-variant/src/variant/list.rs     | 47 ++++++++++++-----------
 parquet-variant/src/variant/metadata.rs | 44 ++++++++++++----------
 parquet-variant/src/variant/object.rs   | 60 +++++++++++++++---------------
 8 files changed, 118 insertions(+), 114 deletions(-)

diff --git a/parquet-variant-json/src/from_json.rs 
b/parquet-variant-json/src/from_json.rs
index c091095036..3052bc504d 100644
--- a/parquet-variant-json/src/from_json.rs
+++ b/parquet-variant-json/src/from_json.rs
@@ -165,7 +165,7 @@ mod test {
         expected: Variant<'a, 'a>,
     }
 
-    impl<'a> JsonToVariantTest<'a> {
+    impl JsonToVariantTest<'_> {
         fn run(self) -> Result<(), ArrowError> {
             let mut variant_builder = VariantBuilder::new();
             json_to_variant(self.json, &mut variant_builder)?;
diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs
index 542065045c..33608d27cb 100644
--- a/parquet-variant/src/builder.rs
+++ b/parquet-variant/src/builder.rs
@@ -1932,7 +1932,6 @@ mod tests {
         assert!(metadata.is_empty());
 
         let variant = Variant::try_new_with_metadata(metadata, 
&value).unwrap();
-        assert!(metadata.is_empty());
         assert_eq!(variant, Variant::Int8(42));
     }
 
diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs
index 5a6aab43ff..5d6a064793 100644
--- a/parquet-variant/src/decoder.rs
+++ b/parquet-variant/src/decoder.rs
@@ -22,8 +22,6 @@ use crate::ShortString;
 use arrow_schema::ArrowError;
 use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, Utc};
 
-use std::num::TryFromIntError;
-
 /// The basic type of a [`Variant`] value, encoded in the first two bits of the
 /// header byte.
 ///
@@ -147,11 +145,9 @@ impl OffsetSizeBytes {
     /// * `bytes` – the byte buffer to index
     /// * `index` – 0-based index into the buffer
     ///
-    /// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
-    /// Three-byte values are zero-extended to 32 bits before the final
-    /// fallible cast to `usize`.
-    pub(crate) fn unpack_usize(&self, bytes: &[u8], index: usize) -> 
Result<usize, ArrowError> {
-        self.unpack_usize_at_offset(bytes, 0, index)
+    /// Each value is `self as u32` bytes wide (1, 2, 3 or 4), zero-extended 
to 32 bits as needed.
+    pub(crate) fn unpack_u32(&self, bytes: &[u8], index: usize) -> Result<u32, 
ArrowError> {
+        self.unpack_u32_at_offset(bytes, 0, index)
     }
 
     /// Return one unsigned little-endian value from `bytes`.
@@ -162,15 +158,13 @@ impl OffsetSizeBytes {
     /// * `offset_index` – 0-based index **after** the skipped bytes
     ///   (`0` is the first value, `1` the next, …).
     ///
-    /// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
-    /// Three-byte values are zero-extended to 32 bits before the final
-    /// fallible cast to `usize`.
-    pub(crate) fn unpack_usize_at_offset(
+    /// Each value is `self as u32` bytes wide (1, 2, 3 or 4), zero-extended 
to 32 bits as needed.
+    pub(crate) fn unpack_u32_at_offset(
         &self,
         bytes: &[u8],
         byte_offset: usize,  // how many bytes to skip
         offset_index: usize, // which offset in an array of offsets
-    ) -> Result<usize, ArrowError> {
+    ) -> Result<u32, ArrowError> {
         use OffsetSizeBytes::*;
 
         // Index into the byte array:
@@ -179,7 +173,7 @@ impl OffsetSizeBytes {
             .checked_mul(*self as usize)
             .and_then(|n| n.checked_add(byte_offset))
             .ok_or_else(|| overflow_error("unpacking offset array value"))?;
-        let result = match self {
+        let value = match self {
             One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
             Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
             Three => {
@@ -192,11 +186,7 @@ impl OffsetSizeBytes {
             }
             Four => u32::from_le_bytes(array_from_slice(bytes, offset)?),
         };
-
-        // Convert the u32 we extracted to usize (should always succeed on 32- 
and 64-bit arch)
-        result
-            .try_into()
-            .map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))
+        Ok(value)
     }
 }
 
@@ -518,57 +508,51 @@ mod tests {
     }
 
     #[test]
-    fn unpack_usize_all_widths() {
+    fn unpack_u32_all_widths() {
         // One-byte offsets
         let buf_one = [0x01u8, 0xAB, 0xCD];
-        assert_eq!(
-            OffsetSizeBytes::One.unpack_usize(&buf_one, 0).unwrap(),
-            0x01
-        );
-        assert_eq!(
-            OffsetSizeBytes::One.unpack_usize(&buf_one, 2).unwrap(),
-            0xCD
-        );
+        assert_eq!(OffsetSizeBytes::One.unpack_u32(&buf_one, 0).unwrap(), 
0x01);
+        assert_eq!(OffsetSizeBytes::One.unpack_u32(&buf_one, 2).unwrap(), 
0xCD);
 
         // Two-byte offsets (little-endian 0x1234, 0x5678)
         let buf_two = [0x34, 0x12, 0x78, 0x56];
         assert_eq!(
-            OffsetSizeBytes::Two.unpack_usize(&buf_two, 0).unwrap(),
+            OffsetSizeBytes::Two.unpack_u32(&buf_two, 0).unwrap(),
             0x1234
         );
         assert_eq!(
-            OffsetSizeBytes::Two.unpack_usize(&buf_two, 1).unwrap(),
+            OffsetSizeBytes::Two.unpack_u32(&buf_two, 1).unwrap(),
             0x5678
         );
 
         // Three-byte offsets (0x030201 and 0x0000FF)
         let buf_three = [0x01, 0x02, 0x03, 0xFF, 0x00, 0x00];
         assert_eq!(
-            OffsetSizeBytes::Three.unpack_usize(&buf_three, 0).unwrap(),
+            OffsetSizeBytes::Three.unpack_u32(&buf_three, 0).unwrap(),
             0x030201
         );
         assert_eq!(
-            OffsetSizeBytes::Three.unpack_usize(&buf_three, 1).unwrap(),
+            OffsetSizeBytes::Three.unpack_u32(&buf_three, 1).unwrap(),
             0x0000FF
         );
 
         // Four-byte offsets (0x12345678, 0x90ABCDEF)
         let buf_four = [0x78, 0x56, 0x34, 0x12, 0xEF, 0xCD, 0xAB, 0x90];
         assert_eq!(
-            OffsetSizeBytes::Four.unpack_usize(&buf_four, 0).unwrap(),
+            OffsetSizeBytes::Four.unpack_u32(&buf_four, 0).unwrap(),
             0x1234_5678
         );
         assert_eq!(
-            OffsetSizeBytes::Four.unpack_usize(&buf_four, 1).unwrap(),
+            OffsetSizeBytes::Four.unpack_u32(&buf_four, 1).unwrap(),
             0x90AB_CDEF
         );
     }
 
     #[test]
-    fn unpack_usize_out_of_bounds() {
+    fn unpack_u32_out_of_bounds() {
         let tiny = [0x00u8]; // deliberately too short
-        assert!(OffsetSizeBytes::Two.unpack_usize(&tiny, 0).is_err());
-        assert!(OffsetSizeBytes::Three.unpack_usize(&tiny, 0).is_err());
+        assert!(OffsetSizeBytes::Two.unpack_u32(&tiny, 0).is_err());
+        assert!(OffsetSizeBytes::Three.unpack_u32(&tiny, 0).is_err());
     }
 
     #[test]
@@ -584,20 +568,20 @@ mod tests {
         let width = OffsetSizeBytes::Two;
 
         // dictionary_size starts immediately after the header byte
-        let dict_size = width.unpack_usize_at_offset(&buf, 1, 0).unwrap();
+        let dict_size = width.unpack_u32_at_offset(&buf, 1, 0).unwrap();
         assert_eq!(dict_size, 2);
 
         // offset array immediately follows the dictionary size
-        let first = width.unpack_usize_at_offset(&buf, 1, 1).unwrap();
+        let first = width.unpack_u32_at_offset(&buf, 1, 1).unwrap();
         assert_eq!(first, 0);
 
-        let second = width.unpack_usize_at_offset(&buf, 1, 2).unwrap();
+        let second = width.unpack_u32_at_offset(&buf, 1, 2).unwrap();
         assert_eq!(second, 5);
 
-        let third = width.unpack_usize_at_offset(&buf, 1, 3).unwrap();
+        let third = width.unpack_u32_at_offset(&buf, 1, 3).unwrap();
         assert_eq!(third, 9);
 
-        let err = width.unpack_usize_at_offset(&buf, 1, 4);
+        let err = width.unpack_u32_at_offset(&buf, 1, 4);
         assert!(err.is_err())
     }
 }
diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs
index ef402064e9..a9751f0ab6 100644
--- a/parquet-variant/src/utils.rs
+++ b/parquet-variant/src/utils.rs
@@ -122,3 +122,12 @@ where
 
     Some(Err(start))
 }
+
+/// Verifies the expected size of type T, for a type that should only grow if 
absolutely necessary.
+#[allow(unused)]
+pub(crate) const fn expect_size_of<T>(expected: usize) {
+    let size = std::mem::size_of::<T>();
+    if size != expected {
+        let _ = [""; 0][size];
+    }
+}
diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs
index dd8287fd1c..8138549b1a 100644
--- a/parquet-variant/src/variant.rs
+++ b/parquet-variant/src/variant.rs
@@ -256,6 +256,9 @@ pub enum Variant<'m, 'v> {
     List(VariantList<'m, 'v>),
 }
 
+// We don't want this to grow because it could hurt performance of a 
frequently-created type.
+const _: () = crate::utils::expect_size_of::<Variant>(80);
+
 impl<'m, 'v> Variant<'m, 'v> {
     /// Attempts to interpret a metadata and value buffer pair as a new 
`Variant`.
     ///
diff --git a/parquet-variant/src/variant/list.rs 
b/parquet-variant/src/variant/list.rs
index 11122190b4..17f87a2e0d 100644
--- a/parquet-variant/src/variant/list.rs
+++ b/parquet-variant/src/variant/list.rs
@@ -23,7 +23,7 @@ use crate::variant::{Variant, VariantMetadata};
 use arrow_schema::ArrowError;
 
 // The value header occupies one byte; use a named constant for readability
-const NUM_HEADER_BYTES: usize = 1;
+const NUM_HEADER_BYTES: u32 = 1;
 
 /// A parsed version of the variant array value header byte.
 #[derive(Debug, Clone, PartialEq)]
@@ -34,15 +34,15 @@ pub(crate) struct VariantListHeader {
 
 impl VariantListHeader {
     // Hide the ugly casting
-    const fn num_elements_size(&self) -> usize {
+    const fn num_elements_size(&self) -> u32 {
         self.num_elements_size as _
     }
-    const fn offset_size(&self) -> usize {
+    const fn offset_size(&self) -> u32 {
         self.offset_size as _
     }
 
     // Avoid materializing this offset, since it's cheaply and safely 
computable
-    const fn first_offset_byte(&self) -> usize {
+    const fn first_offset_byte(&self) -> u32 {
         NUM_HEADER_BYTES + self.num_elements_size()
     }
 
@@ -122,11 +122,14 @@ pub struct VariantList<'m, 'v> {
     pub metadata: VariantMetadata<'m>,
     pub value: &'v [u8],
     header: VariantListHeader,
-    num_elements: usize,
-    first_value_byte: usize,
+    num_elements: u32,
+    first_value_byte: u32,
     validated: bool,
 }
 
+// We don't want this to grow because it could increase the size of `Variant` 
and hurt performance.
+const _: () = crate::utils::expect_size_of::<VariantList>(64);
+
 impl<'m, 'v> VariantList<'m, 'v> {
     /// Attempts to interpret `value` as a variant array value.
     ///
@@ -157,7 +160,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
         let num_elements =
             header
                 .num_elements_size
-                .unpack_usize_at_offset(value, NUM_HEADER_BYTES, 0)?;
+                .unpack_u32_at_offset(value, NUM_HEADER_BYTES as _, 0)?;
 
         // (num_elements + 1) * offset_size + first_offset_byte
         let first_value_byte = num_elements
@@ -185,10 +188,10 @@ impl<'m, 'v> VariantList<'m, 'v> {
 
         // Use the last offset to upper-bound the value buffer
         let last_offset = new_self
-            .get_offset(num_elements)?
+            .get_offset(num_elements as _)?
             .checked_add(first_value_byte)
             .ok_or_else(|| overflow_error("variant array size"))?;
-        new_self.value = slice_from_slice(value, ..last_offset)?;
+        new_self.value = slice_from_slice(value, ..last_offset as _)?;
         Ok(new_self)
     }
 
@@ -210,7 +213,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
 
             let offset_buffer = slice_from_slice(
                 self.value,
-                self.header.first_offset_byte()..self.first_value_byte,
+                self.header.first_offset_byte() as _..self.first_value_byte as 
_,
             )?;
 
             let offsets =
@@ -226,7 +229,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
                 ));
             }
 
-            let value_buffer = slice_from_slice(self.value, 
self.first_value_byte..)?;
+            let value_buffer = slice_from_slice(self.value, 
self.first_value_byte as _..)?;
 
             // Validate whether values are valid variant objects
             for i in 1..offsets.len() {
@@ -234,7 +237,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
                 let end_offset = offsets[i];
 
                 let value_bytes = slice_from_slice(value_buffer, 
start_offset..end_offset)?;
-                Variant::try_new_with_metadata(self.metadata, value_bytes)?;
+                Variant::try_new_with_metadata(self.metadata.clone(), 
value_bytes)?;
             }
 
             self.validated = true;
@@ -244,7 +247,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
 
     /// Return the length of this array
     pub fn len(&self) -> usize {
-        self.num_elements
+        self.num_elements as _
     }
 
     /// Is the array of zero length
@@ -256,7 +259,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
     ///
     /// [invalid]: Self#Validation
     pub fn get(&self, index: usize) -> Option<Variant<'m, 'v>> {
-        (index < self.num_elements).then(|| {
+        (index < self.len()).then(|| {
             self.try_get_with_shallow_validation(index)
                 .expect("Invalid variant array element")
         })
@@ -272,10 +275,10 @@ impl<'m, 'v> VariantList<'m, 'v> {
     fn try_get_with_shallow_validation(&self, index: usize) -> 
Result<Variant<'m, 'v>, ArrowError> {
         // Fetch the value bytes between the two offsets for this index, from 
the value array region
         // of the byte buffer
-        let byte_range = self.get_offset(index)?..self.get_offset(index + 1)?;
+        let byte_range = self.get_offset(index)? as _..self.get_offset(index + 
1)? as _;
         let value_bytes =
-            slice_from_slice_at_offset(self.value, self.first_value_byte, 
byte_range)?;
-        Variant::try_new_with_metadata_and_shallow_validation(self.metadata, 
value_bytes)
+            slice_from_slice_at_offset(self.value, self.first_value_byte as _, 
byte_range)?;
+        
Variant::try_new_with_metadata_and_shallow_validation(self.metadata.clone(), 
value_bytes)
     }
 
     /// Iterates over the values of this list. When working with [unvalidated] 
input, consider
@@ -297,14 +300,14 @@ impl<'m, 'v> VariantList<'m, 'v> {
     fn iter_try_with_shallow_validation(
         &self,
     ) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ {
-        (0..self.len()).map(move |i| self.try_get_with_shallow_validation(i))
+        (0..self.len()).map(|i| self.try_get_with_shallow_validation(i))
     }
 
     // Attempts to retrieve the ith offset from the offset array region of the 
byte buffer.
-    fn get_offset(&self, index: usize) -> Result<usize, ArrowError> {
-        let byte_range = 
self.header.first_offset_byte()..self.first_value_byte;
+    fn get_offset(&self, index: usize) -> Result<u32, ArrowError> {
+        let byte_range = self.header.first_offset_byte() as 
_..self.first_value_byte as _;
         let offset_bytes = slice_from_slice(self.value, byte_range)?;
-        self.header.offset_size.unpack_usize(offset_bytes, index)
+        self.header.offset_size.unpack_u32(offset_bytes, index)
     }
 }
 
@@ -623,7 +626,7 @@ mod tests {
             expected_num_element_size,
             variant_list.header.num_elements_size
         );
-        assert_eq!(list_size, variant_list.num_elements);
+        assert_eq!(list_size, variant_list.num_elements as usize);
 
         // verify the data in the variant
         assert_eq!(list_size, variant_list.len());
diff --git a/parquet-variant/src/variant/metadata.rs 
b/parquet-variant/src/variant/metadata.rs
index b50a766869..007122af75 100644
--- a/parquet-variant/src/variant/metadata.rs
+++ b/parquet-variant/src/variant/metadata.rs
@@ -34,16 +34,16 @@ pub(crate) struct VariantMetadataHeader {
 const CORRECT_VERSION_VALUE: u8 = 1;
 
 // The metadata header occupies one byte; use a named constant for readability
-const NUM_HEADER_BYTES: usize = 1;
+const NUM_HEADER_BYTES: u32 = 1;
 
 impl VariantMetadataHeader {
     // Hide the cast
-    const fn offset_size(&self) -> usize {
-        self.offset_size as usize
+    const fn offset_size(&self) -> u32 {
+        self.offset_size as u32
     }
 
     // Avoid materializing this offset, since it's cheaply and safely 
computable
-    const fn first_offset_byte(&self) -> usize {
+    const fn first_offset_byte(&self) -> u32 {
         NUM_HEADER_BYTES + self.offset_size()
     }
 
@@ -125,15 +125,19 @@ impl VariantMetadataHeader {
 ///
 /// [`Variant`]: crate::Variant
 /// [Variant Spec]: 
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
-#[derive(Debug, Clone, Copy, PartialEq)]
+#[derive(Debug, Clone, PartialEq)]
 pub struct VariantMetadata<'m> {
     bytes: &'m [u8],
     header: VariantMetadataHeader,
-    dictionary_size: usize,
-    first_value_byte: usize,
+    dictionary_size: u32,
+    first_value_byte: u32,
     validated: bool,
 }
 
+// We don't want this to grow because it increases the size of VariantList and 
VariantObject, which
+// could increase the size of Variant. All those size increases could hurt 
performance.
+const _: () = crate::utils::expect_size_of::<VariantMetadata>(32);
+
 impl<'m> VariantMetadata<'m> {
     /// Attempts to interpret `bytes` as a variant metadata instance, with 
full [validation] of all
     /// dictionary entries.
@@ -166,7 +170,7 @@ impl<'m> VariantMetadata<'m> {
         let dictionary_size =
             header
                 .offset_size
-                .unpack_usize_at_offset(bytes, NUM_HEADER_BYTES, 0)?;
+                .unpack_u32_at_offset(bytes, NUM_HEADER_BYTES as usize, 0)?;
 
         // Calculate the starting offset of the dictionary string bytes.
         //
@@ -196,16 +200,16 @@ impl<'m> VariantMetadata<'m> {
 
         // Use the last offset to upper-bound the byte slice
         let last_offset = new_self
-            .get_offset(dictionary_size)?
+            .get_offset(dictionary_size as _)?
             .checked_add(first_value_byte)
             .ok_or_else(|| overflow_error("variant metadata size"))?;
-        new_self.bytes = slice_from_slice(bytes, ..last_offset)?;
+        new_self.bytes = slice_from_slice(bytes, ..last_offset as _)?;
         Ok(new_self)
     }
 
     /// The number of metadata dictionary entries
     pub fn len(&self) -> usize {
-        self.dictionary_size
+        self.dictionary_size()
     }
 
     /// True if this metadata dictionary contains no entries
@@ -227,7 +231,7 @@ impl<'m> VariantMetadata<'m> {
         if !self.validated {
             let offset_bytes = slice_from_slice(
                 self.bytes,
-                self.header.first_offset_byte()..self.first_value_byte,
+                self.header.first_offset_byte() as _..self.first_value_byte as 
_,
             )?;
 
             let offsets =
@@ -245,7 +249,7 @@ impl<'m> VariantMetadata<'m> {
 
             // Verify the string values in the dictionary are UTF-8 encoded 
strings.
             let value_buffer =
-                string_from_slice(self.bytes, 0, 
self.first_value_byte..self.bytes.len())?;
+                string_from_slice(self.bytes, 0, self.first_value_byte as 
_..self.bytes.len())?;
 
             if self.header.is_sorted {
                 // Validate the dictionary values are unique and 
lexicographically sorted
@@ -278,7 +282,7 @@ impl<'m> VariantMetadata<'m> {
 
     /// Get the dictionary size
     pub const fn dictionary_size(&self) -> usize {
-        self.dictionary_size
+        self.dictionary_size as _
     }
 
     /// The variant protocol version
@@ -290,10 +294,10 @@ impl<'m> VariantMetadata<'m> {
     ///
     /// This offset is an index into the dictionary, at the boundary between 
string `i-1` and string
     /// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
-    fn get_offset(&self, i: usize) -> Result<usize, ArrowError> {
-        let offset_byte_range = 
self.header.first_offset_byte()..self.first_value_byte;
+    fn get_offset(&self, i: usize) -> Result<u32, ArrowError> {
+        let offset_byte_range = self.header.first_offset_byte() as 
_..self.first_value_byte as _;
         let bytes = slice_from_slice(self.bytes, offset_byte_range)?;
-        self.header.offset_size.unpack_usize(bytes, i)
+        self.header.offset_size.unpack_u32(bytes, i)
     }
 
     /// Attempts to retrieve a dictionary entry by index, failing if out of 
bounds or if the
@@ -301,8 +305,8 @@ impl<'m> VariantMetadata<'m> {
     ///
     /// [invalid]: Self#Validation
     pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
-        let byte_range = self.get_offset(i)?..self.get_offset(i + 1)?;
-        string_from_slice(self.bytes, self.first_value_byte, byte_range)
+        let byte_range = self.get_offset(i)? as _..self.get_offset(i + 1)? as 
_;
+        string_from_slice(self.bytes, self.first_value_byte as _, byte_range)
     }
 
     /// Returns an iterator that attempts to visit all dictionary entries, 
producing `Err` if the
@@ -310,7 +314,7 @@ impl<'m> VariantMetadata<'m> {
     ///
     /// [invalid]: Self#Validation
     pub fn iter_try(&self) -> impl Iterator<Item = Result<&'m str, 
ArrowError>> + '_ {
-        (0..self.dictionary_size).map(move |i| self.get(i))
+        (0..self.len()).map(|i| self.get(i))
     }
 
     /// Iterates over all dictionary entries. When working with [unvalidated] 
input, consider
diff --git a/parquet-variant/src/variant/object.rs 
b/parquet-variant/src/variant/object.rs
index 36c8f999b2..dd6da08fbe 100644
--- a/parquet-variant/src/variant/object.rs
+++ b/parquet-variant/src/variant/object.rs
@@ -23,7 +23,7 @@ use crate::variant::{Variant, VariantMetadata};
 use arrow_schema::ArrowError;
 
 // The value header occupies one byte; use a named constant for readability
-const NUM_HEADER_BYTES: usize = 1;
+const NUM_HEADER_BYTES: u32 = 1;
 
 /// Header structure for [`VariantObject`]
 #[derive(Debug, Clone, PartialEq)]
@@ -35,18 +35,18 @@ pub(crate) struct VariantObjectHeader {
 
 impl VariantObjectHeader {
     // Hide the ugly casting
-    const fn num_elements_size(&self) -> usize {
+    const fn num_elements_size(&self) -> u32 {
         self.num_elements_size as _
     }
-    const fn field_id_size(&self) -> usize {
+    const fn field_id_size(&self) -> u32 {
         self.field_id_size as _
     }
-    const fn field_offset_size(&self) -> usize {
+    const fn field_offset_size(&self) -> u32 {
         self.field_offset_size as _
     }
 
     // Avoid materializing this offset, since it's cheaply and safely 
computable
-    const fn field_ids_start_byte(&self) -> usize {
+    const fn field_ids_start_byte(&self) -> u32 {
         NUM_HEADER_BYTES + self.num_elements_size()
     }
 
@@ -119,12 +119,15 @@ pub struct VariantObject<'m, 'v> {
     pub metadata: VariantMetadata<'m>,
     pub value: &'v [u8],
     header: VariantObjectHeader,
-    num_elements: usize,
-    first_field_offset_byte: usize,
-    first_value_byte: usize,
+    num_elements: u32,
+    first_field_offset_byte: u32,
+    first_value_byte: u32,
     validated: bool,
 }
 
+// We don't want this to grow because it could increase the size of `Variant` 
and hurt performance.
+const _: () = crate::utils::expect_size_of::<VariantObject>(64);
+
 impl<'m, 'v> VariantObject<'m, 'v> {
     pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self {
         Self::try_new_with_shallow_validation(metadata, value).expect("Invalid 
variant object")
@@ -156,7 +159,7 @@ impl<'m, 'v> VariantObject<'m, 'v> {
         let num_elements =
             header
                 .num_elements_size
-                .unpack_usize_at_offset(value, NUM_HEADER_BYTES, 0)?;
+                .unpack_u32_at_offset(value, NUM_HEADER_BYTES as _, 0)?;
 
         // Calculate byte offsets for field offsets and values with overflow 
protection, and verify
         // they're in bounds
@@ -186,10 +189,10 @@ impl<'m, 'v> VariantObject<'m, 'v> {
         // Use it to upper-bound the value bytes, which also verifies that the 
field id and field
         // offset arrays are in bounds.
         let last_offset = new_self
-            .get_offset(num_elements)?
+            .get_offset(num_elements as _)?
             .checked_add(first_value_byte)
             .ok_or_else(|| overflow_error("variant object size"))?;
-        new_self.value = slice_from_slice(value, ..last_offset)?;
+        new_self.value = slice_from_slice(value, ..last_offset as _)?;
         Ok(new_self)
     }
 
@@ -211,7 +214,7 @@ impl<'m, 'v> VariantObject<'m, 'v> {
 
             let field_id_buffer = slice_from_slice(
                 self.value,
-                
self.header.field_ids_start_byte()..self.first_field_offset_byte,
+                self.header.field_ids_start_byte() as 
_..self.first_field_offset_byte as _,
             )?;
 
             let field_ids = map_bytes_to_offsets(field_id_buffer, 
self.header.field_id_size)
@@ -268,17 +271,17 @@ impl<'m, 'v> VariantObject<'m, 'v> {
             // Validate whether values are valid variant objects
             let field_offset_buffer = slice_from_slice(
                 self.value,
-                self.first_field_offset_byte..self.first_value_byte,
+                self.first_field_offset_byte as _..self.first_value_byte as _,
             )?;
-            let num_offsets = field_offset_buffer.len() / 
self.header.field_offset_size();
+            let num_offsets = field_offset_buffer.len() / 
self.header.field_offset_size() as usize;
 
-            let value_buffer = slice_from_slice(self.value, 
self.first_value_byte..)?;
+            let value_buffer = slice_from_slice(self.value, 
self.first_value_byte as _..)?;
 
             map_bytes_to_offsets(field_offset_buffer, 
self.header.field_offset_size)
                 .take(num_offsets.saturating_sub(1))
                 .try_for_each(|offset| {
                     let value_bytes = slice_from_slice(value_buffer, 
offset..)?;
-                    Variant::try_new_with_metadata(self.metadata, 
value_bytes)?;
+                    Variant::try_new_with_metadata(self.metadata.clone(), 
value_bytes)?;
 
                     Ok::<_, ArrowError>(())
                 })?;
@@ -290,7 +293,7 @@ impl<'m, 'v> VariantObject<'m, 'v> {
 
     /// Returns the number of key-value pairs in this object
     pub fn len(&self) -> usize {
-        self.num_elements
+        self.num_elements as _
     }
 
     /// Returns true if the object contains no key-value pairs
@@ -321,16 +324,16 @@ impl<'m, 'v> VariantObject<'m, 'v> {
     // Attempts to retrieve the ith field value from the value region of the 
byte buffer; it
     // performs only basic (constant-cost) validation.
     fn try_field_with_shallow_validation(&self, i: usize) -> 
Result<Variant<'m, 'v>, ArrowError> {
-        let value_bytes = slice_from_slice(self.value, 
self.first_value_byte..)?;
-        let value_bytes = slice_from_slice(value_bytes, 
self.get_offset(i)?..)?;
-        Variant::try_new_with_metadata_and_shallow_validation(self.metadata, 
value_bytes)
+        let value_bytes = slice_from_slice(self.value, self.first_value_byte 
as _..)?;
+        let value_bytes = slice_from_slice(value_bytes, self.get_offset(i)? as 
_..)?;
+        
Variant::try_new_with_metadata_and_shallow_validation(self.metadata.clone(), 
value_bytes)
     }
 
     // Attempts to retrieve the ith offset from the field offset region of the 
byte buffer.
-    fn get_offset(&self, i: usize) -> Result<usize, ArrowError> {
-        let byte_range = self.first_field_offset_byte..self.first_value_byte;
+    fn get_offset(&self, i: usize) -> Result<u32, ArrowError> {
+        let byte_range = self.first_field_offset_byte as 
_..self.first_value_byte as _;
         let field_offsets = slice_from_slice(self.value, byte_range)?;
-        self.header.field_offset_size.unpack_usize(field_offsets, i)
+        self.header.field_offset_size.unpack_u32(field_offsets, i)
     }
 
     /// Get a field's name by index in `0..self.len()`
@@ -347,10 +350,10 @@ impl<'m, 'v> VariantObject<'m, 'v> {
 
     /// Fallible version of `field_name`. Returns field name by index, 
capturing validation errors
     fn try_field_name(&self, i: usize) -> Result<&'m str, ArrowError> {
-        let byte_range = 
self.header.field_ids_start_byte()..self.first_field_offset_byte;
+        let byte_range = self.header.field_ids_start_byte() as 
_..self.first_field_offset_byte as _;
         let field_id_bytes = slice_from_slice(self.value, byte_range)?;
-        let field_id = self.header.field_id_size.unpack_usize(field_id_bytes, 
i)?;
-        self.metadata.get(field_id)
+        let field_id = self.header.field_id_size.unpack_u32(field_id_bytes, 
i)?;
+        self.metadata.get(field_id as _)
     }
 
     /// Returns an iterator of (name, value) pairs over the fields of this 
object.
@@ -374,7 +377,7 @@ impl<'m, 'v> VariantObject<'m, 'v> {
     fn iter_try_with_shallow_validation(
         &self,
     ) -> impl Iterator<Item = Result<(&'m str, Variant<'m, 'v>), ArrowError>> 
+ '_ {
-        (0..self.num_elements).map(move |i| {
+        (0..self.len()).map(|i| {
             let field = self.try_field_with_shallow_validation(i)?;
             Ok((self.try_field_name(i)?, field))
         })
@@ -389,8 +392,7 @@ impl<'m, 'v> VariantObject<'m, 'v> {
         // NOTE: This does not require a sorted metadata dictionary, because 
the variant spec
         // requires object field ids to be lexically sorted by their 
corresponding string values,
         // and probing the dictionary for a field id is always O(1) work.
-        let i = try_binary_search_range_by(0..self.num_elements, &name, |i| 
self.field_name(i))?
-            .ok()?;
+        let i = try_binary_search_range_by(0..self.len(), &name, |i| 
self.field_name(i))?.ok()?;
 
         self.field(i)
     }

Reply via email to