Re: [PR] API for reading Variant data and metadata [arrow-rs]

via GitHub Thu, 22 May 2025 13:55:03 -0700


mkarbo commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2103383668



##########
parquet-variant/src/variant.rs:
##########
@@ -0,0 +1,415 @@
+use crate::decoder::{
+    self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, invalid_utf8_err, non_empty_slice, slice_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+    num::TryFromIntError,
+    ops::{Index, Range},
+    str,
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+    One = 1,
+    Two = 2,
+    Three = 3,
+    Four = 4,
+}
+
+impl OffsetSizeBytes {
+    fn try_new(offset_size_minus_one: u8) -> Result<Self, ArrowError> {
+        use OffsetSizeBytes::*;
+        let result = match offset_size_minus_one {
+            0 => One,
+            1 => Two,
+            2 => Three,
+            3 => Four,
+            _ => {
+                return Err(ArrowError::InvalidArgumentError(
+                    "offset_size_minus_one must be 0–3".to_string(),
+                ))
+            }
+        };
+        Ok(result)
+    }
+
+    fn unpack_usize(
+        &self,
+        bytes: &[u8],
+        byte_offset: usize,  // how many bytes to skip
+        offset_index: usize, // which offset in an array of offsets
+    ) -> Result<usize, ArrowError> {
+        use OffsetSizeBytes::*;
+        let offset = byte_offset + (*self as usize) * offset_index;
+        let result = match self {
+            One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+            Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+            Three => todo!(), // ugh, endianness
+            Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+                .try_into()
+                .map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))?,
+        };
+        Ok(result)
+    }
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+    version: u8,
+    is_sorted: bool,
+    /// Note: This is `offset_size_minus_one` + 1
+    offset_size: OffsetSizeBytes,
+}
+
+impl<'m> VariantMetadataHeader {
+    /// Tries to construct the variant metadata header, which has the form
+    ///              7     6  5   4  3             0
+    ///             +-------+---+---+---------------+
+    /// header      |       |   |   |    version    |
+    ///             +-------+---+---+---------------+
+    ///                 ^         ^
+    ///                 |         +-- sorted_strings
+    ///                 +-- offset_size_minus_one
+    /// The version is a 4-bit value that must always contain the value 1.
+    /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique.
+    /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field.
+    /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
+    pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
+        let Some(header) = bytes.get(0) else {
+            return Err(ArrowError::InvalidArgumentError(
+                "Received zero bytes".to_string(),
+            ));
+        };
+
+        let version = header & 0x0F; // Low four bits (bits 0-3)
+        let is_sorted = (header & 0x10) != 0; // Bit 4
+        let offset_size_minus_one = (header >> 6) & 0x03; // Top two bits (bits 6-7)
+        Ok(Self {
+            version,
+            is_sorted,
+            offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
+        })
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+/// Encodes the Variant Metadata; see the Variant spec file for more information.
+pub struct VariantMetadata<'m> {
+    bytes: &'m [u8],
+    header: VariantMetadataHeader,
+    dict_size: usize,
+}
+
+impl<'m> VariantMetadata<'m> {
+    /// View the raw bytes (needed by very low-level decoders)
+    #[inline]
+    pub const fn as_bytes(&self) -> &'m [u8] {
+        self.bytes
+    }
+
+    pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
+        let header = VariantMetadataHeader::try_new(bytes)?;
+        // Offset 1, index 0 because the first element after the header is the dictionary size
+        let dict_size = header.offset_size.unpack_usize(bytes, 1, 0)?;
+
+        // TODO: Refactor, add test for validation
+        let valid = (0..=dict_size)
+            .map(|i| header.offset_size.unpack_usize(bytes, 1, i + 1))
+            .scan(0, |prev, cur| {
+                let Ok(cur_offset) = cur else {
+                    return Some(false);
+                };
+                // Skip the first offset, which is always 0
+                if *prev == 0 {
+                    *prev = cur_offset;
+                    return Some(true);
+                }
+
+                let valid = cur_offset > *prev;
+                *prev = cur_offset;
+                Some(valid)
+            })
+            .all(|valid| valid);
+
+        if !valid {
+            return Err(ArrowError::InvalidArgumentError(
+                "Offsets are not monotonically increasing".to_string(),
+            ));
+        }
+        Ok(Self {
+            bytes,
+            header,
+            dict_size,
+        })
+    }
+
+    /// Whether the dictionary keys are sorted and unique
+    pub fn is_sorted(&self) -> bool {
+        self.header.is_sorted
+    }
+
+    /// Get the dictionary size
+    pub fn dictionary_size(&self) -> usize {
+        self.dict_size
+    }
+    pub fn version(&self) -> usize {
+        todo!()
+    }
+
+    /// Get the offset by key-index
+    pub fn get_offset_by(&self, index: usize) -> Result<Range<usize>, ArrowError> {
+        // TODO: Should we memoize the offsets? There could be thousands of them (https://github.com/apache/arrow-rs/pull/7535#discussion_r2101351294)

Review Comment:
   Agreed. I will postpone this to a later ticket; let's focus on finishing the API and the rest of the basics first.
   
   Let me know if you disagree; resolving for now.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] API for reading Variant data and metadata [arrow-rs]

Reply via email to