Re: [PR] API for reading Variant data and metadata [arrow-rs]

via GitHub Wed, 21 May 2025 13:00:59 -0700


alamb commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2101010184



##########
parquet-variant/src/variant.rs:
##########
@@ -0,0 +1,321 @@
+use std::{borrow::Cow, ops::Index};
+
+use crate::decoder::{self, get_variant_type};
+use arrow_schema::ArrowError;
+use strum_macros::EnumDiscriminants;
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+/// Encodes the Variant Metadata, see the Variant spec file for more 
information
+pub struct VariantMetadata<'m> {
+    bytes: &'m [u8],
+}
+
+impl<'m> VariantMetadata<'m> {
+    /// View the raw bytes (needed by very low-level decoders)
+    #[inline]
+    pub const fn as_bytes(&self) -> &'m [u8] {
+        self.bytes
+    }
+
+    /// Whether the dictionary keys are sorted and unique
+    pub fn is_sorted(&self) -> bool {
+        todo!()
+    }
+
+    /// Get the dict length
+    pub fn dict_len(&self) -> Result<usize, ArrowError> {
+        let end_location = self.offset_size()? as usize + 1;
+        if self.bytes.len() < end_location {
+            let err_str = format!(
+                "Invalid metadata bytes, must have at least length {} but has 
length {}",
+                &end_location,
+                self.bytes.len()
+            );
+            return Err(ArrowError::InvalidArgumentError(err_str));
+        }
+        let dict_len_bytes = &self.bytes[1..end_location];
+        let dict_len = 
usize::from_le_bytes(dict_len_bytes.try_into().map_err(|e| {
+            ArrowError::InvalidArgumentError(format!(
+                "Unable to convert dictionary_size bytes into usize: {}",
+                e,
+            ))
+        })?);
+        Ok(dict_len)
+    }
+    pub fn version(&self) -> usize {
+        todo!()
+    }
+
+    /// Get the offset by index
+    pub fn get_offset_by(&self, index: usize) -> Result<usize, ArrowError> {
+        todo!()
+    }
+
+    /// Get the header byte, which has the following form
+    ///              7     6  5   4  3             0
+    ///             +-------+---+---+---------------+
+    /// header      |       |   |   |    version    |
+    ///             +-------+---+---+---------------+
+    ///                 ^         ^
+    ///                 |         +-- sorted_strings
+    ///                 +-- offset_size_minus_one
+    /// The version is a 4-bit value that must always contain the value 1.
+    /// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+    /// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+    /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
+    pub fn header(&self) -> Result<u8, ArrowError> {
+        if self.bytes.is_empty() {
+            return Err(ArrowError::InvalidArgumentError(
+                "Can't get header from empty buffer".to_string(),
+            ));
+        }
+        Ok(self.bytes[0])
+    }
+
+    /// Get the offset_minus_one value from the header
+    pub fn offset_size_minus_one(&self) -> Result<u8, ArrowError> {
+        if self.bytes.is_empty() {
+            Err(ArrowError::InvalidArgumentError(
+                "Tried to get offset_size_minus_one from header, but 
self.bytes buffer is emtpy."
+                    .to_string(),
+            ))
+        } else {
+            Ok(self.bytes[0] & (0b11 << 6)) // Grab the last 2 bits
+        }
+    }
+
+    /// Get the offset_size
+    pub fn offset_size(&self) -> Result<u8, ArrowError> {
+        Ok(self.offset_size_minus_one()? + 1)
+    }
+
+    /// Get the offsets as an iterator
+    // TODO: Do we want this kind of API?
+    // TODO: Test once API is agreed upon
+    pub fn offsets(&'m self) -> Result<impl Iterator<Item = (usize, usize)> + 
'm, ArrowError> {
+        struct OffsetIterators<'m> {
+            buffer: &'m [u8],
+            total: usize,

Review Comment:
   Maybe we could call this `dict_len` for consistency 



##########
parquet-variant/src/variant.rs:
##########
@@ -0,0 +1,321 @@
+use std::{borrow::Cow, ops::Index};
+
+use crate::decoder::{self, get_variant_type};
+use arrow_schema::ArrowError;
+use strum_macros::EnumDiscriminants;
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+/// Encodes the Variant Metadata, see the Variant spec file for more 
information
+pub struct VariantMetadata<'m> {
+    bytes: &'m [u8],
+}
+
+impl<'m> VariantMetadata<'m> {
+    /// View the raw bytes (needed by very low-level decoders)
+    #[inline]
+    pub const fn as_bytes(&self) -> &'m [u8] {
+        self.bytes
+    }
+
+    /// Whether the dictionary keys are sorted and unique
+    pub fn is_sorted(&self) -> bool {
+        todo!()
+    }
+
+    /// Get the dict length
+    pub fn dict_len(&self) -> Result<usize, ArrowError> {
+        let end_location = self.offset_size()? as usize + 1;
+        if self.bytes.len() < end_location {
+            let err_str = format!(
+                "Invalid metadata bytes, must have at least length {} but has 
length {}",
+                &end_location,
+                self.bytes.len()
+            );
+            return Err(ArrowError::InvalidArgumentError(err_str));
+        }
+        let dict_len_bytes = &self.bytes[1..end_location];
+        let dict_len = 
usize::from_le_bytes(dict_len_bytes.try_into().map_err(|e| {
+            ArrowError::InvalidArgumentError(format!(
+                "Unable to convert dictionary_size bytes into usize: {}",
+                e,
+            ))
+        })?);
+        Ok(dict_len)
+    }
+    pub fn version(&self) -> usize {
+        todo!()
+    }
+
+    /// Get the offset by index
+    pub fn get_offset_by(&self, index: usize) -> Result<usize, ArrowError> {
+        todo!()
+    }
+
+    /// Get the header byte, which has the following form
+    ///              7     6  5   4  3             0
+    ///             +-------+---+---+---------------+
+    /// header      |       |   |   |    version    |
+    ///             +-------+---+---+---------------+
+    ///                 ^         ^
+    ///                 |         +-- sorted_strings
+    ///                 +-- offset_size_minus_one
+    /// The version is a 4-bit value that must always contain the value 1.
+    /// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+    /// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+    /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
+    pub fn header(&self) -> Result<u8, ArrowError> {
+        if self.bytes.is_empty() {
+            return Err(ArrowError::InvalidArgumentError(
+                "Can't get header from empty buffer".to_string(),
+            ));
+        }
+        Ok(self.bytes[0])
+    }
+
+    /// Get the offset_minus_one value from the header
+    pub fn offset_size_minus_one(&self) -> Result<u8, ArrowError> {
+        if self.bytes.is_empty() {
+            Err(ArrowError::InvalidArgumentError(
+                "Tried to get offset_size_minus_one from header, but 
self.bytes buffer is emtpy."
+                    .to_string(),
+            ))
+        } else {
+            Ok(self.bytes[0] & (0b11 << 6)) // Grab the last 2 bits
+        }
+    }
+
+    /// Get the offset_size
+    pub fn offset_size(&self) -> Result<u8, ArrowError> {
+        Ok(self.offset_size_minus_one()? + 1)
+    }
+
+    /// Get the offsets as an iterator
+    // TODO: Do we want this kind of API?
+    // TODO: Test once API is agreed upon
+    pub fn offsets(&'m self) -> Result<impl Iterator<Item = (usize, usize)> + 
'm, ArrowError> {
+        struct OffsetIterators<'m> {
+            buffer: &'m [u8],
+            total: usize,
+            seen: usize,
+            offset_size: usize,
+        }
+        impl<'m> Iterator for OffsetIterators<'m> {
+            type Item = (usize, usize); // (start, end) positions of the bytes
+
+            // TODO: Check bounds here or ensure they're correct
+
+            fn next(&mut self) -> Option<Self::Item> {
+                // +1 to skip the first offset
+                if self.seen < self.total {
+                    let start = usize::from_le_bytes(
+                        self.buffer[(self.seen ) * self.offset_size + 1 // +1 
to skip header
+                            ..(self.seen ) * self.offset_size + 1]
+                            .try_into()
+                            .ok()?,
+                    );
+                    self.seen += 1;
+                    let end = usize::from_le_bytes(
+                        self.buffer[(self.seen ) * self.offset_size + 1 // +1 
to skip header
+                            ..(self.seen ) * self.offset_size + 1]
+                            .try_into()
+                            .ok()?,
+                    );
+
+                    Some((start, end))
+                } else {
+                    None
+                }
+            }
+        }
+        let iterator: OffsetIterators = OffsetIterators {
+            buffer: self.bytes,
+            total: self.dict_len()?,
+            seen: 0,
+            offset_size: self.offset_size()? as usize,
+        };
+        Ok(iterator)
+    }
+    /// Get the key-name-bytes by index
+    pub fn get_by(&self, index: usize) -> Result<&'m str, ArrowError> {
+        todo!()
+    }
+    /// Get all key-names as an Iterator of strings
+    // TODO: Result
+    pub fn fields(&self) -> impl Iterator<Item = &'m str> {
+        // Do the same as for offsets
+        todo!();
+        vec![].into_iter()
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct VariantObject<'m, 'v> {
+    pub metadata: VariantMetadata<'m>,
+    pub value: &'v [u8],
+}
+
+impl<'m, 'v> VariantObject<'m, 'v> {
+    pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 
'v>)>, ArrowError> {
+        todo!();
+        #[allow(unreachable_code)] // Just to infer the return type
+        Ok(vec![].into_iter())
+    }
+
+    pub fn field(&self, _name: &'m str) -> Result<Variant<'m, 'v>, ArrowError> 
{
+        todo!()
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct VariantArray<'m, 'v> {
+    pub metadata: VariantMetadata<'m>,
+    pub value: &'v [u8],
+}
+
+// TODO: Let's agree on the API here, also should we expose a way to get the 
values as a vec of

Review Comment:
   I don't think we need to expose a vec of variants. If anyone needs a Vec 
they could get one via vailues
   
   ```rut
   let values: Vec<_> = variant_array.values().collect());
   ```



##########
parquet-variant/src/variant.rs:
##########
@@ -0,0 +1,321 @@
+use std::{borrow::Cow, ops::Index};
+
+use crate::decoder::{self, get_variant_type};
+use arrow_schema::ArrowError;
+use strum_macros::EnumDiscriminants;
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+/// Encodes the Variant Metadata, see the Variant spec file for more 
information
+pub struct VariantMetadata<'m> {
+    bytes: &'m [u8],
+}
+
+impl<'m> VariantMetadata<'m> {
+    /// View the raw bytes (needed by very low-level decoders)
+    #[inline]
+    pub const fn as_bytes(&self) -> &'m [u8] {
+        self.bytes
+    }
+
+    /// Whether the dictionary keys are sorted and unique
+    pub fn is_sorted(&self) -> bool {
+        todo!()
+    }
+
+    /// Get the dict length
+    pub fn dict_len(&self) -> Result<usize, ArrowError> {
+        let end_location = self.offset_size()? as usize + 1;
+        if self.bytes.len() < end_location {
+            let err_str = format!(
+                "Invalid metadata bytes, must have at least length {} but has 
length {}",
+                &end_location,
+                self.bytes.len()
+            );
+            return Err(ArrowError::InvalidArgumentError(err_str));
+        }
+        let dict_len_bytes = &self.bytes[1..end_location];
+        let dict_len = 
usize::from_le_bytes(dict_len_bytes.try_into().map_err(|e| {
+            ArrowError::InvalidArgumentError(format!(
+                "Unable to convert dictionary_size bytes into usize: {}",
+                e,
+            ))
+        })?);
+        Ok(dict_len)
+    }
+    pub fn version(&self) -> usize {
+        todo!()
+    }
+
+    /// Get the offset by index
+    pub fn get_offset_by(&self, index: usize) -> Result<usize, ArrowError> {
+        todo!()
+    }
+
+    /// Get the header byte, which has the following form
+    ///              7     6  5   4  3             0
+    ///             +-------+---+---+---------------+
+    /// header      |       |   |   |    version    |
+    ///             +-------+---+---+---------------+
+    ///                 ^         ^
+    ///                 |         +-- sorted_strings
+    ///                 +-- offset_size_minus_one
+    /// The version is a 4-bit value that must always contain the value 1.
+    /// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+    /// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+    /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
+    pub fn header(&self) -> Result<u8, ArrowError> {
+        if self.bytes.is_empty() {
+            return Err(ArrowError::InvalidArgumentError(
+                "Can't get header from empty buffer".to_string(),
+            ));
+        }
+        Ok(self.bytes[0])
+    }
+
+    /// Get the offset_minus_one value from the header
+    pub fn offset_size_minus_one(&self) -> Result<u8, ArrowError> {
+        if self.bytes.is_empty() {
+            Err(ArrowError::InvalidArgumentError(
+                "Tried to get offset_size_minus_one from header, but 
self.bytes buffer is emtpy."
+                    .to_string(),
+            ))
+        } else {
+            Ok(self.bytes[0] & (0b11 << 6)) // Grab the last 2 bits
+        }
+    }
+
+    /// Get the offset_size
+    pub fn offset_size(&self) -> Result<u8, ArrowError> {
+        Ok(self.offset_size_minus_one()? + 1)
+    }
+
+    /// Get the offsets as an iterator
+    // TODO: Do we want this kind of API?
+    // TODO: Test once API is agreed upon
+    pub fn offsets(&'m self) -> Result<impl Iterator<Item = (usize, usize)> + 
'm, ArrowError> {
+        struct OffsetIterators<'m> {
+            buffer: &'m [u8],
+            total: usize,
+            seen: usize,
+            offset_size: usize,
+        }
+        impl<'m> Iterator for OffsetIterators<'m> {
+            type Item = (usize, usize); // (start, end) positions of the bytes
+
+            // TODO: Check bounds here or ensure they're correct
+
+            fn next(&mut self) -> Option<Self::Item> {
+                // +1 to skip the first offset
+                if self.seen < self.total {
+                    let start = usize::from_le_bytes(
+                        self.buffer[(self.seen ) * self.offset_size + 1 // +1 
to skip header
+                            ..(self.seen ) * self.offset_size + 1]
+                            .try_into()
+                            .ok()?,
+                    );
+                    self.seen += 1;
+                    let end = usize::from_le_bytes(
+                        self.buffer[(self.seen ) * self.offset_size + 1 // +1 
to skip header
+                            ..(self.seen ) * self.offset_size + 1]
+                            .try_into()
+                            .ok()?,
+                    );
+
+                    Some((start, end))
+                } else {
+                    None
+                }
+            }
+        }
+        let iterator: OffsetIterators = OffsetIterators {
+            buffer: self.bytes,
+            total: self.dict_len()?,
+            seen: 0,
+            offset_size: self.offset_size()? as usize,
+        };
+        Ok(iterator)
+    }
+    /// Get the key-name-bytes by index
+    pub fn get_by(&self, index: usize) -> Result<&'m str, ArrowError> {
+        todo!()
+    }
+    /// Get all key-names as an Iterator of strings
+    // TODO: Result
+    pub fn fields(&self) -> impl Iterator<Item = &'m str> {
+        // Do the same as for offsets
+        todo!();
+        vec![].into_iter()
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct VariantObject<'m, 'v> {
+    pub metadata: VariantMetadata<'m>,
+    pub value: &'v [u8],
+}
+
+impl<'m, 'v> VariantObject<'m, 'v> {
+    pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 
'v>)>, ArrowError> {
+        todo!();
+        #[allow(unreachable_code)] // Just to infer the return type
+        Ok(vec![].into_iter())
+    }
+
+    pub fn field(&self, _name: &'m str) -> Result<Variant<'m, 'v>, ArrowError> 
{
+        todo!()
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct VariantArray<'m, 'v> {
+    pub metadata: VariantMetadata<'m>,
+    pub value: &'v [u8],
+}
+
+// TODO: Let's agree on the API here, also should we expose a way to get the 
values as a vec of
+// variants for those who want it? Would require allocations.
+impl<'m, 'v> VariantArray<'m, 'v> {
+    pub fn len(&self) -> usize {
+        todo!()
+    }
+
+    pub fn values(&self) -> Result<impl Iterator<Item = Variant<'m, 'v>>, 
ArrowError> {
+        todo!();
+        #[allow(unreachable_code)] // Just to infer the return type
+        Ok(vec![].into_iter())
+    }
+}
+
+impl<'m, 'v> Index<usize> for VariantArray<'m, 'v> {
+    type Output = Variant<'m, 'v>;
+
+    fn index(&self, _index: usize) -> &Self::Output {
+        todo!()
+    }
+}
+
+/// Variant value. May contain references to metadata and value
+// TODO: Add copy if no Cow on String and Shortstring?
+#[derive(Clone, Debug, PartialEq, EnumDiscriminants)]
+#[strum_discriminants(name(VariantType))]
+pub enum Variant<'m, 'v> {
+    // TODO: Add 'legs' for the rest of the primitive types, once API is 
agreed upon
+    Null,
+    Int8(i8),
+
+    BooleanTrue,
+    BooleanFalse,
+
+    // only need the *value* buffer
+    // TODO: Do we want Cow<'v, str> over &'v str? It enables From<String> - 
discuss on PR
+    String(Cow<'v, str>),
+    ShortString(Cow<'v, str>),
+
+    // need both metadata & value
+    Object(VariantObject<'m, 'v>),
+    Array(VariantArray<'m, 'v>),
+}
+
+impl<'m, 'v> Variant<'m, 'v> {
+    /// Parse the buffers and return the appropriate variant.
+    pub fn try_new(metadata: &'m [u8], value: &'v [u8]) -> Result<Self, 
ArrowError> {
+        Ok(match get_variant_type(value)? {
+            VariantType::Null => Variant::Null,
+            VariantType::BooleanTrue => Variant::BooleanTrue,
+            VariantType::BooleanFalse => Variant::BooleanFalse,
+
+            VariantType::Int8 => Variant::Int8(decoder::decode_int8(value)?),
+
+            // TODO: Add 'legs' for the rest of the primitive types, once API 
is agreed upon
+            VariantType::String => {
+                
Variant::String(Cow::Borrowed(decoder::decode_long_string(value)?))
+            }
+
+            VariantType::ShortString => {
+                
Variant::ShortString(Cow::Borrowed(decoder::decode_short_string(value)?))
+            }
+
+            VariantType::Object => Variant::Object(VariantObject {
+                metadata: VariantMetadata { bytes: metadata },
+                value,
+            }),
+            VariantType::Array => Variant::Array(VariantArray {
+                metadata: VariantMetadata { bytes: metadata },
+                value,
+            }),
+        })
+    }
+
+    pub fn as_null(&self) -> Option<()> {
+        match self {
+            Variant::Null => Some(()),
+            _ => None,
+        }
+    }
+
+    pub fn as_boolean(&self) -> Option<bool> {
+        match self {
+            Variant::BooleanTrue => Some(true),
+            Variant::BooleanFalse => Some(false),
+            _ => None,
+        }
+    }
+
+    pub fn as_string(&'v self) -> Option<&'v str> {
+        match self {
+            Variant::String(s) | Variant::ShortString(s) => Some(s),
+            _ => None,
+        }
+    }
+
+    pub fn as_int8(&self) -> Option<i8> {
+        match self {
+            Variant::Int8(i) => Some(*i),

Review Comment:
   One question I think @scovich mentioned was if this kind of accessor should 
automatically cast other types
   
   For example when we implement Variant::Int16, would we try and convert it to 
a i8 here?
   
   I personally think we should try to do so loslessly and users who want to 
handle the types explicitly can match on the variant itself.
   
   ```rust
   pub fn as_int8(&self) -> Option<i8> {
           match self {
               Variant::Int8(i) => Some(*i),
               // return None if the value stored in i16 can not be read as i8
               Variant::Int16(i) => i.try_into().ok()
               _ => None,
           }
       }



##########
parquet-variant/src/variant.rs:
##########
@@ -0,0 +1,321 @@
+use std::{borrow::Cow, ops::Index};
+
+use crate::decoder::{self, get_variant_type};
+use arrow_schema::ArrowError;
+use strum_macros::EnumDiscriminants;
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+/// Encodes the Variant Metadata, see the Variant spec file for more 
information
+pub struct VariantMetadata<'m> {
+    bytes: &'m [u8],
+}
+
+impl<'m> VariantMetadata<'m> {
+    /// View the raw bytes (needed by very low-level decoders)
+    #[inline]
+    pub const fn as_bytes(&self) -> &'m [u8] {
+        self.bytes
+    }
+
+    /// Whether the dictionary keys are sorted and unique
+    pub fn is_sorted(&self) -> bool {
+        todo!()
+    }
+
+    /// Get the dict length
+    pub fn dict_len(&self) -> Result<usize, ArrowError> {
+        let end_location = self.offset_size()? as usize + 1;
+        if self.bytes.len() < end_location {
+            let err_str = format!(
+                "Invalid metadata bytes, must have at least length {} but has 
length {}",
+                &end_location,
+                self.bytes.len()
+            );
+            return Err(ArrowError::InvalidArgumentError(err_str));
+        }
+        let dict_len_bytes = &self.bytes[1..end_location];
+        let dict_len = 
usize::from_le_bytes(dict_len_bytes.try_into().map_err(|e| {
+            ArrowError::InvalidArgumentError(format!(
+                "Unable to convert dictionary_size bytes into usize: {}",
+                e,
+            ))
+        })?);
+        Ok(dict_len)
+    }
+    pub fn version(&self) -> usize {
+        todo!()
+    }
+
+    /// Get the offset by index
+    pub fn get_offset_by(&self, index: usize) -> Result<usize, ArrowError> {
+        todo!()
+    }
+
+    /// Get the header byte, which has the following form
+    ///              7     6  5   4  3             0
+    ///             +-------+---+---+---------------+
+    /// header      |       |   |   |    version    |
+    ///             +-------+---+---+---------------+
+    ///                 ^         ^
+    ///                 |         +-- sorted_strings
+    ///                 +-- offset_size_minus_one
+    /// The version is a 4-bit value that must always contain the value 1.
+    /// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+    /// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+    /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
+    pub fn header(&self) -> Result<u8, ArrowError> {
+        if self.bytes.is_empty() {
+            return Err(ArrowError::InvalidArgumentError(
+                "Can't get header from empty buffer".to_string(),
+            ));
+        }
+        Ok(self.bytes[0])
+    }
+
+    /// Get the offset_minus_one value from the header
+    pub fn offset_size_minus_one(&self) -> Result<u8, ArrowError> {
+        if self.bytes.is_empty() {
+            Err(ArrowError::InvalidArgumentError(
+                "Tried to get offset_size_minus_one from header, but 
self.bytes buffer is emtpy."
+                    .to_string(),
+            ))
+        } else {
+            Ok(self.bytes[0] & (0b11 << 6)) // Grab the last 2 bits
+        }
+    }
+
+    /// Get the offset_size
+    pub fn offset_size(&self) -> Result<u8, ArrowError> {
+        Ok(self.offset_size_minus_one()? + 1)
+    }
+
+    /// Get the offsets as an iterator
+    // TODO: Do we want this kind of API?
+    // TODO: Test once API is agreed upon
+    pub fn offsets(&'m self) -> Result<impl Iterator<Item = (usize, usize)> + 
'm, ArrowError> {
+        struct OffsetIterators<'m> {
+            buffer: &'m [u8],
+            total: usize,
+            seen: usize,
+            offset_size: usize,
+        }
+        impl<'m> Iterator for OffsetIterators<'m> {
+            type Item = (usize, usize); // (start, end) positions of the bytes
+
+            // TODO: Check bounds here or ensure they're correct
+
+            fn next(&mut self) -> Option<Self::Item> {
+                // +1 to skip the first offset
+                if self.seen < self.total {
+                    let start = usize::from_le_bytes(
+                        self.buffer[(self.seen ) * self.offset_size + 1 // +1 
to skip header
+                            ..(self.seen ) * self.offset_size + 1]
+                            .try_into()
+                            .ok()?,
+                    );
+                    self.seen += 1;
+                    let end = usize::from_le_bytes(
+                        self.buffer[(self.seen ) * self.offset_size + 1 // +1 
to skip header
+                            ..(self.seen ) * self.offset_size + 1]
+                            .try_into()
+                            .ok()?,
+                    );
+
+                    Some((start, end))
+                } else {
+                    None
+                }
+            }
+        }
+        let iterator: OffsetIterators = OffsetIterators {
+            buffer: self.bytes,
+            total: self.dict_len()?,
+            seen: 0,
+            offset_size: self.offset_size()? as usize,
+        };
+        Ok(iterator)
+    }
+    /// Get the key-name-bytes by index
+    pub fn get_by(&self, index: usize) -> Result<&'m str, ArrowError> {
+        todo!()
+    }
+    /// Get all key-names as an Iterator of strings
+    // TODO: Result
+    pub fn fields(&self) -> impl Iterator<Item = &'m str> {
+        // Do the same as for offsets
+        todo!();
+        vec![].into_iter()
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct VariantObject<'m, 'v> {
+    pub metadata: VariantMetadata<'m>,
+    pub value: &'v [u8],
+}
+
+impl<'m, 'v> VariantObject<'m, 'v> {
+    pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 
'v>)>, ArrowError> {
+        todo!();
+        #[allow(unreachable_code)] // Just to infer the return type
+        Ok(vec![].into_iter())
+    }
+
+    pub fn field(&self, _name: &'m str) -> Result<Variant<'m, 'v>, ArrowError> 
{
+        todo!()
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct VariantArray<'m, 'v> {
+    pub metadata: VariantMetadata<'m>,
+    pub value: &'v [u8],
+}
+
+// TODO: Let's agree on the API here, also should we expose a way to get the 
values as a vec of
+// variants for those who want it? Would require allocations.
+impl<'m, 'v> VariantArray<'m, 'v> {
+    pub fn len(&self) -> usize {
+        todo!()
+    }
+
+    pub fn values(&self) -> Result<impl Iterator<Item = Variant<'m, 'v>>, 
ArrowError> {
+        todo!();
+        #[allow(unreachable_code)] // Just to infer the return type
+        Ok(vec![].into_iter())
+    }
+}
+
+impl<'m, 'v> Index<usize> for VariantArray<'m, 'v> {
+    type Output = Variant<'m, 'v>;
+
+    fn index(&self, _index: usize) -> &Self::Output {
+        todo!()
+    }
+}
+
+/// Variant value. May contain references to metadata and value
+// TODO: Add copy if no Cow on String and Shortstring?
+#[derive(Clone, Debug, PartialEq, EnumDiscriminants)]
+#[strum_discriminants(name(VariantType))]
+pub enum Variant<'m, 'v> {
+    // TODO: Add 'legs' for the rest of the primitive types, once API is 
agreed upon
+    Null,
+    Int8(i8),
+
+    BooleanTrue,
+    BooleanFalse,
+
+    // only need the *value* buffer
+    // TODO: Do we want Cow<'v, str> over &'v str? It enables From<String> - 
discuss on PR
+    String(Cow<'v, str>),
+    ShortString(Cow<'v, str>),
+
+    // need both metadata & value
+    Object(VariantObject<'m, 'v>),
+    Array(VariantArray<'m, 'v>),
+}
+
+impl<'m, 'v> Variant<'m, 'v> {
+    /// Parse the buffers and return the appropriate variant.
+    pub fn try_new(metadata: &'m [u8], value: &'v [u8]) -> Result<Self, 
ArrowError> {
+        Ok(match get_variant_type(value)? {
+            VariantType::Null => Variant::Null,
+            VariantType::BooleanTrue => Variant::BooleanTrue,
+            VariantType::BooleanFalse => Variant::BooleanFalse,
+
+            VariantType::Int8 => Variant::Int8(decoder::decode_int8(value)?),
+
+            // TODO: Add 'legs' for the rest of the primitive types, once API 
is agreed upon
+            VariantType::String => {
+                
Variant::String(Cow::Borrowed(decoder::decode_long_string(value)?))
+            }
+
+            VariantType::ShortString => {
+                
Variant::ShortString(Cow::Borrowed(decoder::decode_short_string(value)?))
+            }
+
+            VariantType::Object => Variant::Object(VariantObject {
+                metadata: VariantMetadata { bytes: metadata },
+                value,
+            }),
+            VariantType::Array => Variant::Array(VariantArray {
+                metadata: VariantMetadata { bytes: metadata },
+                value,
+            }),
+        })
+    }
+
+    pub fn as_null(&self) -> Option<()> {

Review Comment:
   This is a nice API



##########
parquet-variant/src/variant.rs:
##########
@@ -0,0 +1,321 @@
+use std::{borrow::Cow, ops::Index};
+
+use crate::decoder::{self, get_variant_type};
+use arrow_schema::ArrowError;
+use strum_macros::EnumDiscriminants;
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+/// Encodes the Variant Metadata, see the Variant spec file for more 
information

Review Comment:
   I think this API looks pretty good to me.
   
   One thing that we might want to do is to make a `VariantMetadata::try_new` 
function that checks and parses out the sorted_strings, dict_len, etc fields. 
To do anything with the metadata those fields will need to be decoded, so it 
seems like it would be a nicer API to compute and validate them once on 
constructon rather than on each access



##########
parquet-variant/src/variant.rs:
##########
@@ -0,0 +1,321 @@
+use std::{borrow::Cow, ops::Index};
+
+use crate::decoder::{self, get_variant_type};
+use arrow_schema::ArrowError;
+use strum_macros::EnumDiscriminants;
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+/// Encodes the Variant Metadata, see the Variant spec file for more 
information
+pub struct VariantMetadata<'m> {
+    bytes: &'m [u8],
+}
+
+impl<'m> VariantMetadata<'m> {
+    /// View the raw bytes (needed by very low-level decoders)
+    #[inline]
+    pub const fn as_bytes(&self) -> &'m [u8] {
+        self.bytes
+    }
+
+    /// Whether the dictionary keys are sorted and unique
+    pub fn is_sorted(&self) -> bool {
+        todo!()
+    }
+
+    /// Get the dict length
+    pub fn dict_len(&self) -> Result<usize, ArrowError> {
+        let end_location = self.offset_size()? as usize + 1;
+        if self.bytes.len() < end_location {
+            let err_str = format!(
+                "Invalid metadata bytes, must have at least length {} but has 
length {}",
+                &end_location,
+                self.bytes.len()
+            );
+            return Err(ArrowError::InvalidArgumentError(err_str));
+        }
+        let dict_len_bytes = &self.bytes[1..end_location];
+        let dict_len = 
usize::from_le_bytes(dict_len_bytes.try_into().map_err(|e| {
+            ArrowError::InvalidArgumentError(format!(
+                "Unable to convert dictionary_size bytes into usize: {}",
+                e,
+            ))
+        })?);
+        Ok(dict_len)
+    }
+    pub fn version(&self) -> usize {
+        todo!()
+    }
+
+    /// Get the offset by index
+    pub fn get_offset_by(&self, index: usize) -> Result<usize, ArrowError> {
+        todo!()
+    }
+
+    /// Get the header byte, which has the following form
+    ///              7     6  5   4  3             0
+    ///             +-------+---+---+---------------+
+    /// header      |       |   |   |    version    |
+    ///             +-------+---+---+---------------+
+    ///                 ^         ^
+    ///                 |         +-- sorted_strings
+    ///                 +-- offset_size_minus_one
+    /// The version is a 4-bit value that must always contain the value 1.
+    /// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+    /// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+    /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
+    pub fn header(&self) -> Result<u8, ArrowError> {
+        if self.bytes.is_empty() {
+            return Err(ArrowError::InvalidArgumentError(
+                "Can't get header from empty buffer".to_string(),
+            ));
+        }
+        Ok(self.bytes[0])
+    }
+
+    /// Get the offset_minus_one value from the header
+    pub fn offset_size_minus_one(&self) -> Result<u8, ArrowError> {
+        if self.bytes.is_empty() {
+            Err(ArrowError::InvalidArgumentError(
+                "Tried to get offset_size_minus_one from header, but 
self.bytes buffer is emtpy."
+                    .to_string(),
+            ))
+        } else {
+            Ok(self.bytes[0] & (0b11 << 6)) // Grab the last 2 bits
+        }
+    }
+
+    /// Get the offset_size
+    pub fn offset_size(&self) -> Result<u8, ArrowError> {
+        Ok(self.offset_size_minus_one()? + 1)
+    }
+
+    /// Get the offsets as an iterator
+    // TODO: Do we want this kind of API?
+    // TODO: Test once API is agreed upon
+    pub fn offsets(&'m self) -> Result<impl Iterator<Item = (usize, usize)> + 
'm, ArrowError> {
+        struct OffsetIterators<'m> {
+            buffer: &'m [u8],
+            total: usize,
+            seen: usize,
+            offset_size: usize,
+        }
+        impl<'m> Iterator for OffsetIterators<'m> {
+            type Item = (usize, usize); // (start, end) positions of the bytes
+
+            // TODO: Check bounds here or ensure they're correct
+
+            fn next(&mut self) -> Option<Self::Item> {
+                // +1 to skip the first offset
+                if self.seen < self.total {
+                    let start = usize::from_le_bytes(
+                        self.buffer[(self.seen ) * self.offset_size + 1 // +1 
to skip header
+                            ..(self.seen ) * self.offset_size + 1]
+                            .try_into()
+                            .ok()?,
+                    );
+                    self.seen += 1;
+                    let end = usize::from_le_bytes(
+                        self.buffer[(self.seen ) * self.offset_size + 1 // +1 
to skip header
+                            ..(self.seen ) * self.offset_size + 1]
+                            .try_into()
+                            .ok()?,
+                    );
+
+                    Some((start, end))
+                } else {
+                    None
+                }
+            }
+        }
+        let iterator: OffsetIterators = OffsetIterators {
+            buffer: self.bytes,
+            total: self.dict_len()?,
+            seen: 0,
+            offset_size: self.offset_size()? as usize,
+        };
+        Ok(iterator)
+    }
+    /// Get the key-name-bytes by index
+    pub fn get_by(&self, index: usize) -> Result<&'m str, ArrowError> {
+        todo!()
+    }
+    /// Get all key-names as an Iterator of strings
+    // TODO: Result
+    pub fn fields(&self) -> impl Iterator<Item = &'m str> {
+        // Do the same as for offsets
+        todo!();
+        vec![].into_iter()
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct VariantObject<'m, 'v> {
+    pub metadata: VariantMetadata<'m>,
+    pub value: &'v [u8],
+}
+
+impl<'m, 'v> VariantObject<'m, 'v> {
+    pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 
'v>)>, ArrowError> {
+        todo!();
+        #[allow(unreachable_code)] // Just to infer the return type
+        Ok(vec![].into_iter())
+    }
+
+    pub fn field(&self, _name: &'m str) -> Result<Variant<'m, 'v>, ArrowError> 
{
+        todo!()
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct VariantArray<'m, 'v> {
+    pub metadata: VariantMetadata<'m>,
+    pub value: &'v [u8],
+}
+
+// TODO: Let's agree on the API here, also should we expose a way to get the 
values as a vec of
+// variants for those who want it? Would require allocations.
+impl<'m, 'v> VariantArray<'m, 'v> {
+    pub fn len(&self) -> usize {
+        todo!()
+    }
+
+    pub fn values(&self) -> Result<impl Iterator<Item = Variant<'m, 'v>>, 
ArrowError> {
+        todo!();
+        #[allow(unreachable_code)] // Just to infer the return type
+        Ok(vec![].into_iter())
+    }
+}
+
+impl<'m, 'v> Index<usize> for VariantArray<'m, 'v> {
+    type Output = Variant<'m, 'v>;
+
+    fn index(&self, _index: usize) -> &Self::Output {
+        todo!()
+    }
+}
+
+/// Variant value. May contain references to metadata and value
+// TODO: Add copy if no Cow on String and Shortstring?
+#[derive(Clone, Debug, PartialEq, EnumDiscriminants)]
+#[strum_discriminants(name(VariantType))]
+pub enum Variant<'m, 'v> {
+    // TODO: Add 'legs' for the rest of the primitive types, once API is 
agreed upon
+    Null,
+    Int8(i8),
+
+    BooleanTrue,
+    BooleanFalse,
+
+    // only need the *value* buffer
+    // TODO: Do we want Cow<'v, str> over &'v str? It enables From<String> - 
discuss on PR
+    String(Cow<'v, str>),

Review Comment:
   I am not sure how useful `From<String>` is once we have the `VariantBuilder` 
API
   
   I think it would be better to have this be `String(&'v str)`
   
   Perhaps the challenge is that with the current formulation, there is no way 
to have an "owned" Variant 🤔 



##########
parquet-variant/src/test_variant.rs:
##########
@@ -0,0 +1,49 @@
+//! End-to-end check: (almost) every sample from apache/parquet-testing/variant
+//! can be parsed into our `Variant`.
+
+// NOTE: We keep this file separate rather than a test mod inside variant.rs 
because it should be

Review Comment:
   💯 



##########
parquet-variant/src/test_variant.rs:
##########
@@ -0,0 +1,49 @@
+//! End-to-end check: (almost) every sample from apache/parquet-testing/variant
+//! can be parsed into our `Variant`.
+
+// NOTE: We keep this file separate rather than a test mod inside variant.rs 
because it should be
+// moved to the test folder later
+use std::fs;
+use std::path::{Path, PathBuf};
+
+use crate::variant::Variant;
+use arrow_schema::ArrowError;
+
+fn cases_dir() -> PathBuf {
+    Path::new(env!("CARGO_MANIFEST_DIR"))
+        .join("..")
+        .join("parquet-testing")
+        .join("variant")
+}
+
+fn load_case(name: &str) -> Result<(Vec<u8>, Vec<u8>), ArrowError> {
+    let root = cases_dir();
+    let meta = fs::read(root.join(format!("{name}.metadata")))?;
+    let val = fs::read(root.join(format!("{name}.value")))?;
+    Ok((meta, val))
+}
+
+fn get_cases() -> Vec<(&'static str, Variant<'static, 'static>)> {

Review Comment:
   Amazing! It is working!



##########
parquet-variant/src/variant.rs:
##########
@@ -0,0 +1,321 @@
+use std::{borrow::Cow, ops::Index};
+
+use crate::decoder::{self, get_variant_type};
+use arrow_schema::ArrowError;
+use strum_macros::EnumDiscriminants;
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+/// Encodes the Variant Metadata, see the Variant spec file for more 
information
+pub struct VariantMetadata<'m> {
+    bytes: &'m [u8],
+}
+
+impl<'m> VariantMetadata<'m> {
+    /// View the raw bytes (needed by very low-level decoders)
+    #[inline]
+    pub const fn as_bytes(&self) -> &'m [u8] {
+        self.bytes
+    }
+
+    /// Whether the dictionary keys are sorted and unique
+    pub fn is_sorted(&self) -> bool {
+        todo!()
+    }
+
+    /// Get the dict length
+    pub fn dict_len(&self) -> Result<usize, ArrowError> {
+        let end_location = self.offset_size()? as usize + 1;
+        if self.bytes.len() < end_location {
+            let err_str = format!(
+                "Invalid metadata bytes, must have at least length {} but has 
length {}",
+                &end_location,
+                self.bytes.len()
+            );
+            return Err(ArrowError::InvalidArgumentError(err_str));
+        }
+        let dict_len_bytes = &self.bytes[1..end_location];
+        let dict_len = 
usize::from_le_bytes(dict_len_bytes.try_into().map_err(|e| {
+            ArrowError::InvalidArgumentError(format!(
+                "Unable to convert dictionary_size bytes into usize: {}",
+                e,
+            ))
+        })?);
+        Ok(dict_len)
+    }
+    pub fn version(&self) -> usize {
+        todo!()
+    }
+
+    /// Get the offset by index
+    pub fn get_offset_by(&self, index: usize) -> Result<usize, ArrowError> {
+        todo!()
+    }
+
+    /// Get the header byte, which has the following form
+    ///              7     6  5   4  3             0
+    ///             +-------+---+---+---------------+
+    /// header      |       |   |   |    version    |
+    ///             +-------+---+---+---------------+
+    ///                 ^         ^
+    ///                 |         +-- sorted_strings
+    ///                 +-- offset_size_minus_one
+    /// The version is a 4-bit value that must always contain the value 1.
+    /// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+    /// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+    /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
+    pub fn header(&self) -> Result<u8, ArrowError> {
+        if self.bytes.is_empty() {
+            return Err(ArrowError::InvalidArgumentError(
+                "Can't get header from empty buffer".to_string(),
+            ));
+        }
+        Ok(self.bytes[0])
+    }
+
+    /// Get the offset_minus_one value from the header
+    pub fn offset_size_minus_one(&self) -> Result<u8, ArrowError> {
+        if self.bytes.is_empty() {
+            Err(ArrowError::InvalidArgumentError(
+                "Tried to get offset_size_minus_one from header, but 
self.bytes buffer is emtpy."
+                    .to_string(),
+            ))
+        } else {
+            Ok(self.bytes[0] & (0b11 << 6)) // Grab the last 2 bits
+        }
+    }
+
+    /// Get the offset_size
+    pub fn offset_size(&self) -> Result<u8, ArrowError> {
+        Ok(self.offset_size_minus_one()? + 1)
+    }
+
+    /// Get the offsets as an iterator
+    // TODO: Do we want this kind of API?
+    // TODO: Test once API is agreed upon
+    pub fn offsets(&'m self) -> Result<impl Iterator<Item = (usize, usize)> + 
'm, ArrowError> {
+        struct OffsetIterators<'m> {
+            buffer: &'m [u8],
+            total: usize,
+            seen: usize,
+            offset_size: usize,
+        }
+        impl<'m> Iterator for OffsetIterators<'m> {
+            type Item = (usize, usize); // (start, end) positions of the bytes
+
+            // TODO: Check bounds here or ensure they're correct
+
+            fn next(&mut self) -> Option<Self::Item> {
+                // +1 to skip the first offset
+                if self.seen < self.total {
+                    let start = usize::from_le_bytes(
+                        self.buffer[(self.seen ) * self.offset_size + 1 // +1 
to skip header
+                            ..(self.seen ) * self.offset_size + 1]
+                            .try_into()
+                            .ok()?,
+                    );
+                    self.seen += 1;
+                    let end = usize::from_le_bytes(
+                        self.buffer[(self.seen ) * self.offset_size + 1 // +1 
to skip header
+                            ..(self.seen ) * self.offset_size + 1]
+                            .try_into()
+                            .ok()?,
+                    );
+
+                    Some((start, end))
+                } else {
+                    None
+                }
+            }
+        }
+        let iterator: OffsetIterators = OffsetIterators {
+            buffer: self.bytes,
+            total: self.dict_len()?,
+            seen: 0,
+            offset_size: self.offset_size()? as usize,
+        };
+        Ok(iterator)
+    }
+    /// Get the key-name-bytes by index
+    pub fn get_by(&self, index: usize) -> Result<&'m str, ArrowError> {
+        todo!()
+    }
+    /// Get all key-names as an Iterator of strings
+    // TODO: Result
+    pub fn fields(&self) -> impl Iterator<Item = &'m str> {
+        // Do the same as for offsets
+        todo!();
+        vec![].into_iter()
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct VariantObject<'m, 'v> {
+    pub metadata: VariantMetadata<'m>,
+    pub value: &'v [u8],
+}
+
+impl<'m, 'v> VariantObject<'m, 'v> {
+    pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 
'v>)>, ArrowError> {
+        todo!();
+        #[allow(unreachable_code)] // Just to infer the return type
+        Ok(vec![].into_iter())
+    }
+
+    pub fn field(&self, _name: &'m str) -> Result<Variant<'m, 'v>, ArrowError> 
{
+        todo!()
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct VariantArray<'m, 'v> {
+    pub metadata: VariantMetadata<'m>,
+    pub value: &'v [u8],
+}
+
+// TODO: Let's agree on the API here, also should we expose a way to get the 
values as a vec of
+// variants for those who want it? Would require allocations.
+impl<'m, 'v> VariantArray<'m, 'v> {
+    pub fn len(&self) -> usize {
+        todo!()
+    }
+
+    pub fn values(&self) -> Result<impl Iterator<Item = Variant<'m, 'v>>, 
ArrowError> {
+        todo!();
+        #[allow(unreachable_code)] // Just to infer the return type
+        Ok(vec![].into_iter())
+    }
+}
+
+impl<'m, 'v> Index<usize> for VariantArray<'m, 'v> {
+    type Output = Variant<'m, 'v>;
+
+    fn index(&self, _index: usize) -> &Self::Output {
+        todo!()
+    }
+}
+
+/// Variant value. May contain references to metadata and value
+// TODO: Add copy if no Cow on String and Shortstring?
+#[derive(Clone, Debug, PartialEq, EnumDiscriminants)]
+#[strum_discriminants(name(VariantType))]

Review Comment:
   I don't fully understand the need for the `VariantType` here
   
   I see it adds a level of indirection 



##########
parquet-variant/src/variant.rs:
##########
@@ -0,0 +1,321 @@
+use std::{borrow::Cow, ops::Index};
+
+use crate::decoder::{self, get_variant_type};
+use arrow_schema::ArrowError;
+use strum_macros::EnumDiscriminants;
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+/// Encodes the Variant Metadata, see the Variant spec file for more 
information
+pub struct VariantMetadata<'m> {
+    bytes: &'m [u8],
+}
+
+impl<'m> VariantMetadata<'m> {
+    /// View the raw bytes (needed by very low-level decoders)
+    #[inline]
+    pub const fn as_bytes(&self) -> &'m [u8] {
+        self.bytes
+    }
+
+    /// Whether the dictionary keys are sorted and unique
+    pub fn is_sorted(&self) -> bool {
+        todo!()
+    }
+
+    /// Get the dict length
+    pub fn dict_len(&self) -> Result<usize, ArrowError> {
+        let end_location = self.offset_size()? as usize + 1;
+        if self.bytes.len() < end_location {
+            let err_str = format!(
+                "Invalid metadata bytes, must have at least length {} but has 
length {}",
+                &end_location,
+                self.bytes.len()
+            );
+            return Err(ArrowError::InvalidArgumentError(err_str));
+        }
+        let dict_len_bytes = &self.bytes[1..end_location];
+        let dict_len = 
usize::from_le_bytes(dict_len_bytes.try_into().map_err(|e| {
+            ArrowError::InvalidArgumentError(format!(
+                "Unable to convert dictionary_size bytes into usize: {}",
+                e,
+            ))
+        })?);
+        Ok(dict_len)
+    }
+    pub fn version(&self) -> usize {
+        todo!()
+    }
+
+    /// Get the offset by index
+    pub fn get_offset_by(&self, index: usize) -> Result<usize, ArrowError> {
+        todo!()
+    }
+
+    /// Get the header byte, which has the following form
+    ///              7     6  5   4  3             0
+    ///             +-------+---+---+---------------+
+    /// header      |       |   |   |    version    |
+    ///             +-------+---+---+---------------+
+    ///                 ^         ^
+    ///                 |         +-- sorted_strings
+    ///                 +-- offset_size_minus_one
+    /// The version is a 4-bit value that must always contain the value 1.
+    /// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+    /// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+    /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
+    pub fn header(&self) -> Result<u8, ArrowError> {
+        if self.bytes.is_empty() {
+            return Err(ArrowError::InvalidArgumentError(
+                "Can't get header from empty buffer".to_string(),
+            ));
+        }
+        Ok(self.bytes[0])
+    }
+
+    /// Get the offset_minus_one value from the header
+    pub fn offset_size_minus_one(&self) -> Result<u8, ArrowError> {
+        if self.bytes.is_empty() {
+            Err(ArrowError::InvalidArgumentError(
+                "Tried to get offset_size_minus_one from header, but 
self.bytes buffer is emtpy."
+                    .to_string(),
+            ))
+        } else {
+            Ok(self.bytes[0] & (0b11 << 6)) // Grab the last 2 bits
+        }
+    }
+
+    /// Get the offset_size
+    pub fn offset_size(&self) -> Result<u8, ArrowError> {
+        Ok(self.offset_size_minus_one()? + 1)
+    }
+
+    /// Get the offsets as an iterator
+    // TODO: Do we want this kind of API?
+    // TODO: Test once API is agreed upon
+    pub fn offsets(&'m self) -> Result<impl Iterator<Item = (usize, usize)> + 
'm, ArrowError> {
+        struct OffsetIterators<'m> {
+            buffer: &'m [u8],
+            total: usize,
+            seen: usize,
+            offset_size: usize,
+        }
+        impl<'m> Iterator for OffsetIterators<'m> {
+            type Item = (usize, usize); // (start, end) positions of the bytes
+
+            // TODO: Check bounds here or ensure they're correct
+
+            fn next(&mut self) -> Option<Self::Item> {
+                // +1 to skip the first offset
+                if self.seen < self.total {
+                    let start = usize::from_le_bytes(
+                        self.buffer[(self.seen ) * self.offset_size + 1 // +1 
to skip header
+                            ..(self.seen ) * self.offset_size + 1]
+                            .try_into()
+                            .ok()?,

Review Comment:
   this is interesting -- what should we do when we hit an offset that is an 
invlalid usize -- I think this code will basically return `None` (stop 
iterating) rather than erroring in this situation
   
   Maybe when someone calls `iter()` it would make sense to validate all the 
offsets then 🤔 



##########
parquet-variant/src/decoder.rs:
##########
@@ -0,0 +1,199 @@
+// NOTE: Largely based on the implementation of @PinkCrow007 in 
https://github.com/apache/arrow-rs/pull/7452
+// And the feedback there.
+use crate::variant::VariantType;
+use arrow_schema::ArrowError;
+use std::{array::TryFromSliceError, str};
+
+#[derive(Debug, Clone, Copy)]
+pub enum VariantBasicType {
+    Primitive = 0,
+    ShortString = 1,
+    Object = 2,
+    Array = 3,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub enum VariantPrimitiveType {
+    Null = 0,
+    BooleanTrue = 1,
+    BooleanFalse = 2,
+    Int8 = 3,
+    // TODO: Add 'legs' for the rest of primitives, once API is agreed upon
+    String = 16,
+}
+
+/// Extracts the basic type from a header byte
+pub(crate) fn get_basic_type(header: u8) -> Result<VariantBasicType, 
ArrowError> {
+    // See 
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding
+    let basic_type = header & 0x03; // Basic type is encoded in the first 2 
bits
+    let basic_type = match basic_type {
+        0 => VariantBasicType::Primitive,
+        1 => VariantBasicType::ShortString,
+        2 => VariantBasicType::Object,
+        3 => VariantBasicType::Array,
+        _ => {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "unknown basic type: {}",
+                basic_type
+            )))
+        }
+    };
+    Ok(basic_type)
+}
+
+/// Extracts the primitive type from a header byte
+pub(crate) fn get_primitive_type(header: u8) -> Result<VariantPrimitiveType, 
ArrowError> {
+    // See 
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding
+    //// Primitive type is encoded in the last 6 bits of the header byte
+    let primitive_type = (header >> 2) & 0x3F;
+    let primitive_type = match primitive_type {
+        0 => VariantPrimitiveType::Null,
+        1 => VariantPrimitiveType::BooleanTrue,
+        2 => VariantPrimitiveType::BooleanFalse,
+        3 => VariantPrimitiveType::Int8,
+        // TODO: Add 'legs' for the rest, once API is agreed upon
+        16 => VariantPrimitiveType::String,
+        _ => {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "unknown primitive type: {}",
+                primitive_type
+            )))
+        }
+    };
+    Ok(primitive_type)
+}
+
+/// Extracts the variant type from the value section of a variant. The variant
+/// type is defined as the set of all basic types and all primitive types.
+pub fn get_variant_type(value: &[u8]) -> Result<VariantType, ArrowError> {
+    if value.is_empty() {
+        return Err(ArrowError::InvalidArgumentError(
+            "Tried to get variant type from empty buffer array".to_string(),
+        ));
+    }
+    let header = value[0];
+    let variant_type = match get_basic_type(header)? {
+        VariantBasicType::Primitive => match get_primitive_type(header)? {
+            VariantPrimitiveType::Null => VariantType::Null,
+            VariantPrimitiveType::Int8 => VariantType::Int8,
+            VariantPrimitiveType::BooleanTrue => VariantType::BooleanTrue,
+            VariantPrimitiveType::BooleanFalse => VariantType::BooleanFalse,
+            // TODO: Add 'legs' for the rest, once API is agreed upon
+            VariantPrimitiveType::String => VariantType::String,
+        },
+        VariantBasicType::ShortString => VariantType::ShortString,
+        VariantBasicType::Object => VariantType::Object,
+        VariantBasicType::Array => VariantType::Array,
+    };
+    Ok(variant_type)
+}
+
+/// To be used in `map_err` when unpacking an integer from a slice of bytes.
+fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError {
+    ArrowError::InvalidArgumentError(e.to_string())
+}
+
+/// Constructs the error message for an invalid UTF-8 string.
+fn invalid_utf8_err() -> ArrowError {
+    ArrowError::InvalidArgumentError("invalid UTF-8 string".to_string())
+}
+
+/// Decodes an Int8 from the value section of a variant.
+pub(crate) fn decode_int8(value: &[u8]) -> Result<i8, ArrowError> {
+    if value.is_empty() {
+        return Err(ArrowError::InvalidArgumentError(
+            "Got empty value buffer so can't decode into int8.".to_string(),
+        ));
+    }
+    let value = i8::from_le_bytes([value[1]]);
+    Ok(value)
+}
+
+/// Decodes a long string from the value section of a variant.
+pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> {
+    if value.len() < 5 {
+        return Err(ArrowError::InvalidArgumentError(
+            "Tried to decode value buffer into long_string, but it's too short 
(len<5)."
+                .to_string(),
+        ));
+    }
+    let len =
+        
u32::from_le_bytes(value[1..=4].try_into().map_err(map_try_from_slice_error)?) 
as usize;
+    if value.len() < len + 5 {
+        let err_str = format!("The length of the buffer for the long_string is 
soo short, it is {} and it should be at least {} ({} < {} + 5)", value.len(), 
len + 5 , value.len(), len);
+        return Err(ArrowError::InvalidArgumentError(err_str));
+    }
+    let string_bytes = &value[5..5 + len];
+    let string = str::from_utf8(string_bytes).map_err(|_| invalid_utf8_err())?;
+    Ok(string)
+}
+
+/// Decodes a short string from the value section of a variant.
+pub(crate) fn decode_short_string(value: &[u8]) -> Result<&str, ArrowError> {
+    if value.is_empty() {
+        return Err(ArrowError::InvalidArgumentError(
+            "Tried to decode value buffer into short_string, but it's 
empty.".to_string(),
+        ));
+    }
+    let len = ((value[0] & 0b11111100) >> 2) as usize;
+
+    if value.len() < len + 1 {
+        let err_str = format!("The length of the buffer for the short_string 
is too short, it is {} and it should be at least {} ({} < {} + 1)", 
value.len(), len + 1 , value.len(), len);
+        return Err(ArrowError::InvalidArgumentError(err_str));
+    }
+    let string_bytes = &value[1..1 + len];
+    let string = str::from_utf8(string_bytes).map_err(|_| invalid_utf8_err())?;
+    Ok(string)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_i8() -> Result<(), ArrowError> {

Review Comment:
   To really write tests like this it will be nice / easier if we have the 
variant builder API that @PinkCrow007 is working on



##########
parquet-variant/src/decoder.rs:
##########
@@ -0,0 +1,199 @@
+// NOTE: Largely based on the implementation of @PinkCrow007 in 
https://github.com/apache/arrow-rs/pull/7452
+// And the feedback there.
+use crate::variant::VariantType;
+use arrow_schema::ArrowError;
+use std::{array::TryFromSliceError, str};
+
+#[derive(Debug, Clone, Copy)]
+pub enum VariantBasicType {
+    Primitive = 0,
+    ShortString = 1,
+    Object = 2,
+    Array = 3,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub enum VariantPrimitiveType {
+    Null = 0,
+    BooleanTrue = 1,
+    BooleanFalse = 2,
+    Int8 = 3,
+    // TODO: Add 'legs' for the rest of primitives, once API is agreed upon
+    String = 16,
+}
+
+/// Extracts the basic type from a header byte
+pub(crate) fn get_basic_type(header: u8) -> Result<VariantBasicType, 
ArrowError> {
+    // See 
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding
+    let basic_type = header & 0x03; // Basic type is encoded in the first 2 
bits
+    let basic_type = match basic_type {
+        0 => VariantBasicType::Primitive,
+        1 => VariantBasicType::ShortString,
+        2 => VariantBasicType::Object,
+        3 => VariantBasicType::Array,
+        _ => {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "unknown basic type: {}",
+                basic_type
+            )))
+        }
+    };
+    Ok(basic_type)
+}
+
+/// Extracts the primitive type from a header byte
+pub(crate) fn get_primitive_type(header: u8) -> Result<VariantPrimitiveType, 
ArrowError> {
+    // See 
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding
+    //// Primitive type is encoded in the last 6 bits of the header byte
+    let primitive_type = (header >> 2) & 0x3F;
+    let primitive_type = match primitive_type {
+        0 => VariantPrimitiveType::Null,
+        1 => VariantPrimitiveType::BooleanTrue,
+        2 => VariantPrimitiveType::BooleanFalse,
+        3 => VariantPrimitiveType::Int8,
+        // TODO: Add 'legs' for the rest, once API is agreed upon
+        16 => VariantPrimitiveType::String,
+        _ => {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "unknown primitive type: {}",
+                primitive_type
+            )))
+        }
+    };
+    Ok(primitive_type)
+}
+
+/// Extracts the variant type from the value section of a variant. The variant
+/// type is defined as the set of all basic types and all primitive types.
+pub fn get_variant_type(value: &[u8]) -> Result<VariantType, ArrowError> {
+    if value.is_empty() {
+        return Err(ArrowError::InvalidArgumentError(
+            "Tried to get variant type from empty buffer array".to_string(),
+        ));
+    }
+    let header = value[0];
+    let variant_type = match get_basic_type(header)? {
+        VariantBasicType::Primitive => match get_primitive_type(header)? {
+            VariantPrimitiveType::Null => VariantType::Null,
+            VariantPrimitiveType::Int8 => VariantType::Int8,
+            VariantPrimitiveType::BooleanTrue => VariantType::BooleanTrue,
+            VariantPrimitiveType::BooleanFalse => VariantType::BooleanFalse,
+            // TODO: Add 'legs' for the rest, once API is agreed upon
+            VariantPrimitiveType::String => VariantType::String,
+        },
+        VariantBasicType::ShortString => VariantType::ShortString,
+        VariantBasicType::Object => VariantType::Object,
+        VariantBasicType::Array => VariantType::Array,
+    };
+    Ok(variant_type)
+}
+
+/// To be used in `map_err` when unpacking an integer from a slice of bytes.
+fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError {
+    ArrowError::InvalidArgumentError(e.to_string())
+}
+
+/// Constructs the error message for an invalid UTF-8 string.
+fn invalid_utf8_err() -> ArrowError {
+    ArrowError::InvalidArgumentError("invalid UTF-8 string".to_string())
+}
+
+/// Decodes an Int8 from the value section of a variant.
+pub(crate) fn decode_int8(value: &[u8]) -> Result<i8, ArrowError> {
+    if value.is_empty() {
+        return Err(ArrowError::InvalidArgumentError(
+            "Got empty value buffer so can't decode into int8.".to_string(),
+        ));
+    }
+    let value = i8::from_le_bytes([value[1]]);
+    Ok(value)
+}
+
+/// Decodes a long string from the value section of a variant.
+pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> {
+    if value.len() < 5 {
+        return Err(ArrowError::InvalidArgumentError(
+            "Tried to decode value buffer into long_string, but it's too short 
(len<5)."
+                .to_string(),
+        ));
+    }
+    let len =
+        
u32::from_le_bytes(value[1..=4].try_into().map_err(map_try_from_slice_error)?) 
as usize;
+    if value.len() < len + 5 {
+        let err_str = format!("The length of the buffer for the long_string is 
soo short, it is {} and it should be at least {} ({} < {} + 5)", value.len(), 
len + 5 , value.len(), len);
+        return Err(ArrowError::InvalidArgumentError(err_str));
+    }
+    let string_bytes = &value[5..5 + len];
+    let string = str::from_utf8(string_bytes).map_err(|_| invalid_utf8_err())?;
+    Ok(string)
+}
+
+/// Decodes a short string from the value section of a variant.
+pub(crate) fn decode_short_string(value: &[u8]) -> Result<&str, ArrowError> {
+    if value.is_empty() {
+        return Err(ArrowError::InvalidArgumentError(
+            "Tried to decode value buffer into short_string, but it's 
empty.".to_string(),
+        ));
+    }
+    let len = ((value[0] & 0b11111100) >> 2) as usize;
+
+    if value.len() < len + 1 {
+        let err_str = format!("The length of the buffer for the short_string 
is too short, it is {} and it should be at least {} ({} < {} + 1)", 
value.len(), len + 1 , value.len(), len);
+        return Err(ArrowError::InvalidArgumentError(err_str));
+    }
+    let string_bytes = &value[1..1 + len];
+    let string = str::from_utf8(string_bytes).map_err(|_| invalid_utf8_err())?;
+    Ok(string)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_i8() -> Result<(), ArrowError> {
+        let value = [
+            0 | 3 << 2, // Primitive type for i8
+            42,
+        ];
+        let result = decode_int8(&value)?;
+        assert_eq!(result, 42);
+        Ok(())
+    }
+
+    #[test]
+    fn test_short_string() -> Result<(), ArrowError> {
+        let value = [
+            1 | 5 << 2, // Basic type for short string | length of short string
+            'H' as u8,
+            'e' as u8,
+            'l' as u8,
+            'l' as u8,
+            'o' as u8,
+            'o' as u8,
+        ];
+        let result = decode_short_string(&value)?;
+        assert_eq!(result, "Hello");
+        Ok(())
+    }
+
+    #[test]
+    fn test_string() -> Result<(), ArrowError> {

Review Comment:
   FWIW I don't think these unit tests are necessary given we have code that is 
reading the vaues from `parquet-testing`



##########
parquet-variant/src/variant.rs:
##########
@@ -0,0 +1,321 @@
+use std::{borrow::Cow, ops::Index};
+
+use crate::decoder::{self, get_variant_type};
+use arrow_schema::ArrowError;
+use strum_macros::EnumDiscriminants;
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+/// Encodes the Variant Metadata, see the Variant spec file for more 
information
+pub struct VariantMetadata<'m> {
+    bytes: &'m [u8],
+}
+
+impl<'m> VariantMetadata<'m> {
+    /// View the raw bytes (needed by very low-level decoders)
+    #[inline]
+    pub const fn as_bytes(&self) -> &'m [u8] {
+        self.bytes
+    }
+
+    /// Whether the dictionary keys are sorted and unique
+    pub fn is_sorted(&self) -> bool {
+        todo!()
+    }
+
+    /// Get the dict length
+    pub fn dict_len(&self) -> Result<usize, ArrowError> {
+        let end_location = self.offset_size()? as usize + 1;
+        if self.bytes.len() < end_location {
+            let err_str = format!(
+                "Invalid metadata bytes, must have at least length {} but has 
length {}",
+                &end_location,
+                self.bytes.len()
+            );
+            return Err(ArrowError::InvalidArgumentError(err_str));
+        }
+        let dict_len_bytes = &self.bytes[1..end_location];
+        let dict_len = 
usize::from_le_bytes(dict_len_bytes.try_into().map_err(|e| {
+            ArrowError::InvalidArgumentError(format!(
+                "Unable to convert dictionary_size bytes into usize: {}",
+                e,
+            ))
+        })?);
+        Ok(dict_len)
+    }
+    pub fn version(&self) -> usize {
+        todo!()
+    }
+
+    /// Get the offset by index
+    pub fn get_offset_by(&self, index: usize) -> Result<usize, ArrowError> {
+        todo!()
+    }
+
+    /// Get the header byte, which has the following form
+    ///              7     6  5   4  3             0
+    ///             +-------+---+---+---------------+
+    /// header      |       |   |   |    version    |
+    ///             +-------+---+---+---------------+
+    ///                 ^         ^
+    ///                 |         +-- sorted_strings
+    ///                 +-- offset_size_minus_one
+    /// The version is a 4-bit value that must always contain the value 1.
+    /// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+    /// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+    /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
+    pub fn header(&self) -> Result<u8, ArrowError> {
+        if self.bytes.is_empty() {
+            return Err(ArrowError::InvalidArgumentError(
+                "Can't get header from empty buffer".to_string(),
+            ));
+        }
+        Ok(self.bytes[0])
+    }
+
+    /// Get the offset_minus_one value from the header
+    pub fn offset_size_minus_one(&self) -> Result<u8, ArrowError> {
+        if self.bytes.is_empty() {
+            Err(ArrowError::InvalidArgumentError(
+                "Tried to get offset_size_minus_one from header, but 
self.bytes buffer is emtpy."
+                    .to_string(),
+            ))
+        } else {
+            Ok(self.bytes[0] & (0b11 << 6)) // Grab the last 2 bits
+        }
+    }
+
+    /// Get the offset_size
+    pub fn offset_size(&self) -> Result<u8, ArrowError> {
+        Ok(self.offset_size_minus_one()? + 1)
+    }
+
+    /// Get the offsets as an iterator
+    // TODO: Do we want this kind of API?
+    // TODO: Test once API is agreed upon
+    pub fn offsets(&'m self) -> Result<impl Iterator<Item = (usize, usize)> + 
'm, ArrowError> {
+        struct OffsetIterators<'m> {
+            buffer: &'m [u8],
+            total: usize,
+            seen: usize,
+            offset_size: usize,
+        }
+        impl<'m> Iterator for OffsetIterators<'m> {
+            type Item = (usize, usize); // (start, end) positions of the bytes
+
+            // TODO: Check bounds here or ensure they're correct
+
+            fn next(&mut self) -> Option<Self::Item> {
+                // +1 to skip the first offset
+                if self.seen < self.total {
+                    let start = usize::from_le_bytes(
+                        self.buffer[(self.seen ) * self.offset_size + 1 // +1 
to skip header
+                            ..(self.seen ) * self.offset_size + 1]
+                            .try_into()
+                            .ok()?,
+                    );
+                    self.seen += 1;
+                    let end = usize::from_le_bytes(
+                        self.buffer[(self.seen ) * self.offset_size + 1 // +1 
to skip header
+                            ..(self.seen ) * self.offset_size + 1]
+                            .try_into()
+                            .ok()?,
+                    );
+
+                    Some((start, end))
+                } else {
+                    None
+                }
+            }
+        }
+        let iterator: OffsetIterators = OffsetIterators {
+            buffer: self.bytes,
+            total: self.dict_len()?,
+            seen: 0,
+            offset_size: self.offset_size()? as usize,
+        };
+        Ok(iterator)
+    }
+    /// Get the key-name-bytes by index
+    pub fn get_by(&self, index: usize) -> Result<&'m str, ArrowError> {
+        todo!()
+    }
+    /// Get all key-names as an Iterator of strings
+    // TODO: Result
+    pub fn fields(&self) -> impl Iterator<Item = &'m str> {
+        // Do the same as for offsets
+        todo!();
+        vec![].into_iter()
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct VariantObject<'m, 'v> {
+    pub metadata: VariantMetadata<'m>,
+    pub value: &'v [u8],
+}
+
+impl<'m, 'v> VariantObject<'m, 'v> {
+    pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 
'v>)>, ArrowError> {
+        todo!();
+        #[allow(unreachable_code)] // Just to infer the return type
+        Ok(vec![].into_iter())
+    }
+
+    pub fn field(&self, _name: &'m str) -> Result<Variant<'m, 'v>, ArrowError> 
{
+        todo!()
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct VariantArray<'m, 'v> {
+    pub metadata: VariantMetadata<'m>,
+    pub value: &'v [u8],
+}
+
+// TODO: Let's agree on the API here, also should we expose a way to get the 
values as a vec of
+// variants for those who want it? Would require allocations.
+impl<'m, 'v> VariantArray<'m, 'v> {
+    pub fn len(&self) -> usize {
+        todo!()
+    }
+
+    pub fn values(&self) -> Result<impl Iterator<Item = Variant<'m, 'v>>, 
ArrowError> {
+        todo!();
+        #[allow(unreachable_code)] // Just to infer the return type
+        Ok(vec![].into_iter())
+    }
+}
+
+impl<'m, 'v> Index<usize> for VariantArray<'m, 'v> {
+    type Output = Variant<'m, 'v>;
+
+    fn index(&self, _index: usize) -> &Self::Output {
+        todo!()
+    }
+}
+
+/// Variant value. May contain references to metadata and value
+// TODO: Add copy if no Cow on String and Shortstring?
+#[derive(Clone, Debug, PartialEq, EnumDiscriminants)]
+#[strum_discriminants(name(VariantType))]
+pub enum Variant<'m, 'v> {
+    // TODO: Add 'legs' for the rest of the primitive types, once API is 
agreed upon
+    Null,
+    Int8(i8),
+
+    BooleanTrue,
+    BooleanFalse,
+
+    // only need the *value* buffer
+    // TODO: Do we want Cow<'v, str> over &'v str? It enables From<String> - 
discuss on PR
+    String(Cow<'v, str>),
+    ShortString(Cow<'v, str>),
+
+    // need both metadata & value
+    Object(VariantObject<'m, 'v>),
+    Array(VariantArray<'m, 'v>),
+}
+
+impl<'m, 'v> Variant<'m, 'v> {
+    /// Parse the buffers and return the appropriate variant.
+    pub fn try_new(metadata: &'m [u8], value: &'v [u8]) -> Result<Self, 
ArrowError> {
+        Ok(match get_variant_type(value)? {
+            VariantType::Null => Variant::Null,
+            VariantType::BooleanTrue => Variant::BooleanTrue,
+            VariantType::BooleanFalse => Variant::BooleanFalse,
+
+            VariantType::Int8 => Variant::Int8(decoder::decode_int8(value)?),
+
+            // TODO: Add 'legs' for the rest of the primitive types, once API 
is agreed upon
+            VariantType::String => {
+                
Variant::String(Cow::Borrowed(decoder::decode_long_string(value)?))
+            }
+
+            VariantType::ShortString => {
+                
Variant::ShortString(Cow::Borrowed(decoder::decode_short_string(value)?))
+            }
+
+            VariantType::Object => Variant::Object(VariantObject {

Review Comment:
   I wonder if it makes sense here to validate the basic parts of VariantObject 
on creation
   
   Like `VariantObject::try_new(metadata, value)?` 🤔 



##########
parquet-variant/Cargo.toml:
##########
@@ -31,6 +31,9 @@ edition = { workspace = true }
 rust-version = { workspace = true }
 
 [dependencies]
+arrow-schema = "55.1.0"
+strum = "0.27.1"

Review Comment:
   I think it would be nice to avoid adding a net new dependency (strum/strum 
macros of possible) -- arrow / parquet are fairly low level crates so having 
the dependency chain smaller i think is preferable even if it means more macros 
/ code in arrow



##########
parquet-variant/Cargo.toml:
##########
@@ -31,6 +31,9 @@ edition = { workspace = true }
 rust-version = { workspace = true }
 
 [dependencies]
+arrow-schema = "55.1.0"
+strum = "0.27.1"

Review Comment:
   I tried removing `strum` in a PR to this branch here: 
   - https://github.com/mkarbo/arrow-rs/pull/1
   
   It seems to reduce the amount of code and makes things clearer to me



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Re: [PR] API for reading Variant data and metadata [arrow-rs]

Reply via email to