mkarbo commented on code in PR #7535: URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2102992366
########## parquet-variant/src/decoder.rs: ########## @@ -0,0 +1,199 @@ +// NOTE: Largely based on the implementation of @PinkCrow007 in https://github.com/apache/arrow-rs/pull/7452 +// And the feedback there. +use crate::variant::VariantType; +use arrow_schema::ArrowError; +use std::{array::TryFromSliceError, str}; + +#[derive(Debug, Clone, Copy)] +pub enum VariantBasicType { + Primitive = 0, + ShortString = 1, + Object = 2, + Array = 3, +} + +#[derive(Debug, Clone, Copy)] +pub enum VariantPrimitiveType { + Null = 0, + BooleanTrue = 1, + BooleanFalse = 2, + Int8 = 3, + // TODO: Add 'legs' for the rest of primitives, once API is agreed upon + String = 16, +} + +/// Extracts the basic type from a header byte +pub(crate) fn get_basic_type(header: u8) -> Result<VariantBasicType, ArrowError> { + // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding + let basic_type = header & 0x03; // Basic type is encoded in the first 2 bits + let basic_type = match basic_type { + 0 => VariantBasicType::Primitive, + 1 => VariantBasicType::ShortString, + 2 => VariantBasicType::Object, + 3 => VariantBasicType::Array, + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "unknown basic type: {}", + basic_type + ))) + } + }; + Ok(basic_type) +} + +/// Extracts the primitive type from a header byte +pub(crate) fn get_primitive_type(header: u8) -> Result<VariantPrimitiveType, ArrowError> { + // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding + //// Primitive type is encoded in the last 6 bits of the header byte + let primitive_type = (header >> 2) & 0x3F; + let primitive_type = match primitive_type { + 0 => VariantPrimitiveType::Null, + 1 => VariantPrimitiveType::BooleanTrue, + 2 => VariantPrimitiveType::BooleanFalse, + 3 => VariantPrimitiveType::Int8, + // TODO: Add 'legs' for the rest, once API is agreed upon + 16 => VariantPrimitiveType::String, + _ => { + return 
Err(ArrowError::InvalidArgumentError(format!( + "unknown primitive type: {}", + primitive_type + ))) + } + }; + Ok(primitive_type) +} + +/// Extracts the variant type from the value section of a variant. The variant +/// type is defined as the set of all basic types and all primitive types. +pub fn get_variant_type(value: &[u8]) -> Result<VariantType, ArrowError> { + if value.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Tried to get variant type from empty buffer array".to_string(), + )); + } + let header = value[0]; + let variant_type = match get_basic_type(header)? { + VariantBasicType::Primitive => match get_primitive_type(header)? { + VariantPrimitiveType::Null => VariantType::Null, + VariantPrimitiveType::Int8 => VariantType::Int8, + VariantPrimitiveType::BooleanTrue => VariantType::BooleanTrue, + VariantPrimitiveType::BooleanFalse => VariantType::BooleanFalse, + // TODO: Add 'legs' for the rest, once API is agreed upon + VariantPrimitiveType::String => VariantType::String, + }, + VariantBasicType::ShortString => VariantType::ShortString, + VariantBasicType::Object => VariantType::Object, + VariantBasicType::Array => VariantType::Array, + }; + Ok(variant_type) +} + +/// To be used in `map_err` when unpacking an integer from a slice of bytes. +fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError { + ArrowError::InvalidArgumentError(e.to_string()) +} + +/// Constructs the error message for an invalid UTF-8 string. +fn invalid_utf8_err() -> ArrowError { + ArrowError::InvalidArgumentError("invalid UTF-8 string".to_string()) +} + +/// Decodes an Int8 from the value section of a variant. +pub(crate) fn decode_int8(value: &[u8]) -> Result<i8, ArrowError> { + if value.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Got empty value buffer so can't decode into int8.".to_string(), + )); + } + let value = i8::from_le_bytes([value[1]]); + Ok(value) +} + +/// Decodes a long string from the value section of a variant. 
+pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> { + if value.len() < 5 { + return Err(ArrowError::InvalidArgumentError( + "Tried to decode value buffer into long_string, but it's too short (len<5)." + .to_string(), + )); Review Comment: Thanks! Also added `non_empty_slice(...) -> Result<&[u8], ArrowError>` for the case where we just want to, e.g., index the first byte and don't want an array. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org