mkarbo commented on code in PR #7535: URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2102988142
########## parquet-variant/src/decoder.rs: ########## @@ -0,0 +1,199 @@ +// NOTE: Largely based on the implementation of @PinkCrow007 in https://github.com/apache/arrow-rs/pull/7452 +// And the feedback there. +use crate::variant::VariantType; +use arrow_schema::ArrowError; +use std::{array::TryFromSliceError, str}; + +#[derive(Debug, Clone, Copy)] +pub enum VariantBasicType { + Primitive = 0, + ShortString = 1, + Object = 2, + Array = 3, +} + +#[derive(Debug, Clone, Copy)] +pub enum VariantPrimitiveType { + Null = 0, + BooleanTrue = 1, + BooleanFalse = 2, + Int8 = 3, + // TODO: Add 'legs' for the rest of primitives, once API is agreed upon + String = 16, +} + +/// Extracts the basic type from a header byte +pub(crate) fn get_basic_type(header: u8) -> Result<VariantBasicType, ArrowError> { + // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding + let basic_type = header & 0x03; // Basic type is encoded in the first 2 bits + let basic_type = match basic_type { + 0 => VariantBasicType::Primitive, + 1 => VariantBasicType::ShortString, + 2 => VariantBasicType::Object, + 3 => VariantBasicType::Array, + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "unknown basic type: {}", + basic_type + ))) + } + }; + Ok(basic_type) +} + +/// Extracts the primitive type from a header byte +pub(crate) fn get_primitive_type(header: u8) -> Result<VariantPrimitiveType, ArrowError> { + // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding + //// Primitive type is encoded in the last 6 bits of the header byte + let primitive_type = (header >> 2) & 0x3F; + let primitive_type = match primitive_type { + 0 => VariantPrimitiveType::Null, + 1 => VariantPrimitiveType::BooleanTrue, + 2 => VariantPrimitiveType::BooleanFalse, + 3 => VariantPrimitiveType::Int8, + // TODO: Add 'legs' for the rest, once API is agreed upon + 16 => VariantPrimitiveType::String, + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "unknown primitive type: {}", + primitive_type + ))) + } + }; + Ok(primitive_type) +} + +/// Extracts the variant type from the value section of a variant. The variant +/// type is defined as the set of all basic types and all primitive types. +pub fn get_variant_type(value: &[u8]) -> Result<VariantType, ArrowError> { + if value.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Tried to get variant type from empty buffer array".to_string(), + )); + } + let header = value[0]; + let variant_type = match get_basic_type(header)? { + VariantBasicType::Primitive => match get_primitive_type(header)? { + VariantPrimitiveType::Null => VariantType::Null, Review Comment: Removed, thanks @alamb ########## parquet-variant/src/decoder.rs: ########## @@ -0,0 +1,199 @@ +// NOTE: Largely based on the implementation of @PinkCrow007 in https://github.com/apache/arrow-rs/pull/7452 +// And the feedback there. +use crate::variant::VariantType; +use arrow_schema::ArrowError; +use std::{array::TryFromSliceError, str}; + +#[derive(Debug, Clone, Copy)] +pub enum VariantBasicType { + Primitive = 0, + ShortString = 1, + Object = 2, + Array = 3, +} + +#[derive(Debug, Clone, Copy)] +pub enum VariantPrimitiveType { + Null = 0, + BooleanTrue = 1, + BooleanFalse = 2, + Int8 = 3, + // TODO: Add 'legs' for the rest of primitives, once API is agreed upon + String = 16, +} + +/// Extracts the basic type from a header byte +pub(crate) fn get_basic_type(header: u8) -> Result<VariantBasicType, ArrowError> { + // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding + let basic_type = header & 0x03; // Basic type is encoded in the first 2 bits + let basic_type = match basic_type { + 0 => VariantBasicType::Primitive, + 1 => VariantBasicType::ShortString, + 2 => VariantBasicType::Object, + 3 => VariantBasicType::Array, + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "unknown basic type: {}", + basic_type + ))) + } + }; + Ok(basic_type) +} + +/// Extracts the primitive type from a header byte +pub(crate) fn get_primitive_type(header: u8) -> Result<VariantPrimitiveType, ArrowError> { + // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding + //// Primitive type is encoded in the last 6 bits of the header byte + let primitive_type = (header >> 2) & 0x3F; + let primitive_type = match primitive_type { + 0 => VariantPrimitiveType::Null, + 1 => VariantPrimitiveType::BooleanTrue, + 2 => VariantPrimitiveType::BooleanFalse, + 3 => VariantPrimitiveType::Int8, + // TODO: Add 'legs' for the rest, once API is agreed upon + 16 => VariantPrimitiveType::String, + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "unknown primitive type: {}", + primitive_type + ))) + } + }; + Ok(primitive_type) +} + +/// Extracts the variant type from the value section of a variant. The variant +/// type is defined as the set of all basic types and all primitive types. +pub fn get_variant_type(value: &[u8]) -> Result<VariantType, ArrowError> { + if value.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Tried to get variant type from empty buffer array".to_string(), + )); + } + let header = value[0]; + let variant_type = match get_basic_type(header)? { + VariantBasicType::Primitive => match get_primitive_type(header)? { + VariantPrimitiveType::Null => VariantType::Null, Review Comment: You're right, and it's been removed, thanks to @alamb -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org