Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-31 Thread via GitHub


alamb commented on PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#issuecomment-2925695138

   > LGTM, except a few older review comments might have been missed? Also a 
bunch of nits to consider either before merge or as follow-up work.
   
   Yeah, I think it is getting close to impossible to follow along with all the 
 comments on this PR:
   
   https://github.com/user-attachments/assets/6a73edf4-ea5d-4800-8079-d60189d7fdb9";
 />
   
   Thus, let's bias towards action (and knowing we can always change code), I 
am going to merge it so we can keep hacking / improving in follow on PRs. 
   
   Thank you so much @mkarbo and @scovich for pushing this along
   
   Here is to some more exciting PRs ahead: 🚀 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-30 Thread via GitHub


mkarbo commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2115644709


##
parquet-variant/src/variant.rs:
##
@@ -0,0 +1,730 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use crate::decoder::{
+self, get_basic_type, get_primitive_type, VariantBasicType, 
VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, 
string_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+num::TryFromIntError,
+ops::{Index, Range},
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+One = 1,
+Two = 2,
+Three = 3,
+Four = 4,
+}
+
+impl OffsetSizeBytes {
+/// Build from the `offset_size_minus_one` bits (see spec).
+fn try_new(offset_size_minus_one: u8) -> Result {
+use OffsetSizeBytes::*;
+let result = match offset_size_minus_one {
+0 => One,
+1 => Two,
+2 => Three,
+3 => Four,
+_ => {
+return Err(ArrowError::InvalidArgumentError(
+"offset_size_minus_one must be 0–3".to_string(),
+))
+}
+};
+Ok(result)
+}
+
+/// Return one unsigned little-endian value from `bytes`.
+///
+/// * `bytes` – the Variant-metadata buffer.
+/// * `byte_offset` – number of bytes to skip **before** reading the first
+///   value (usually `1` to move past the header byte).
+/// * `offset_index` – 0-based index **after** the skip
+///   (`0` is the first value, `1` the next, …).
+///
+/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+/// Three-byte values are zero-extended to 32 bits before the final
+/// fallible cast to `usize`.
+fn unpack_usize(
+&self,
+bytes: &[u8],
+byte_offset: usize,  // how many bytes to skip
+offset_index: usize, // which offset in an array of offsets
+) -> Result {
+use OffsetSizeBytes::*;
+let offset = byte_offset + (*self as usize) * offset_index;
+let result = match self {
+One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Three => {
+// Let's grab the three byte le-chunk first
+let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
+// Let's pad it and construct a padded u32 from it.
+let mut buf = [0u8; 4];
+buf[..3].copy_from_slice(&b3_chunks);
+u32::from_le_bytes(buf)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
+}
+Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+};
+Ok(result)
+}
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+version: u8,
+is_sorted: bool,
+/// Note: This is `offset_size_minus_one` + 1
+offset_size: OffsetSizeBytes,
+}
+
+// According to the spec this is currently always = 1, and so we store this 
const for validation
+// purposes and to make that visible.
+const CORRECT_VERSION_VALUE: u8 = 1;
+
+impl VariantMetadataHeader {
+/// Tries to construct the variant metadata header, which has the form
+///  7 6  5   4  3 0
+/// +---+---+---+---+
+/// header  |   |   |   |version|
+/// +---+---+---+---+
+/// ^ ^
+/// | +-- sorted_strings
+/// +-- offset_size_minus_one
+/// The version is a 4-bit value that must always contain the value 1.
+/// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+/// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+///

Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-30 Thread via GitHub


scovich commented on PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#issuecomment-2922376872

   > You're too fast @scovich! I am not done yet 😉
   
   Hehe sorry... this is just too exciting.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-30 Thread via GitHub


mkarbo commented on PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#issuecomment-2922335818

   You're too fast @scovich! I am not done yet 😉 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-30 Thread via GitHub


mkarbo commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2115375272


##
parquet-variant/src/variant.rs:
##
@@ -0,0 +1,714 @@
+use crate::decoder::{
+self, get_basic_type, get_primitive_type, VariantBasicType, 
VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, 
string_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+num::TryFromIntError,
+ops::{Index, Range},
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+One = 1,
+Two = 2,
+Three = 3,
+Four = 4,
+}
+
+impl OffsetSizeBytes {
+/// Build from the `offset_size_minus_one` bits (see spec).
+fn try_new(offset_size_minus_one: u8) -> Result {
+use OffsetSizeBytes::*;
+let result = match offset_size_minus_one {
+0 => One,
+1 => Two,
+2 => Three,
+3 => Four,
+_ => {
+return Err(ArrowError::InvalidArgumentError(
+"offset_size_minus_one must be 0–3".to_string(),
+))
+}
+};
+Ok(result)
+}
+
+/// Return one unsigned little-endian value from `bytes`.
+///
+/// * `bytes` – the Variant-metadata buffer.
+/// * `byte_offset` – number of bytes to skip **before** reading the first
+///   value (usually `1` to move past the header byte).
+/// * `offset_index` – 0-based index **after** the skip
+///   (`0` is the first value, `1` the next, …).
+///
+/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+/// Three-byte values are zero-extended to 32 bits before the final
+/// fallible cast to `usize`.
+fn unpack_usize(
+&self,
+bytes: &[u8],
+byte_offset: usize,  // how many bytes to skip
+offset_index: usize, // which offset in an array of offsets
+) -> Result {
+use OffsetSizeBytes::*;
+let offset = byte_offset + (*self as usize) * offset_index;
+let result = match self {
+One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Three => {
+// Let's grab the three byte le-chunk first
+let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
+// Let's pad it and construct a padded u32 from it.
+let mut buf = [0u8; 4];
+buf[..3].copy_from_slice(&b3_chunks);
+u32::from_le_bytes(buf)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
+}
+Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+};
+Ok(result)
+}
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+version: u8,
+is_sorted: bool,
+/// Note: This is `offset_size_minus_one` + 1
+offset_size: OffsetSizeBytes,
+}
+
+// According to the spec this is currently always = 1, and so we store this 
const for validation
+// purposes and to make that visible.
+const CORRECT_VERSION_VALUE: u8 = 1;
+
+impl VariantMetadataHeader {
+/// Tries to construct the variant metadata header, which has the form
+///  7 6  5   4  3 0
+/// +---+---+---+---+
+/// header  |   |   |   |version|
+/// +---+---+---+---+
+/// ^ ^
+/// | +-- sorted_strings
+/// +-- offset_size_minus_one
+/// The version is a 4-bit value that must always contain the value 1.
+/// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+/// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+/// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
+pub fn try_new(bytes: &[u8]) -> Result {
+let header = first_byte_from_slice(bytes)?;
+
+let version = header & 0x0F; // First four bits
+if version != CORRECT_VERSION_VALUE {
+let err_msg = format!(
+"The version bytes in the header is not 
{CORRECT_VERSION_VALUE}, got {:b}",
+version
+);
+return Err(ArrowError::InvalidArgumentError(err_msg));
+}
+let is_sorted = (header & 0x10) != 0; // Fifth bit
+let offset_size_minus_one = header >> 6; // Last two bits
+Ok(Self {
+version,
+is_sorted,
+offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
+})
+}
+}

Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-29 Thread via GitHub


mkarbo commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2115246754


##
parquet-variant/src/variant.rs:
##
@@ -0,0 +1,714 @@
+use crate::decoder::{
+self, get_basic_type, get_primitive_type, VariantBasicType, 
VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, 
string_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+num::TryFromIntError,
+ops::{Index, Range},
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+One = 1,
+Two = 2,
+Three = 3,
+Four = 4,
+}
+
+impl OffsetSizeBytes {
+/// Build from the `offset_size_minus_one` bits (see spec).
+fn try_new(offset_size_minus_one: u8) -> Result {
+use OffsetSizeBytes::*;
+let result = match offset_size_minus_one {
+0 => One,
+1 => Two,
+2 => Three,
+3 => Four,
+_ => {
+return Err(ArrowError::InvalidArgumentError(
+"offset_size_minus_one must be 0–3".to_string(),
+))
+}
+};
+Ok(result)
+}
+
+/// Return one unsigned little-endian value from `bytes`.
+///
+/// * `bytes` – the Variant-metadata buffer.
+/// * `byte_offset` – number of bytes to skip **before** reading the first
+///   value (usually `1` to move past the header byte).
+/// * `offset_index` – 0-based index **after** the skip
+///   (`0` is the first value, `1` the next, …).
+///
+/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+/// Three-byte values are zero-extended to 32 bits before the final
+/// fallible cast to `usize`.
+fn unpack_usize(
+&self,
+bytes: &[u8],
+byte_offset: usize,  // how many bytes to skip
+offset_index: usize, // which offset in an array of offsets
+) -> Result {
+use OffsetSizeBytes::*;
+let offset = byte_offset + (*self as usize) * offset_index;
+let result = match self {
+One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Three => {
+// Let's grab the three byte le-chunk first
+let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
+// Let's pad it and construct a padded u32 from it.
+let mut buf = [0u8; 4];
+buf[..3].copy_from_slice(&b3_chunks);
+u32::from_le_bytes(buf)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
+}
+Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+};
+Ok(result)
+}
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+version: u8,
+is_sorted: bool,
+/// Note: This is `offset_size_minus_one` + 1
+offset_size: OffsetSizeBytes,
+}
+
+// According to the spec this is currently always = 1, and so we store this 
const for validation
+// purposes and to make that visible.
+const CORRECT_VERSION_VALUE: u8 = 1;
+
+impl VariantMetadataHeader {
+/// Tries to construct the variant metadata header, which has the form
+///  7 6  5   4  3 0
+/// +---+---+---+---+
+/// header  |   |   |   |version|
+/// +---+---+---+---+
+/// ^ ^
+/// | +-- sorted_strings
+/// +-- offset_size_minus_one
+/// The version is a 4-bit value that must always contain the value 1.
+/// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+/// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+/// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
+pub fn try_new(bytes: &[u8]) -> Result {
+let header = first_byte_from_slice(bytes)?;
+
+let version = header & 0x0F; // First four bits
+if version != CORRECT_VERSION_VALUE {
+let err_msg = format!(
+"The version bytes in the header is not 
{CORRECT_VERSION_VALUE}, got {:b}",
+version
+);
+return Err(ArrowError::InvalidArgumentError(err_msg));
+}
+let is_sorted = (header & 0x10) != 0; // Fifth bit
+let offset_size_minus_one = header >> 6; // Last two bits
+Ok(Self {
+version,
+is_sorted,
+offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
+})
+}
+}

Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-27 Thread via GitHub


scovich commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2109226138


##
parquet-variant/src/variant.rs:
##
@@ -0,0 +1,730 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use crate::decoder::{
+self, get_basic_type, get_primitive_type, VariantBasicType, 
VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, 
string_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+num::TryFromIntError,
+ops::{Index, Range},
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+One = 1,
+Two = 2,
+Three = 3,
+Four = 4,
+}
+
+impl OffsetSizeBytes {
+/// Build from the `offset_size_minus_one` bits (see spec).
+fn try_new(offset_size_minus_one: u8) -> Result {
+use OffsetSizeBytes::*;
+let result = match offset_size_minus_one {
+0 => One,
+1 => Two,
+2 => Three,
+3 => Four,
+_ => {
+return Err(ArrowError::InvalidArgumentError(
+"offset_size_minus_one must be 0–3".to_string(),
+))
+}
+};
+Ok(result)
+}
+
+/// Return one unsigned little-endian value from `bytes`.
+///
+/// * `bytes` – the Variant-metadata buffer.
+/// * `byte_offset` – number of bytes to skip **before** reading the first
+///   value (usually `1` to move past the header byte).
+/// * `offset_index` – 0-based index **after** the skip
+///   (`0` is the first value, `1` the next, …).
+///
+/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+/// Three-byte values are zero-extended to 32 bits before the final
+/// fallible cast to `usize`.
+fn unpack_usize(
+&self,
+bytes: &[u8],
+byte_offset: usize,  // how many bytes to skip
+offset_index: usize, // which offset in an array of offsets
+) -> Result {
+use OffsetSizeBytes::*;
+let offset = byte_offset + (*self as usize) * offset_index;
+let result = match self {
+One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Three => {
+// Let's grab the three byte le-chunk first
+let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
+// Let's pad it and construct a padded u32 from it.
+let mut buf = [0u8; 4];
+buf[..3].copy_from_slice(&b3_chunks);
+u32::from_le_bytes(buf)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
+}
+Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+};
+Ok(result)
+}
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+version: u8,
+is_sorted: bool,
+/// Note: This is `offset_size_minus_one` + 1
+offset_size: OffsetSizeBytes,
+}
+
+// According to the spec this is currently always = 1, and so we store this 
const for validation
+// purposes and to make that visible.
+const CORRECT_VERSION_VALUE: u8 = 1;
+
+impl VariantMetadataHeader {
+/// Tries to construct the variant metadata header, which has the form
+///  7 6  5   4  3 0
+/// +---+---+---+---+
+/// header  |   |   |   |version|
+/// +---+---+---+---+
+/// ^ ^
+/// | +-- sorted_strings
+/// +-- offset_size_minus_one
+/// The version is a 4-bit value that must always contain the value 1.
+/// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+/// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+//

Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-27 Thread via GitHub


scovich commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2109226138


##
parquet-variant/src/variant.rs:
##
@@ -0,0 +1,730 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use crate::decoder::{
+self, get_basic_type, get_primitive_type, VariantBasicType, 
VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, 
string_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+num::TryFromIntError,
+ops::{Index, Range},
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+One = 1,
+Two = 2,
+Three = 3,
+Four = 4,
+}
+
+impl OffsetSizeBytes {
+/// Build from the `offset_size_minus_one` bits (see spec).
+fn try_new(offset_size_minus_one: u8) -> Result {
+use OffsetSizeBytes::*;
+let result = match offset_size_minus_one {
+0 => One,
+1 => Two,
+2 => Three,
+3 => Four,
+_ => {
+return Err(ArrowError::InvalidArgumentError(
+"offset_size_minus_one must be 0–3".to_string(),
+))
+}
+};
+Ok(result)
+}
+
+/// Return one unsigned little-endian value from `bytes`.
+///
+/// * `bytes` – the Variant-metadata buffer.
+/// * `byte_offset` – number of bytes to skip **before** reading the first
+///   value (usually `1` to move past the header byte).
+/// * `offset_index` – 0-based index **after** the skip
+///   (`0` is the first value, `1` the next, …).
+///
+/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+/// Three-byte values are zero-extended to 32 bits before the final
+/// fallible cast to `usize`.
+fn unpack_usize(
+&self,
+bytes: &[u8],
+byte_offset: usize,  // how many bytes to skip
+offset_index: usize, // which offset in an array of offsets
+) -> Result {
+use OffsetSizeBytes::*;
+let offset = byte_offset + (*self as usize) * offset_index;
+let result = match self {
+One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Three => {
+// Let's grab the three byte le-chunk first
+let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
+// Let's pad it and construct a padded u32 from it.
+let mut buf = [0u8; 4];
+buf[..3].copy_from_slice(&b3_chunks);
+u32::from_le_bytes(buf)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
+}
+Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+};
+Ok(result)
+}
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+version: u8,
+is_sorted: bool,
+/// Note: This is `offset_size_minus_one` + 1
+offset_size: OffsetSizeBytes,
+}
+
+// According to the spec this is currently always = 1, and so we store this 
const for validation
+// purposes and to make that visible.
+const CORRECT_VERSION_VALUE: u8 = 1;
+
+impl VariantMetadataHeader {
+/// Tries to construct the variant metadata header, which has the form
+///  7 6  5   4  3 0
+/// +---+---+---+---+
+/// header  |   |   |   |version|
+/// +---+---+---+---+
+/// ^ ^
+/// | +-- sorted_strings
+/// +-- offset_size_minus_one
+/// The version is a 4-bit value that must always contain the value 1.
+/// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+/// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+//

Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-26 Thread via GitHub


mkarbo commented on PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#issuecomment-2910126450

   Thanks all, I can give it another go tomorrow I think with all the comments.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-25 Thread via GitHub


alamb commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2106165938


##
parquet-variant/src/variant.rs:
##
@@ -0,0 +1,714 @@
+use crate::decoder::{
+self, get_basic_type, get_primitive_type, VariantBasicType, 
VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, 
string_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+num::TryFromIntError,
+ops::{Index, Range},
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+One = 1,
+Two = 2,
+Three = 3,
+Four = 4,
+}
+
+impl OffsetSizeBytes {
+/// Build from the `offset_size_minus_one` bits (see spec).
+fn try_new(offset_size_minus_one: u8) -> Result {
+use OffsetSizeBytes::*;
+let result = match offset_size_minus_one {
+0 => One,
+1 => Two,
+2 => Three,
+3 => Four,
+_ => {
+return Err(ArrowError::InvalidArgumentError(
+"offset_size_minus_one must be 0–3".to_string(),
+))
+}
+};
+Ok(result)
+}
+
+/// Return one unsigned little-endian value from `bytes`.
+///
+/// * `bytes` – the Variant-metadata buffer.
+/// * `byte_offset` – number of bytes to skip **before** reading the first
+///   value (usually `1` to move past the header byte).
+/// * `offset_index` – 0-based index **after** the skip
+///   (`0` is the first value, `1` the next, …).
+///
+/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+/// Three-byte values are zero-extended to 32 bits before the final
+/// fallible cast to `usize`.
+fn unpack_usize(
+&self,
+bytes: &[u8],
+byte_offset: usize,  // how many bytes to skip
+offset_index: usize, // which offset in an array of offsets
+) -> Result {
+use OffsetSizeBytes::*;
+let offset = byte_offset + (*self as usize) * offset_index;
+let result = match self {
+One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Three => {
+// Let's grab the three byte le-chunk first
+let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
+// Let's pad it and construct a padded u32 from it.
+let mut buf = [0u8; 4];
+buf[..3].copy_from_slice(&b3_chunks);
+u32::from_le_bytes(buf)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
+}
+Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+};
+Ok(result)
+}
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+version: u8,
+is_sorted: bool,
+/// Note: This is `offset_size_minus_one` + 1
+offset_size: OffsetSizeBytes,
+}
+
+// According to the spec this is currently always = 1, and so we store this 
const for validation
+// purposes and to make that visible.
+const CORRECT_VERSION_VALUE: u8 = 1;
+
+impl VariantMetadataHeader {
+/// Tries to construct the variant metadata header, which has the form
+///  7 6  5   4  3 0
+/// +---+---+---+---+
+/// header  |   |   |   |version|
+/// +---+---+---+---+
+/// ^ ^
+/// | +-- sorted_strings
+/// +-- offset_size_minus_one
+/// The version is a 4-bit value that must always contain the value 1.
+/// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+/// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+/// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
+pub fn try_new(bytes: &[u8]) -> Result {
+let header = first_byte_from_slice(bytes)?;
+
+let version = header & 0x0F; // First four bits
+if version != CORRECT_VERSION_VALUE {
+let err_msg = format!(
+"The version bytes in the header is not 
{CORRECT_VERSION_VALUE}, got {:b}",
+version
+);
+return Err(ArrowError::InvalidArgumentError(err_msg));
+}
+let is_sorted = (header & 0x10) != 0; // Fifth bit
+let offset_size_minus_one = header >> 6; // Last two bits
+Ok(Self {
+version,
+is_sorted,
+offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
+})
+}
+}

Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-25 Thread via GitHub


alamb commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2106165818


##
parquet-variant/src/variant.rs:
##
@@ -0,0 +1,730 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use crate::decoder::{
+self, get_basic_type, get_primitive_type, VariantBasicType, 
VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, 
string_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+num::TryFromIntError,
+ops::{Index, Range},
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+One = 1,
+Two = 2,
+Three = 3,
+Four = 4,
+}
+
+impl OffsetSizeBytes {
+/// Build from the `offset_size_minus_one` bits (see spec).
+fn try_new(offset_size_minus_one: u8) -> Result {
+use OffsetSizeBytes::*;
+let result = match offset_size_minus_one {
+0 => One,
+1 => Two,
+2 => Three,
+3 => Four,
+_ => {
+return Err(ArrowError::InvalidArgumentError(
+"offset_size_minus_one must be 0–3".to_string(),
+))
+}
+};
+Ok(result)
+}
+
+/// Return one unsigned little-endian value from `bytes`.
+///
+/// * `bytes` – the Variant-metadata buffer.
+/// * `byte_offset` – number of bytes to skip **before** reading the first
+///   value (usually `1` to move past the header byte).
+/// * `offset_index` – 0-based index **after** the skip
+///   (`0` is the first value, `1` the next, …).
+///
+/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+/// Three-byte values are zero-extended to 32 bits before the final
+/// fallible cast to `usize`.
+fn unpack_usize(
+&self,
+bytes: &[u8],
+byte_offset: usize,  // how many bytes to skip
+offset_index: usize, // which offset in an array of offsets
+) -> Result {
+use OffsetSizeBytes::*;
+let offset = byte_offset + (*self as usize) * offset_index;
+let result = match self {
+One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Three => {
+// Let's grab the three byte le-chunk first
+let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
+// Let's pad it and construct a padded u32 from it.
+let mut buf = [0u8; 4];
+buf[..3].copy_from_slice(&b3_chunks);
+u32::from_le_bytes(buf)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
+}
+Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+};
+Ok(result)
+}
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+version: u8,
+is_sorted: bool,
+/// Note: This is `offset_size_minus_one` + 1
+offset_size: OffsetSizeBytes,
+}
+
+// According to the spec this is currently always = 1, and so we store this 
const for validation
+// purposes and to make that visible.
+const CORRECT_VERSION_VALUE: u8 = 1;
+
+impl VariantMetadataHeader {
+/// Tries to construct the variant metadata header, which has the form
+///  7 6  5   4  3 0
+/// +---+---+---+---+
+/// header  |   |   |   |version|
+/// +---+---+---+---+
+/// ^ ^
+/// | +-- sorted_strings
+/// +-- offset_size_minus_one
+/// The version is a 4-bit value that must always contain the value 1.
+/// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+/// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+/// 

Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-25 Thread via GitHub


alamb commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2106165423


##
parquet-variant/src/variant.rs:
##
@@ -0,0 +1,714 @@
+use crate::decoder::{
+self, get_basic_type, get_primitive_type, VariantBasicType, 
VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, 
string_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+num::TryFromIntError,
+ops::{Index, Range},
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+One = 1,
+Two = 2,
+Three = 3,
+Four = 4,
+}
+
+impl OffsetSizeBytes {
+/// Build from the `offset_size_minus_one` bits (see spec).
+fn try_new(offset_size_minus_one: u8) -> Result {
+use OffsetSizeBytes::*;
+let result = match offset_size_minus_one {
+0 => One,
+1 => Two,
+2 => Three,
+3 => Four,
+_ => {
+return Err(ArrowError::InvalidArgumentError(
+"offset_size_minus_one must be 0–3".to_string(),
+))
+}
+};
+Ok(result)
+}
+
+/// Return one unsigned little-endian value from `bytes`.
+///
+/// * `bytes` – the Variant-metadata buffer.
+/// * `byte_offset` – number of bytes to skip **before** reading the first
+///   value (usually `1` to move past the header byte).
+/// * `offset_index` – 0-based index **after** the skip
+///   (`0` is the first value, `1` the next, …).
+///
+/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+/// Three-byte values are zero-extended to 32 bits before the final
+/// fallible cast to `usize`.
+fn unpack_usize(
+&self,
+bytes: &[u8],
+byte_offset: usize,  // how many bytes to skip
+offset_index: usize, // which offset in an array of offsets
+) -> Result {
+use OffsetSizeBytes::*;
+let offset = byte_offset + (*self as usize) * offset_index;
+let result = match self {
+One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Three => {
+// Let's grab the three byte le-chunk first
+let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
+// Let's pad it and construct a padded u32 from it.
+let mut buf = [0u8; 4];
+buf[..3].copy_from_slice(&b3_chunks);
+u32::from_le_bytes(buf)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
+}
+Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+};
+Ok(result)
+}
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+version: u8,
+is_sorted: bool,
+/// Note: This is `offset_size_minus_one` + 1
+offset_size: OffsetSizeBytes,
+}
+
+// According to the spec this is currently always = 1, and so we store this 
const for validation
+// purposes and to make that visible.
+const CORRECT_VERSION_VALUE: u8 = 1;
+
+impl VariantMetadataHeader {
+/// Tries to construct the variant metadata header, which has the form
+///  7 6  5   4  3 0
+/// +---+---+---+---+
+/// header  |   |   |   |version|
+/// +---+---+---+---+
+/// ^ ^
+/// | +-- sorted_strings
+/// +-- offset_size_minus_one
+/// The version is a 4-bit value that must always contain the value 1.
+/// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+/// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+/// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
+pub fn try_new(bytes: &[u8]) -> Result {
+let header = first_byte_from_slice(bytes)?;
+
+let version = header & 0x0F; // First four bits
+if version != CORRECT_VERSION_VALUE {
+let err_msg = format!(
+"The version bytes in the header is not 
{CORRECT_VERSION_VALUE}, got {:b}",
+version
+);
+return Err(ArrowError::InvalidArgumentError(err_msg));
+}
+let is_sorted = (header & 0x10) != 0; // Fifth bit
+let offset_size_minus_one = header >> 6; // Last two bits
+Ok(Self {
+version,
+is_sorted,
+offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
+})
+}
+}

Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-23 Thread via GitHub


mapleFU commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2105536689


##
parquet-variant/src/variant.rs:
##
@@ -0,0 +1,730 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use crate::decoder::{
+self, get_basic_type, get_primitive_type, VariantBasicType, 
VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, 
string_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+num::TryFromIntError,
+ops::{Index, Range},
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+One = 1,
+Two = 2,
+Three = 3,
+Four = 4,
+}
+
+impl OffsetSizeBytes {
+/// Build from the `offset_size_minus_one` bits (see spec).
+fn try_new(offset_size_minus_one: u8) -> Result {
+use OffsetSizeBytes::*;
+let result = match offset_size_minus_one {
+0 => One,
+1 => Two,
+2 => Three,
+3 => Four,
+_ => {
+return Err(ArrowError::InvalidArgumentError(
+"offset_size_minus_one must be 0–3".to_string(),
+))
+}
+};
+Ok(result)
+}
+
+/// Return one unsigned little-endian value from `bytes`.
+///
+/// * `bytes` – the Variant-metadata buffer.
+/// * `byte_offset` – number of bytes to skip **before** reading the first
+///   value (usually `1` to move past the header byte).
+/// * `offset_index` – 0-based index **after** the skip
+///   (`0` is the first value, `1` the next, …).
+///
+/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+/// Three-byte values are zero-extended to 32 bits before the final
+/// fallible cast to `usize`.
+fn unpack_usize(
+&self,
+bytes: &[u8],
+byte_offset: usize,  // how many bytes to skip
+offset_index: usize, // which offset in an array of offsets
+) -> Result {
+use OffsetSizeBytes::*;
+let offset = byte_offset + (*self as usize) * offset_index;
+let result = match self {
+One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Three => {
+// Let's grab the three byte le-chunk first
+let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
+// Let's pad it and construct a padded u32 from it.
+let mut buf = [0u8; 4];
+buf[..3].copy_from_slice(&b3_chunks);
+u32::from_le_bytes(buf)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
+}
+Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+};
+Ok(result)
+}
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+version: u8,
+is_sorted: bool,
+/// Note: This is `offset_size_minus_one` + 1
+offset_size: OffsetSizeBytes,
+}
+
+// According to the spec this is currently always = 1, and so we store this 
const for validation
+// purposes and to make that visible.
+const CORRECT_VERSION_VALUE: u8 = 1;
+
+impl VariantMetadataHeader {
+/// Tries to construct the variant metadata header, which has the form
+///  7 6  5   4  3 0
+/// +---+---+---+---+
+/// header  |   |   |   |version|
+/// +---+---+---+---+
+/// ^ ^
+/// | +-- sorted_strings
+/// +-- offset_size_minus_one
+/// The version is a 4-bit value that must always contain the value 1.
+/// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+/// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+//

Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-23 Thread via GitHub


scovich commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2105531459


##
parquet-variant/src/variant.rs:
##
@@ -0,0 +1,730 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use crate::decoder::{
+self, get_basic_type, get_primitive_type, VariantBasicType, 
VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, 
string_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+num::TryFromIntError,
+ops::{Index, Range},
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+One = 1,
+Two = 2,
+Three = 3,
+Four = 4,
+}
+
+impl OffsetSizeBytes {
+/// Build from the `offset_size_minus_one` bits (see spec).
+fn try_new(offset_size_minus_one: u8) -> Result {
+use OffsetSizeBytes::*;
+let result = match offset_size_minus_one {
+0 => One,
+1 => Two,
+2 => Three,
+3 => Four,
+_ => {
+return Err(ArrowError::InvalidArgumentError(
+"offset_size_minus_one must be 0–3".to_string(),
+))
+}
+};
+Ok(result)
+}
+
+/// Return one unsigned little-endian value from `bytes`.
+///
+/// * `bytes` – the Variant-metadata buffer.
+/// * `byte_offset` – number of bytes to skip **before** reading the first
+///   value (usually `1` to move past the header byte).
+/// * `offset_index` – 0-based index **after** the skip
+///   (`0` is the first value, `1` the next, …).
+///
+/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+/// Three-byte values are zero-extended to 32 bits before the final
+/// fallible cast to `usize`.
+fn unpack_usize(
+&self,
+bytes: &[u8],
+byte_offset: usize,  // how many bytes to skip
+offset_index: usize, // which offset in an array of offsets
+) -> Result {
+use OffsetSizeBytes::*;
+let offset = byte_offset + (*self as usize) * offset_index;
+let result = match self {
+One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Three => {
+// Let's grab the three byte le-chunk first
+let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
+// Let's pad it and construct a padded u32 from it.
+let mut buf = [0u8; 4];
+buf[..3].copy_from_slice(&b3_chunks);
+u32::from_le_bytes(buf)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
+}
+Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+};
+Ok(result)
+}
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+version: u8,
+is_sorted: bool,
+/// Note: This is `offset_size_minus_one` + 1
+offset_size: OffsetSizeBytes,
+}
+
+// According to the spec this is currently always = 1, and so we store this 
const for validation
+// purposes and to make that visible.
+const CORRECT_VERSION_VALUE: u8 = 1;
+
+impl VariantMetadataHeader {
+/// Tries to construct the variant metadata header, which has the form
+///  7 6  5   4  3 0
+/// +---+---+---+---+
+/// header  |   |   |   |version|
+/// +---+---+---+---+
+/// ^ ^
+/// | +-- sorted_strings
+/// +-- offset_size_minus_one
+/// The version is a 4-bit value that must always contain the value 1.
+/// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+/// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+//

Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-23 Thread via GitHub


scovich commented on PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#issuecomment-2906018297

   > Thank you so much for spearheading it @mkarbo 
   
   +100 -- to both you and PinkCrow007 (who github won't let me mention for 
some reason)


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-23 Thread via GitHub


scovich commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2105476426


##
parquet-variant/src/variant.rs:
##
@@ -0,0 +1,714 @@
+use crate::decoder::{
+self, get_basic_type, get_primitive_type, VariantBasicType, 
VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, 
string_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+num::TryFromIntError,
+ops::{Index, Range},
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+One = 1,
+Two = 2,
+Three = 3,
+Four = 4,
+}
+
+impl OffsetSizeBytes {
+/// Build from the `offset_size_minus_one` bits (see spec).
+fn try_new(offset_size_minus_one: u8) -> Result {
+use OffsetSizeBytes::*;
+let result = match offset_size_minus_one {
+0 => One,
+1 => Two,
+2 => Three,
+3 => Four,
+_ => {
+return Err(ArrowError::InvalidArgumentError(
+"offset_size_minus_one must be 0–3".to_string(),
+))
+}
+};
+Ok(result)
+}
+
+/// Return one unsigned little-endian value from `bytes`.
+///
+/// * `bytes` – the Variant-metadata buffer.
+/// * `byte_offset` – number of bytes to skip **before** reading the first
+///   value (usually `1` to move past the header byte).
+/// * `offset_index` – 0-based index **after** the skip
+///   (`0` is the first value, `1` the next, …).
+///
+/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+/// Three-byte values are zero-extended to 32 bits before the final
+/// fallible cast to `usize`.
+fn unpack_usize(
+&self,
+bytes: &[u8],
+byte_offset: usize,  // how many bytes to skip
+offset_index: usize, // which offset in an array of offsets
+) -> Result {
+use OffsetSizeBytes::*;
+let offset = byte_offset + (*self as usize) * offset_index;
+let result = match self {
+One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Three => {
+// Let's grab the three byte le-chunk first
+let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
+// Let's pad it and construct a padded u32 from it.
+let mut buf = [0u8; 4];
+buf[..3].copy_from_slice(&b3_chunks);
+u32::from_le_bytes(buf)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
+}
+Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+};
+Ok(result)
+}
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+version: u8,
+is_sorted: bool,
+/// Note: This is `offset_size_minus_one` + 1
+offset_size: OffsetSizeBytes,
+}
+
+// According to the spec this is currently always = 1, and so we store this 
const for validation
+// purposes and to make that visible.
+const CORRECT_VERSION_VALUE: u8 = 1;
+
+impl VariantMetadataHeader {
+/// Tries to construct the variant metadata header, which has the form
+///  7 6  5   4  3 0
+/// +---+---+---+---+
+/// header  |   |   |   |version|
+/// +---+---+---+---+
+/// ^ ^
+/// | +-- sorted_strings
+/// +-- offset_size_minus_one
+/// The version is a 4-bit value that must always contain the value 1.
+/// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+/// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+/// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
+pub fn try_new(bytes: &[u8]) -> Result {
+let header = first_byte_from_slice(bytes)?;
+
+let version = header & 0x0F; // First four bits
+if version != CORRECT_VERSION_VALUE {
+let err_msg = format!(
+"The version bytes in the header is not 
{CORRECT_VERSION_VALUE}, got {:b}",
+version
+);
+return Err(ArrowError::InvalidArgumentError(err_msg));
+}
+let is_sorted = (header & 0x10) != 0; // Fifth bit
+let offset_size_minus_one = header >> 6; // Last two bits
+Ok(Self {
+version,
+is_sorted,
+offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
+})
+}
+

Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-23 Thread via GitHub


scovich commented on PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#issuecomment-2906015547

   > In my opinion the only outstanding questions are some of the details of 
handling Objects and arrays but I think we can iterate on that in subsequent 
PRs as we haven't really committed to an API
   >
   > What do you think @mkarbo and @scovich ? Is this good enough to merge and 
make follow on PRs?
   
   We're getting very close. I agree the specifics of objects and arrays can be 
follow-on work. 
   
   A few remaining items on my mind (some of which are comments that github 
oh-so-helpfully hides):
   * How much "extra" validation (= not immediately needed by the function that 
performs it) should we be doing? See 
https://github.com/apache/arrow-rs/pull/7535#discussion_r2105503464
   * The `VariantMetadata` API seems overly wide and complex, I wonder if we 
can simplify it? See 
https://github.com/apache/arrow-rs/pull/7535#discussion_r2105467545
   * Significant code simplifications still seem to be possible. A good chunk 
of this is IMO due to the previous two points. I'm interested in this because 
we're still finding ways to factor out redundant code to helper functions that 
will likely make the remaining pathfinding easier; also because this is very 
complex stuff, and the less code we express it in the smaller the cognitive 
burden and bug surface will be. Plus, it's something that gets harder and 
harder to fix later as the code grows and matures.
   
   I suspect one more pass will address the code simplifications. Not sure how 
to resolve the other two most efficiently? Maybe others could chime in so we 
have more voices to consider?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-23 Thread via GitHub


mapleFU commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2105517522


##
parquet-variant/src/variant.rs:
##
@@ -0,0 +1,730 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use crate::decoder::{
+self, get_basic_type, get_primitive_type, VariantBasicType, 
VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, 
string_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+num::TryFromIntError,
+ops::{Index, Range},
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+One = 1,
+Two = 2,
+Three = 3,
+Four = 4,
+}
+
+impl OffsetSizeBytes {
+/// Build from the `offset_size_minus_one` bits (see spec).
+fn try_new(offset_size_minus_one: u8) -> Result {
+use OffsetSizeBytes::*;
+let result = match offset_size_minus_one {
+0 => One,
+1 => Two,
+2 => Three,
+3 => Four,
+_ => {
+return Err(ArrowError::InvalidArgumentError(
+"offset_size_minus_one must be 0–3".to_string(),
+))
+}
+};
+Ok(result)
+}
+
+/// Return one unsigned little-endian value from `bytes`.
+///
+/// * `bytes` – the Variant-metadata buffer.
+/// * `byte_offset` – number of bytes to skip **before** reading the first
+///   value (usually `1` to move past the header byte).
+/// * `offset_index` – 0-based index **after** the skip
+///   (`0` is the first value, `1` the next, …).
+///
+/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+/// Three-byte values are zero-extended to 32 bits before the final
+/// fallible cast to `usize`.
+fn unpack_usize(
+&self,
+bytes: &[u8],
+byte_offset: usize,  // how many bytes to skip
+offset_index: usize, // which offset in an array of offsets
+) -> Result {
+use OffsetSizeBytes::*;
+let offset = byte_offset + (*self as usize) * offset_index;
+let result = match self {
+One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Three => {
+// Let's grab the three byte le-chunk first
+let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
+// Let's pad it and construct a padded u32 from it.
+let mut buf = [0u8; 4];
+buf[..3].copy_from_slice(&b3_chunks);
+u32::from_le_bytes(buf)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
+}
+Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+};
+Ok(result)
+}
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+version: u8,
+is_sorted: bool,
+/// Note: This is `offset_size_minus_one` + 1
+offset_size: OffsetSizeBytes,
+}
+
+// According to the spec this is currently always = 1, and so we store this 
const for validation
+// purposes and to make that visible.
+const CORRECT_VERSION_VALUE: u8 = 1;
+
+impl VariantMetadataHeader {
+/// Tries to construct the variant metadata header, which has the form
+///  7 6  5   4  3 0
+/// +---+---+---+---+
+/// header  |   |   |   |version|
+/// +---+---+---+---+
+/// ^ ^
+/// | +-- sorted_strings
+/// +-- offset_size_minus_one
+/// The version is a 4-bit value that must always contain the value 1.
+/// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+/// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+//

Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-23 Thread via GitHub


scovich commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2105514339


##
parquet-variant/src/variant.rs:
##
@@ -0,0 +1,730 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use crate::decoder::{
+self, get_basic_type, get_primitive_type, VariantBasicType, 
VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, 
string_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+num::TryFromIntError,
+ops::{Index, Range},
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+One = 1,
+Two = 2,
+Three = 3,
+Four = 4,
+}
+
+impl OffsetSizeBytes {
+/// Build from the `offset_size_minus_one` bits (see spec).
+fn try_new(offset_size_minus_one: u8) -> Result {
+use OffsetSizeBytes::*;
+let result = match offset_size_minus_one {
+0 => One,
+1 => Two,
+2 => Three,
+3 => Four,
+_ => {
+return Err(ArrowError::InvalidArgumentError(
+"offset_size_minus_one must be 0–3".to_string(),
+))
+}
+};
+Ok(result)
+}
+
+/// Return one unsigned little-endian value from `bytes`.
+///
+/// * `bytes` – the Variant-metadata buffer.
+/// * `byte_offset` – number of bytes to skip **before** reading the first
+///   value (usually `1` to move past the header byte).
+/// * `offset_index` – 0-based index **after** the skip
+///   (`0` is the first value, `1` the next, …).
+///
+/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+/// Three-byte values are zero-extended to 32 bits before the final
+/// fallible cast to `usize`.
+fn unpack_usize(
+&self,
+bytes: &[u8],
+byte_offset: usize,  // how many bytes to skip
+offset_index: usize, // which offset in an array of offsets
+) -> Result {
+use OffsetSizeBytes::*;
+let offset = byte_offset + (*self as usize) * offset_index;
+let result = match self {
+One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Three => {
+// Let's grab the three byte le-chunk first
+let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
+// Let's pad it and construct a padded u32 from it.
+let mut buf = [0u8; 4];
+buf[..3].copy_from_slice(&b3_chunks);
+u32::from_le_bytes(buf)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
+}
+Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+};
+Ok(result)
+}
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+version: u8,
+is_sorted: bool,
+/// Note: This is `offset_size_minus_one` + 1
+offset_size: OffsetSizeBytes,
+}
+
+// According to the spec this is currently always = 1, and so we store this 
const for validation
+// purposes and to make that visible.
+const CORRECT_VERSION_VALUE: u8 = 1;
+
+impl VariantMetadataHeader {
+/// Tries to construct the variant metadata header, which has the form
+///  7 6  5   4  3 0
+/// +---+---+---+---+
+/// header  |   |   |   |version|
+/// +---+---+---+---+
+/// ^ ^
+/// | +-- sorted_strings
+/// +-- offset_size_minus_one
+/// The version is a 4-bit value that must always contain the value 1.
+/// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+/// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+//

Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-23 Thread via GitHub


scovich commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2105516377


##
parquet-variant/src/variant.rs:
##
@@ -0,0 +1,714 @@
+use crate::decoder::{
+self, get_basic_type, get_primitive_type, VariantBasicType, 
VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, 
string_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+num::TryFromIntError,
+ops::{Index, Range},
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+One = 1,
+Two = 2,
+Three = 3,
+Four = 4,
+}
+
+impl OffsetSizeBytes {
+/// Build from the `offset_size_minus_one` bits (see spec).
+fn try_new(offset_size_minus_one: u8) -> Result {
+use OffsetSizeBytes::*;
+let result = match offset_size_minus_one {
+0 => One,
+1 => Two,
+2 => Three,
+3 => Four,
+_ => {
+return Err(ArrowError::InvalidArgumentError(
+"offset_size_minus_one must be 0–3".to_string(),
+))
+}
+};
+Ok(result)
+}
+
+/// Return one unsigned little-endian value from `bytes`.
+///
+/// * `bytes` – the Variant-metadata buffer.
+/// * `byte_offset` – number of bytes to skip **before** reading the first
+///   value (usually `1` to move past the header byte).
+/// * `offset_index` – 0-based index **after** the skip
+///   (`0` is the first value, `1` the next, …).
+///
+/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+/// Three-byte values are zero-extended to 32 bits before the final
+/// fallible cast to `usize`.
+fn unpack_usize(
+&self,
+bytes: &[u8],
+byte_offset: usize,  // how many bytes to skip
+offset_index: usize, // which offset in an array of offsets
+) -> Result {
+use OffsetSizeBytes::*;
+let offset = byte_offset + (*self as usize) * offset_index;
+let result = match self {
+One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Three => {
+// Let's grab the three byte le-chunk first
+let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
+// Let's pad it and construct a padded u32 from it.
+let mut buf = [0u8; 4];
+buf[..3].copy_from_slice(&b3_chunks);
+u32::from_le_bytes(buf)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
+}
+Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+};
+Ok(result)
+}
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+version: u8,
+is_sorted: bool,
+/// Note: This is `offset_size_minus_one` + 1
+offset_size: OffsetSizeBytes,
+}
+
+// According to the spec this is currently always = 1, and so we store this 
const for validation
+// purposes and to make that visible.
+const CORRECT_VERSION_VALUE: u8 = 1;
+
+impl VariantMetadataHeader {
+/// Tries to construct the variant metadata header, which has the form
+///  7 6  5   4  3 0
+/// +---+---+---+---+
+/// header  |   |   |   |version|
+/// +---+---+---+---+
+/// ^ ^
+/// | +-- sorted_strings
+/// +-- offset_size_minus_one
+/// The version is a 4-bit value that must always contain the value 1.
+/// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+/// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+/// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
+pub fn try_new(bytes: &[u8]) -> Result {
+let header = first_byte_from_slice(bytes)?;
+
+let version = header & 0x0F; // First four bits
+if version != CORRECT_VERSION_VALUE {
+let err_msg = format!(
+"The version bytes in the header is not 
{CORRECT_VERSION_VALUE}, got {:b}",
+version
+);
+return Err(ArrowError::InvalidArgumentError(err_msg));
+}
+let is_sorted = (header & 0x10) != 0; // Fifth bit
+let offset_size_minus_one = header >> 6; // Last two bits
+Ok(Self {
+version,
+is_sorted,
+offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
+})
+}
+

Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-23 Thread via GitHub


mapleFU commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2105466966


##
parquet-variant/src/variant.rs:
##
@@ -0,0 +1,730 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use crate::decoder::{
+self, get_basic_type, get_primitive_type, VariantBasicType, 
VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, 
string_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+num::TryFromIntError,
+ops::{Index, Range},
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+One = 1,
+Two = 2,
+Three = 3,
+Four = 4,
+}
+
+impl OffsetSizeBytes {
+/// Build from the `offset_size_minus_one` bits (see spec).
+fn try_new(offset_size_minus_one: u8) -> Result {
+use OffsetSizeBytes::*;
+let result = match offset_size_minus_one {
+0 => One,
+1 => Two,
+2 => Three,
+3 => Four,
+_ => {
+return Err(ArrowError::InvalidArgumentError(
+"offset_size_minus_one must be 0–3".to_string(),
+))
+}
+};
+Ok(result)
+}
+
+/// Return one unsigned little-endian value from `bytes`.
+///
+/// * `bytes` – the Variant-metadata buffer.
+/// * `byte_offset` – number of bytes to skip **before** reading the first
+///   value (usually `1` to move past the header byte).
+/// * `offset_index` – 0-based index **after** the skip
+///   (`0` is the first value, `1` the next, …).
+///
+/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+/// Three-byte values are zero-extended to 32 bits before the final
+/// fallible cast to `usize`.
+fn unpack_usize(
+&self,
+bytes: &[u8],
+byte_offset: usize,  // how many bytes to skip
+offset_index: usize, // which offset in an array of offsets
+) -> Result {
+use OffsetSizeBytes::*;
+let offset = byte_offset + (*self as usize) * offset_index;
+let result = match self {
+One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Three => {
+// Let's grab the three byte le-chunk first
+let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
+// Let's pad it and construct a padded u32 from it.
+let mut buf = [0u8; 4];
+buf[..3].copy_from_slice(&b3_chunks);
+u32::from_le_bytes(buf)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
+}
+Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+};
+Ok(result)
+}
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+version: u8,
+is_sorted: bool,
+/// Note: This is `offset_size_minus_one` + 1
+offset_size: OffsetSizeBytes,
+}
+
+// According to the spec this is currently always = 1, and so we store this 
const for validation
+// purposes and to make that visible.
+const CORRECT_VERSION_VALUE: u8 = 1;
+
+impl VariantMetadataHeader {
+/// Tries to construct the variant metadata header, which has the form
+///  7 6  5   4  3 0
+/// +---+---+---+---+
+/// header  |   |   |   |version|
+/// +---+---+---+---+
+/// ^ ^
+/// | +-- sorted_strings
+/// +-- offset_size_minus_one
+/// The version is a 4-bit value that must always contain the value 1.
+/// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+/// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+//

Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-23 Thread via GitHub


mapleFU commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2105462128


##
parquet-variant/src/variant.rs:
##
@@ -0,0 +1,714 @@
+use crate::decoder::{
+self, get_basic_type, get_primitive_type, VariantBasicType, 
VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, 
string_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+num::TryFromIntError,
+ops::{Index, Range},
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+One = 1,
+Two = 2,
+Three = 3,
+Four = 4,
+}
+
+impl OffsetSizeBytes {
+/// Build from the `offset_size_minus_one` bits (see spec).
+fn try_new(offset_size_minus_one: u8) -> Result {
+use OffsetSizeBytes::*;
+let result = match offset_size_minus_one {
+0 => One,
+1 => Two,
+2 => Three,
+3 => Four,
+_ => {
+return Err(ArrowError::InvalidArgumentError(
+"offset_size_minus_one must be 0–3".to_string(),
+))
+}
+};
+Ok(result)
+}
+
+/// Return one unsigned little-endian value from `bytes`.
+///
+/// * `bytes` – the Variant-metadata buffer.
+/// * `byte_offset` – number of bytes to skip **before** reading the first
+///   value (usually `1` to move past the header byte).
+/// * `offset_index` – 0-based index **after** the skip
+///   (`0` is the first value, `1` the next, …).
+///
+/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+/// Three-byte values are zero-extended to 32 bits before the final
+/// fallible cast to `usize`.
+fn unpack_usize(
+&self,
+bytes: &[u8],
+byte_offset: usize,  // how many bytes to skip
+offset_index: usize, // which offset in an array of offsets
+) -> Result {
+use OffsetSizeBytes::*;
+let offset = byte_offset + (*self as usize) * offset_index;
+let result = match self {
+One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Three => {
+// Let's grab the three byte le-chunk first
+let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
+// Let's pad it and construct a padded u32 from it.
+let mut buf = [0u8; 4];
+buf[..3].copy_from_slice(&b3_chunks);
+u32::from_le_bytes(buf)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
+}
+Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+};
+Ok(result)
+}
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+version: u8,
+is_sorted: bool,
+/// Note: This is `offset_size_minus_one` + 1
+offset_size: OffsetSizeBytes,
+}
+
+// According to the spec this is currently always = 1, and so we store this 
const for validation
+// purposes and to make that visible.
+const CORRECT_VERSION_VALUE: u8 = 1;
+
+impl VariantMetadataHeader {
+/// Tries to construct the variant metadata header, which has the form
+///  7 6  5   4  3 0
+/// +---+---+---+---+
+/// header  |   |   |   |version|
+/// +---+---+---+---+
+/// ^ ^
+/// | +-- sorted_strings
+/// +-- offset_size_minus_one
+/// The version is a 4-bit value that must always contain the value 1.
+/// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+/// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+/// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
+pub fn try_new(bytes: &[u8]) -> Result {
+let header = first_byte_from_slice(bytes)?;
+
+let version = header & 0x0F; // First four bits
+if version != CORRECT_VERSION_VALUE {
+let err_msg = format!(
+"The version bytes in the header is not 
{CORRECT_VERSION_VALUE}, got {:b}",
+version
+);
+return Err(ArrowError::InvalidArgumentError(err_msg));
+}
+let is_sorted = (header & 0x10) != 0; // Fifth bit
+let offset_size_minus_one = header >> 6; // Last two bits
+Ok(Self {
+version,
+is_sorted,
+offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
+})
+}
+

Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-23 Thread via GitHub


mapleFU commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2105461519


##
parquet-variant/src/variant.rs:
##
@@ -0,0 +1,714 @@
+use crate::decoder::{
+self, get_basic_type, get_primitive_type, VariantBasicType, 
VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, 
string_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+num::TryFromIntError,
+ops::{Index, Range},
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+One = 1,
+Two = 2,
+Three = 3,
+Four = 4,
+}
+
+impl OffsetSizeBytes {
+/// Build from the `offset_size_minus_one` bits (see spec).
+fn try_new(offset_size_minus_one: u8) -> Result {
+use OffsetSizeBytes::*;
+let result = match offset_size_minus_one {
+0 => One,
+1 => Two,
+2 => Three,
+3 => Four,
+_ => {
+return Err(ArrowError::InvalidArgumentError(
+"offset_size_minus_one must be 0–3".to_string(),
+))
+}
+};
+Ok(result)
+}
+
+/// Return one unsigned little-endian value from `bytes`.
+///
+/// * `bytes` – the Variant-metadata buffer.
+/// * `byte_offset` – number of bytes to skip **before** reading the first
+///   value (usually `1` to move past the header byte).
+/// * `offset_index` – 0-based index **after** the skip
+///   (`0` is the first value, `1` the next, …).
+///
+/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+/// Three-byte values are zero-extended to 32 bits before the final
+/// fallible cast to `usize`.
+fn unpack_usize(
+&self,
+bytes: &[u8],
+byte_offset: usize,  // how many bytes to skip
+offset_index: usize, // which offset in an array of offsets
+) -> Result {
+use OffsetSizeBytes::*;
+let offset = byte_offset + (*self as usize) * offset_index;
+let result = match self {
+One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Three => {
+// Let's grab the three byte le-chunk first
+let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
+// Let's pad it and construct a padded u32 from it.
+let mut buf = [0u8; 4];
+buf[..3].copy_from_slice(&b3_chunks);
+u32::from_le_bytes(buf)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
+}
+Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+};
+Ok(result)
+}
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+version: u8,
+is_sorted: bool,
+/// Note: This is `offset_size_minus_one` + 1
+offset_size: OffsetSizeBytes,
+}
+
+// According to the spec this is currently always = 1, and so we store this 
const for validation
+// purposes and to make that visible.
+const CORRECT_VERSION_VALUE: u8 = 1;
+
+impl VariantMetadataHeader {
+/// Tries to construct the variant metadata header, which has the form
+///  7 6  5   4  3 0
+/// +---+---+---+---+
+/// header  |   |   |   |version|
+/// +---+---+---+---+
+/// ^ ^
+/// | +-- sorted_strings
+/// +-- offset_size_minus_one
+/// The version is a 4-bit value that must always contain the value 1.
+/// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+/// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+/// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
+pub fn try_new(bytes: &[u8]) -> Result {
+let header = first_byte_from_slice(bytes)?;
+
+let version = header & 0x0F; // First four bits
+if version != CORRECT_VERSION_VALUE {
+let err_msg = format!(
+"The version bytes in the header is not 
{CORRECT_VERSION_VALUE}, got {:b}",
+version
+);
+return Err(ArrowError::InvalidArgumentError(err_msg));
+}
+let is_sorted = (header & 0x10) != 0; // Fifth bit
+let offset_size_minus_one = header >> 6; // Last two bits
+Ok(Self {
+version,
+is_sorted,
+offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
+})
+}
+

Re: [PR] Foundation of API for reading Variant data and metadata [arrow-rs]

2025-05-23 Thread via GitHub


alamb commented on code in PR #7535:
URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2105442415


##
parquet-variant/src/variant.rs:
##
@@ -0,0 +1,714 @@
+use crate::decoder::{
+self, get_basic_type, get_primitive_type, VariantBasicType, 
VariantPrimitiveType,
+};
+use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, 
string_from_slice};
+use arrow_schema::ArrowError;
+use std::{
+num::TryFromIntError,
+ops::{Index, Range},
+};
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+enum OffsetSizeBytes {
+One = 1,
+Two = 2,
+Three = 3,
+Four = 4,
+}
+
+impl OffsetSizeBytes {
+/// Build from the `offset_size_minus_one` bits (see spec).
+fn try_new(offset_size_minus_one: u8) -> Result {
+use OffsetSizeBytes::*;
+let result = match offset_size_minus_one {
+0 => One,
+1 => Two,
+2 => Three,
+3 => Four,
+_ => {
+return Err(ArrowError::InvalidArgumentError(
+"offset_size_minus_one must be 0–3".to_string(),
+))
+}
+};
+Ok(result)
+}
+
+/// Return one unsigned little-endian value from `bytes`.
+///
+/// * `bytes` – the Variant-metadata buffer.
+/// * `byte_offset` – number of bytes to skip **before** reading the first
+///   value (usually `1` to move past the header byte).
+/// * `offset_index` – 0-based index **after** the skip
+///   (`0` is the first value, `1` the next, …).
+///
+/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+/// Three-byte values are zero-extended to 32 bits before the final
+/// fallible cast to `usize`.
+fn unpack_usize(
+&self,
+bytes: &[u8],
+byte_offset: usize,  // how many bytes to skip
+offset_index: usize, // which offset in an array of offsets
+) -> Result {
+use OffsetSizeBytes::*;
+let offset = byte_offset + (*self as usize) * offset_index;
+let result = match self {
+One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
+Three => {
+// Let's grab the three byte le-chunk first
+let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
+// Let's pad it and construct a padded u32 from it.
+let mut buf = [0u8; 4];
+buf[..3].copy_from_slice(&b3_chunks);
+u32::from_le_bytes(buf)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
+}
+Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
+.try_into()
+.map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+};
+Ok(result)
+}
+}
+
+#[derive(Clone, Debug, Copy, PartialEq)]
+pub(crate) struct VariantMetadataHeader {
+version: u8,
+is_sorted: bool,
+/// Note: This is `offset_size_minus_one` + 1
+offset_size: OffsetSizeBytes,
+}
+
+// According to the spec this is currently always = 1, and so we store this 
const for validation
+// purposes and to make that visible.
+const CORRECT_VERSION_VALUE: u8 = 1;
+
+impl VariantMetadataHeader {
+/// Tries to construct the variant metadata header, which has the form
+///  7 6  5   4  3 0
+/// +---+---+---+---+
+/// header  |   |   |   |version|
+/// +---+---+---+---+
+/// ^ ^
+/// | +-- sorted_strings
+/// +-- offset_size_minus_one
+/// The version is a 4-bit value that must always contain the value 1.
+/// - sorted_strings is a 1-bit value indicating whether dictionary 
strings are sorted and unique.
+/// - offset_size_minus_one is a 2-bit value providing the number of bytes 
per dictionary size and offset field.
+/// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
+pub fn try_new(bytes: &[u8]) -> Result {
+let header = first_byte_from_slice(bytes)?;
+
+let version = header & 0x0F; // First four bits
+if version != CORRECT_VERSION_VALUE {
+let err_msg = format!(
+"The version bytes in the header is not 
{CORRECT_VERSION_VALUE}, got {:b}",
+version
+);
+return Err(ArrowError::InvalidArgumentError(err_msg));
+}
+let is_sorted = (header & 0x10) != 0; // Fifth bit
+let offset_size_minus_one = header >> 6; // Last two bits
+Ok(Self {
+version,
+is_sorted,
+offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
+})
+}
+}