This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 07ae1dd7ce [Variant] Introduce new BorrowedShreddingState concept
(#8499)
07ae1dd7ce is described below
commit 07ae1dd7ceee83a5e17e209626cc92230cd04609
Author: Ryan Johnson <[email protected]>
AuthorDate: Tue Sep 30 03:36:22 2025 -0600
[Variant] Introduce new BorrowedShreddingState concept (#8499)
# Which issue does this PR close?
- Relates to https://github.com/apache/arrow-rs/issues/8336
# Rationale for this change
While pathfinding support for unshredding variant objects and arrays, I
ran into the same issue that `variant_get` already suffers --
`ShreddingState` is inconvenient because it owns the `value` and
`typed_value` columns it refers to, even tho borrowing them usually
suffices.
# What changes are included in this PR?
Define a new `BorrowedShreddingState` which does what it says, and
update `ShreddedPathStep` (in variant_get.rs) to use it.
Also, make the constructor fallible in order to correctly report casting
failures if the `value` column is not a binary view.
# Are these changes tested?
Yes, existing tests cover this code.
# Are there any user-facing changes?
New `TryFrom` and `From` implementations.
`impl From<&StructArray> for ShreddingState` was replaced by a suitable
`TryFrom`.
---
parquet-variant-compute/src/lib.rs | 2 +-
parquet-variant-compute/src/variant_array.rs | 110 ++++++++++++++++++++-------
parquet-variant-compute/src/variant_get.rs | 25 +++---
3 files changed, 97 insertions(+), 40 deletions(-)
diff --git a/parquet-variant-compute/src/lib.rs
b/parquet-variant-compute/src/lib.rs
index 496d550d95..b3876ef6ab 100644
--- a/parquet-variant-compute/src/lib.rs
+++ b/parquet-variant-compute/src/lib.rs
@@ -46,7 +46,7 @@ mod variant_array_builder;
pub mod variant_get;
mod variant_to_arrow;
-pub use variant_array::{ShreddingState, VariantArray, VariantType};
+pub use variant_array::{BorrowedShreddingState, ShreddingState, VariantArray,
VariantType};
pub use variant_array_builder::{VariantArrayBuilder, VariantValueArrayBuilder};
pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options};
diff --git a/parquet-variant-compute/src/variant_array.rs
b/parquet-variant-compute/src/variant_array.rs
index d366d702aa..79ef37f0c4 100644
--- a/parquet-variant-compute/src/variant_array.rs
+++ b/parquet-variant-compute/src/variant_array.rs
@@ -272,26 +272,11 @@ impl VariantArray {
)));
};
- // Extract value and typed_value fields
- let value = if let Some(value_col) = inner.column_by_name("value") {
- if let Some(binary_view) = value_col.as_binary_view_opt() {
- Some(binary_view.clone())
- } else {
- return Err(ArrowError::NotYetImplemented(format!(
- "VariantArray 'value' field must be BinaryView, got {}",
- value_col.data_type()
- )));
- }
- } else {
- None
- };
- let typed_value = inner.column_by_name("typed_value").cloned();
-
// Note these clones are cheap, they just bump the ref count
Ok(Self {
inner: inner.clone(),
metadata: metadata.clone(),
- shredding_state: ShreddingState::new(value, typed_value),
+ shredding_state: ShreddingState::try_from(inner)?,
})
}
@@ -521,7 +506,7 @@ impl ShreddedVariantFieldArray {
// Note this clone is cheap, it just bumps the ref count
Ok(Self {
inner: inner_struct.clone(),
- shredding_state: ShreddingState::from(inner_struct),
+ shredding_state: ShreddingState::try_from(inner_struct)?,
})
}
@@ -660,7 +645,7 @@ impl ShreddingState {
/// Create a new `ShreddingState` from the given `value` and `typed_value`
fields
///
/// Note you can create a `ShreddingState` from a &[`StructArray`] using
- /// `ShreddingState::from(&struct_array)`, for example:
+ /// `ShreddingState::try_from(&struct_array)`, for example:
///
/// ```no_run
/// # use arrow::array::StructArray;
@@ -669,7 +654,7 @@ impl ShreddingState {
/// # unimplemented!()
/// # }
/// let struct_array: StructArray = get_struct_array();
- /// let shredding_state = ShreddingState::from(&struct_array);
+ /// let shredding_state = ShreddingState::try_from(&struct_array).unwrap();
/// ```
pub fn new(value: Option<BinaryViewArray>, typed_value: Option<ArrayRef>)
-> Self {
Self { value, typed_value }
@@ -685,6 +670,14 @@ impl ShreddingState {
self.typed_value.as_ref()
}
+ /// Returns a borrowed version of this shredding state
+ pub fn borrow(&self) -> BorrowedShreddingState<'_> {
+ BorrowedShreddingState {
+ value: self.value_field(),
+ typed_value: self.typed_value_field(),
+ }
+ }
+
/// Slice all the underlying arrays
pub fn slice(&self, offset: usize, length: usize) -> Self {
Self {
@@ -694,14 +687,79 @@ impl ShreddingState {
}
}
-impl From<&StructArray> for ShreddingState {
- fn from(inner_struct: &StructArray) -> Self {
- let value = inner_struct
- .column_by_name("value")
- .and_then(|col| col.as_binary_view_opt().cloned());
- let typed_value = inner_struct.column_by_name("typed_value").cloned();
+/// Similar to [`ShreddingState`] except it holds borrowed references of the
target arrays. Useful
+/// for avoiding clone operations when the caller does not need a
self-standing shredding state.
+#[derive(Clone, Debug)]
+pub struct BorrowedShreddingState<'a> {
+ value: Option<&'a BinaryViewArray>,
+ typed_value: Option<&'a ArrayRef>,
+}
- ShreddingState::new(value, typed_value)
+impl<'a> BorrowedShreddingState<'a> {
+ /// Create a new `BorrowedShreddingState` from the given `value` and
`typed_value` fields
+ ///
+ /// Note you can create a `BorrowedShreddingState` from a &[`StructArray`]
using
+ /// `BorrowedShreddingState::try_from(&struct_array)`, for example:
+ ///
+ /// ```no_run
+ /// # use arrow::array::StructArray;
+ /// # use parquet_variant_compute::BorrowedShreddingState;
+ /// # fn get_struct_array() -> StructArray {
+ /// # unimplemented!()
+ /// # }
+ /// let struct_array: StructArray = get_struct_array();
+ /// let shredding_state =
BorrowedShreddingState::try_from(&struct_array).unwrap();
+ /// ```
+ pub fn new(value: Option<&'a BinaryViewArray>, typed_value: Option<&'a
ArrayRef>) -> Self {
+ Self { value, typed_value }
+ }
+
+ /// Return a reference to the value field, if present
+ pub fn value_field(&self) -> Option<&'a BinaryViewArray> {
+ self.value
+ }
+
+ /// Return a reference to the typed_value field, if present
+ pub fn typed_value_field(&self) -> Option<&'a ArrayRef> {
+ self.typed_value
+ }
+}
+
+impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> {
+ type Error = ArrowError;
+
+ fn try_from(inner_struct: &'a StructArray) -> Result<Self, ArrowError> {
+ // The `value` column need not exist, but if it does it must be a
binary view.
+ let value = if let Some(value_col) =
inner_struct.column_by_name("value") {
+ let Some(binary_view) = value_col.as_binary_view_opt() else {
+ return Err(ArrowError::NotYetImplemented(format!(
+ "VariantArray 'value' field must be BinaryView, got {}",
+ value_col.data_type()
+ )));
+ };
+ Some(binary_view)
+ } else {
+ None
+ };
+ let typed_value = inner_struct.column_by_name("typed_value");
+ Ok(BorrowedShreddingState::new(value, typed_value))
+ }
+}
+
+impl TryFrom<&StructArray> for ShreddingState {
+ type Error = ArrowError;
+
+ fn try_from(inner_struct: &StructArray) -> Result<Self, ArrowError> {
+ Ok(BorrowedShreddingState::try_from(inner_struct)?.into())
+ }
+}
+
+impl From<BorrowedShreddingState<'_>> for ShreddingState {
+ fn from(state: BorrowedShreddingState<'_>) -> Self {
+ ShreddingState {
+ value: state.value_field().cloned(),
+ typed_value: state.typed_value_field().cloned(),
+ }
}
}
diff --git a/parquet-variant-compute/src/variant_get.rs
b/parquet-variant-compute/src/variant_get.rs
index a923732ca4..977507af42 100644
--- a/parquet-variant-compute/src/variant_get.rs
+++ b/parquet-variant-compute/src/variant_get.rs
@@ -23,16 +23,16 @@ use arrow::{
use arrow_schema::{ArrowError, DataType, FieldRef};
use parquet_variant::{VariantPath, VariantPathElement};
-use crate::variant_array::ShreddingState;
+use crate::variant_array::BorrowedShreddingState;
use crate::variant_to_arrow::make_variant_to_arrow_row_builder;
use crate::VariantArray;
use arrow::array::AsArray;
use std::sync::Arc;
-pub(crate) enum ShreddedPathStep {
+pub(crate) enum ShreddedPathStep<'a> {
/// Path step succeeded, return the new shredding state
- Success(ShreddingState),
+ Success(BorrowedShreddingState<'a>),
/// The path element is not present in the `typed_value` column and there
is no `value` column,
/// so we know it does not exist. It, and all paths under it, are all-NULL.
Missing,
@@ -47,18 +47,16 @@ pub(crate) enum ShreddedPathStep {
/// level, or if `typed_value` is not a struct, or if the requested field name
does not exist.
///
/// TODO: Support `VariantPathElement::Index`? It wouldn't be easy, and maybe
not even possible.
-pub(crate) fn follow_shredded_path_element(
- shredding_state: &ShreddingState,
+pub(crate) fn follow_shredded_path_element<'a>(
+ shredding_state: &BorrowedShreddingState<'a>,
path_element: &VariantPathElement<'_>,
cast_options: &CastOptions,
-) -> Result<ShreddedPathStep> {
+) -> Result<ShreddedPathStep<'a>> {
// If the requested path element is not present in `typed_value`, and
`value` is missing, then
// we know it does not exist; it, and all paths under it, are all-NULL.
- let missing_path_step = || {
- let Some(_value_field) = shredding_state.value_field() else {
- return ShreddedPathStep::Missing;
- };
- ShreddedPathStep::NotShredded
+ let missing_path_step = || match shredding_state.value_field() {
+ Some(_) => ShreddedPathStep::NotShredded,
+ None => ShreddedPathStep::Missing,
};
let Some(typed_value) = shredding_state.typed_value_field() else {
@@ -98,7 +96,8 @@ pub(crate) fn follow_shredded_path_element(
))
})?;
- Ok(ShreddedPathStep::Success(struct_array.into()))
+ let state = BorrowedShreddingState::try_from(struct_array)?;
+ Ok(ShreddedPathStep::Success(state))
}
VariantPathElement::Index { .. } => {
// TODO: Support array indexing. Among other things, it will
require slicing not
@@ -152,7 +151,7 @@ fn shredded_get_path(
// Peel away the prefix of path elements that traverses the shredded parts
of this variant
// column. Shredding will traverse the rest of the path on a per-row basis.
- let mut shredding_state = input.shredding_state().clone();
+ let mut shredding_state = input.shredding_state().borrow();
let mut accumulated_nulls = input.inner().nulls().cloned();
let mut path_index = 0;
for path_element in path {