This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 72b45760c1 [Variant] Support Binary/LargeBinary children (#9610)
72b45760c1 is described below
commit 72b45760c131328a53109977d97b3e6742d454ef
Author: Adam Gutglick <[email protected]>
AuthorDate: Wed Apr 15 21:59:53 2026 +0100
[Variant] Support Binary/LargeBinary children (#9610)
# Which issue does this PR close?
- Closes #8387
# Rationale for this change
Improves spec compliance and improves performance, allowing more
zero-copy copies between parquet and/or other implementation.
# What changes are included in this PR?
The main change is replacing `VariantArray`'s children with opaque
`ArrayRef`, and handling validation accordingly.
# Are these changes tested?
All existing tests are still in place.
# Are there any user-facing changes?
Yes, some functions on `VariantArray` now take or return `ArrayRef`,
`from_parts`, `value_field`, `metadata_field`.
---------
Signed-off-by: Adam Gutglick <[email protected]>
Co-authored-by: Ryan Johnson <[email protected]>
---
parquet-variant-compute/src/shred_variant.rs | 126 ++++++-----
parquet-variant-compute/src/unshred_variant.rs | 117 +++++-----
parquet-variant-compute/src/variant_array.rs | 236 +++++++++++++--------
.../src/variant_array_builder.rs | 2 +-
parquet-variant-compute/src/variant_get.rs | 64 +++---
parquet-variant-compute/src/variant_to_arrow.rs | 19 +-
6 files changed, 331 insertions(+), 233 deletions(-)
diff --git a/parquet-variant-compute/src/shred_variant.rs
b/parquet-variant-compute/src/shred_variant.rs
index 7b181179d3..7b919660d6 100644
--- a/parquet-variant-compute/src/shred_variant.rs
+++ b/parquet-variant-compute/src/shred_variant.rs
@@ -96,7 +96,7 @@ pub fn shred_variant(array: &VariantArray, as_type:
&DataType) -> Result<Variant
let (value, typed_value, nulls) = builder.finish()?;
Ok(VariantArray::from_parts(
array.metadata_field().clone(),
- Some(value),
+ Some(Arc::new(value)),
Some(typed_value),
nulls,
))
@@ -443,8 +443,11 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
let mut builder = StructArrayBuilder::new();
for (field_name, typed_value_builder) in self.typed_value_builders {
let (value, typed_value, nulls) = typed_value_builder.finish()?;
- let array =
- ShreddedVariantFieldArray::from_parts(Some(value),
Some(typed_value), nulls);
+ let array = ShreddedVariantFieldArray::from_parts(
+ Some(Arc::new(value)),
+ Some(typed_value),
+ nulls,
+ );
builder = builder.with_field(field_name, ArrayRef::from(array),
false);
}
if let Some(nulls) = self.typed_value_nulls.finish() {
@@ -689,6 +692,7 @@ impl VariantSchemaNode {
mod tests {
use super::*;
use crate::VariantArrayBuilder;
+ use crate::variant_array::{binary_array_value, variant_from_arrays_at};
use arrow::array::{
Array, BinaryViewArray, FixedSizeBinaryArray, Float64Array,
GenericListArray,
GenericListViewArray, Int64Array, LargeBinaryArray, LargeStringArray,
ListArray,
@@ -867,7 +871,8 @@ mod tests {
) {
assert_eq!(array.len(), expected_len);
- let fallbacks = (array.value_field().unwrap(),
Some(array.metadata_field()));
+ let fallback_value = array.value_field().unwrap();
+ let fallback_metadata = array.metadata_field();
let array = downcast_list_like_array::<O>(array);
assert_eq!(
@@ -887,7 +892,7 @@ mod tests {
);
assert_eq!(
array.len(),
- fallbacks.0.len(),
+ fallback_value.len(),
"fallbacks value field should match array length"
);
@@ -902,7 +907,7 @@ mod tests {
// Successfully shredded: typed list value present, no
fallback value
assert!(array.is_valid(idx));
assert_eq!(array.value_size(idx), *len);
- assert!(fallbacks.0.is_null(idx));
+ assert!(fallback_value.is_null(idx));
}
None => {
// Unable to shred: typed list value absent, fallback
should carry the variant
@@ -910,20 +915,25 @@ mod tests {
assert_eq!(array.value_size(idx), O::zero());
match expected_fallback {
Some(expected_variant) => {
- assert!(fallbacks.0.is_valid(idx));
- let metadata_bytes = fallbacks
- .1
- .filter(|m| m.is_valid(idx))
- .map(|m| m.value(idx))
- .filter(|bytes| !bytes.is_empty())
- .unwrap_or(EMPTY_VARIANT_METADATA_BYTES);
+ assert!(fallback_value.is_valid(idx));
+ let metadata_bytes =
+ binary_array_value(fallback_metadata.as_ref(),
idx).unwrap();
+ let metadata_bytes =
+ if fallback_metadata.is_valid(idx) &&
!metadata_bytes.is_empty() {
+ metadata_bytes
+ } else {
+ EMPTY_VARIANT_METADATA_BYTES
+ };
assert_eq!(
- Variant::new(metadata_bytes,
fallbacks.0.value(idx)),
+ Variant::new(
+ metadata_bytes,
+
binary_array_value(fallback_value.as_ref(), idx).unwrap()
+ ),
expected_variant.clone()
);
}
None => {
- assert!(fallbacks.0.is_null(idx));
+ assert!(fallback_value.is_null(idx));
}
}
}
@@ -983,7 +993,10 @@ mod tests {
Some(expected_variant) => {
assert!(element_fallbacks.is_valid(idx));
assert_eq!(
- Variant::new(EMPTY_VARIANT_METADATA_BYTES,
element_fallbacks.value(idx)),
+ Variant::new(
+ EMPTY_VARIANT_METADATA_BYTES,
+ binary_array_value(element_fallbacks.as_ref(),
idx).unwrap()
+ ),
expected_variant.clone()
);
}
@@ -1129,7 +1142,7 @@ mod tests {
#[test]
fn test_all_null_input() {
// Create VariantArray with no value field (all null case)
- let metadata = BinaryViewArray::from_iter_values([&[1u8, 0u8]]); //
minimal valid metadata
+ let metadata = Arc::new(BinaryViewArray::from_iter_values([&[1u8,
0u8]])); // minimal valid metadata
let all_null_array = VariantArray::from_parts(metadata, None, None,
None);
let result = shred_variant(&all_null_array, &DataType::Int64).unwrap();
@@ -1243,7 +1256,7 @@ mod tests {
assert!(!value_field.is_null(1)); // value should contain original
assert!(typed_value_field.is_null(1)); // typed_value should be null
assert_eq!(
- Variant::new(metadata_field.value(1), value_field.value(1)),
+ variant_from_arrays_at(metadata_field, value_field, 1).unwrap(),
Variant::from("hello")
);
@@ -1259,7 +1272,7 @@ mod tests {
assert!(!result.is_null(4));
assert!(!value_field.is_null(4)); // should contain Variant::Null
assert_eq!(
- Variant::new(metadata_field.value(4), value_field.value(4)),
+ variant_from_arrays_at(metadata_field, value_field, 4).unwrap(),
Variant::Null
);
assert!(typed_value_field.is_null(4));
@@ -1336,7 +1349,7 @@ mod tests {
assert!(value.is_valid(1));
assert!(typed_value.is_null(1));
assert_eq!(
- Variant::new(metadata.value(1), value.value(1)),
+ variant_from_arrays_at(metadata, value, 1).unwrap(),
Variant::from(42i64)
);
@@ -1350,7 +1363,7 @@ mod tests {
assert!(value.is_valid(3));
assert!(typed_value.is_null(3));
assert_eq!(
- Variant::new(metadata.value(3), value.value(3)),
+ variant_from_arrays_at(metadata, value, 3).unwrap(),
Variant::Null
);
@@ -1392,7 +1405,7 @@ mod tests {
assert!(value.is_valid(1));
assert!(typed_value.is_null(1));
assert_eq!(
- Variant::new(metadata.value(1), value.value(1)),
+ variant_from_arrays_at(metadata, value, 1).unwrap(),
Variant::from("not_binary")
);
@@ -1406,7 +1419,7 @@ mod tests {
assert!(value.is_valid(3));
assert!(typed_value.is_null(3));
assert_eq!(
- Variant::new(metadata.value(3), value.value(3)),
+ variant_from_arrays_at(metadata, value, 3).unwrap(),
Variant::Null
);
@@ -1682,10 +1695,10 @@ mod tests {
.unwrap();
let outer_fallbacks = outer_elements.value_field().unwrap();
- let outer_metadata =
BinaryViewArray::from_iter_values(std::iter::repeat_n(
+ let outer_metadata =
Arc::new(BinaryViewArray::from_iter_values(std::iter::repeat_n(
EMPTY_VARIANT_METADATA_BYTES,
outer_elements.len(),
- ));
+ )));
let outer_variant = VariantArray::from_parts(
outer_metadata,
Some(outer_fallbacks.clone()),
@@ -1792,7 +1805,10 @@ mod tests {
// null is stored as Variant::Null in values
assert!(id_values.is_valid(1));
assert_eq!(
- Variant::new(EMPTY_VARIANT_METADATA_BYTES, id_values.value(1)),
+ Variant::new(
+ EMPTY_VARIANT_METADATA_BYTES,
+ binary_array_value(id_values.as_ref(), 1).unwrap()
+ ),
Variant::Null
);
assert!(id_typed_values.is_null(1));
@@ -1866,7 +1882,6 @@ mod tests {
assert_eq!(result.len(), 9);
let metadata = result.metadata_field();
-
let value = result.value_field().unwrap();
let typed_value = result
.typed_value_field()
@@ -1882,24 +1897,14 @@ mod tests {
let age_field =
ShreddedVariantFieldArray::try_new(typed_value.column_by_name("age").unwrap()).unwrap();
- let score_value = score_field
- .value_field()
- .unwrap()
- .as_any()
- .downcast_ref::<BinaryViewArray>()
- .unwrap();
+ let score_value = score_field.value_field().unwrap();
let score_typed_value = score_field
.typed_value_field()
.unwrap()
.as_any()
.downcast_ref::<Float64Array>()
.unwrap();
- let age_value = age_field
- .value_field()
- .unwrap()
- .as_any()
- .downcast_ref::<BinaryViewArray>()
- .unwrap();
+ let age_value = age_field.value_field().unwrap();
let age_typed_value = age_field
.typed_value_field()
.unwrap()
@@ -1918,10 +1923,10 @@ mod tests {
}
fn get_value<'m, 'v>(
i: usize,
- metadata: &'m BinaryViewArray,
- value: &'v BinaryViewArray,
+ metadata: &'m dyn Array,
+ value: &'v dyn Array,
) -> Variant<'m, 'v> {
- Variant::new(metadata.value(i), value.value(i))
+ variant_from_arrays_at(metadata, value, i).unwrap()
}
let expect = |i, expected_result:
Option<ShreddedValue<ShreddedStruct>>| {
match expected_result {
@@ -1933,7 +1938,10 @@ mod tests {
match expected_value {
Some(expected_value) => {
assert!(value.is_valid(i));
- assert_eq!(expected_value, get_value(i, metadata,
value));
+ assert_eq!(
+ expected_value,
+ get_value(i, metadata.as_ref(), value.as_ref())
+ );
}
None => {
assert!(value.is_null(i));
@@ -1952,7 +1960,7 @@ mod tests {
assert!(score_value.is_valid(i));
assert_eq!(
expected_score_value,
- get_value(i, metadata, score_value)
+ get_value(i, metadata.as_ref(),
score_value.as_ref())
);
}
None => {
@@ -1973,7 +1981,7 @@ mod tests {
assert!(age_value.is_valid(i));
assert_eq!(
expected_age_value,
- get_value(i, metadata, age_value)
+ get_value(i, metadata.as_ref(),
age_value.as_ref())
);
}
None => {
@@ -2114,7 +2122,7 @@ mod tests {
// Helper to correctly create a variant object using a row's existing
metadata
let object_with_foo_field = |i| {
use parquet_variant::{ParentState, ValueBuilder, VariantMetadata};
- let metadata = VariantMetadata::new(metadata.value(i));
+ let metadata =
VariantMetadata::new(binary_array_value(metadata.as_ref(), i).unwrap());
let mut metadata_builder = ReadOnlyMetadataBuilder::new(&metadata);
let mut value_builder = ValueBuilder::new();
let state = ParentState::variant(&mut value_builder, &mut
metadata_builder);
@@ -2213,7 +2221,7 @@ mod tests {
assert!(value_field.is_null(2));
assert!(value_field.is_valid(3));
assert_eq!(
- Variant::new(result.metadata_field().value(3),
value_field.value(3)),
+ variant_from_arrays_at(result.metadata_field(), value_field,
3).unwrap(),
Variant::from("not an object")
);
assert!(value_field.is_null(4));
@@ -2231,10 +2239,10 @@ mod tests {
.unwrap();
assert_list_structure_and_elements::<Int64Type, i32>(
&VariantArray::from_parts(
- BinaryViewArray::from_iter_values(std::iter::repeat_n(
+ Arc::new(BinaryViewArray::from_iter_values(std::iter::repeat_n(
EMPTY_VARIANT_METADATA_BYTES,
scores_field.len(),
- )),
+ ))),
Some(scores_field.value_field().unwrap().clone()),
Some(scores_field.typed_value_field().unwrap().clone()),
None,
@@ -2350,24 +2358,14 @@ mod tests {
ShreddedVariantFieldArray::try_new(typed_value.column_by_name("session_id").unwrap())
.unwrap();
- let id_value = id_field
- .value_field()
- .unwrap()
- .as_any()
- .downcast_ref::<BinaryViewArray>()
- .unwrap();
+ let id_value = id_field.value_field().unwrap();
let id_typed_value = id_field
.typed_value_field()
.unwrap()
.as_any()
.downcast_ref::<FixedSizeBinaryArray>()
.unwrap();
- let session_id_value = session_id_field
- .value_field()
- .unwrap()
- .as_any()
- .downcast_ref::<BinaryViewArray>()
- .unwrap();
+ let session_id_value = session_id_field.value_field().unwrap();
let session_id_typed_value = session_id_field
.typed_value_field()
.unwrap()
@@ -2404,7 +2402,7 @@ mod tests {
assert_eq!(session_id_typed_value.value(1), mock_uuid_3.as_bytes());
// Verify the value field contains the name field
- let row_1_variant = Variant::new(metadata.value(1), value.value(1));
+ let row_1_variant = variant_from_arrays_at(metadata, value,
1).unwrap();
let Variant::Object(obj) = row_1_variant else {
panic!("Expected object");
};
@@ -2436,7 +2434,7 @@ mod tests {
assert!(session_id_value.is_valid(3)); // type mismatch, stored in
value
assert!(session_id_typed_value.is_null(3));
- let session_id_variant = Variant::new(metadata.value(3),
session_id_value.value(3));
+ let session_id_variant = variant_from_arrays_at(metadata,
session_id_value, 3).unwrap();
assert_eq!(session_id_variant, Variant::from("not-a-uuid"));
// Row 4: Type mismatch - id is int64, not UUID
@@ -2447,7 +2445,7 @@ mod tests {
assert!(id_value.is_valid(4)); // type mismatch, stored in value
assert!(id_typed_value.is_null(4));
- let id_variant = Variant::new(metadata.value(4), id_value.value(4));
+ let id_variant = variant_from_arrays_at(metadata, id_value,
4).unwrap();
assert_eq!(id_variant, Variant::from(12345i64));
assert!(session_id_value.is_null(4));
diff --git a/parquet-variant-compute/src/unshred_variant.rs
b/parquet-variant-compute/src/unshred_variant.rs
index 2df36fa63f..ecffd48bc4 100644
--- a/parquet-variant-compute/src/unshred_variant.rs
+++ b/parquet-variant-compute/src/unshred_variant.rs
@@ -17,11 +17,13 @@
//! Module for unshredding VariantArray by folding typed_value columns back
into the value column.
+use crate::variant_array::binary_array_value;
use crate::{BorrowedShreddingState, VariantArray, VariantValueArrayBuilder};
use arrow::array::{
- Array, AsArray as _, BinaryArray, BinaryViewArray, BooleanArray,
FixedSizeBinaryArray,
- FixedSizeListArray, GenericListArray, GenericListViewArray,
LargeBinaryArray, LargeStringArray,
- ListLikeArray, PrimitiveArray, StringArray, StringViewArray, StructArray,
+ Array, ArrayRef, AsArray as _, BinaryArray, BinaryViewArray, BooleanArray,
+ FixedSizeBinaryArray, FixedSizeListArray, GenericListArray,
GenericListViewArray,
+ LargeBinaryArray, LargeStringArray, ListLikeArray, PrimitiveArray,
StringArray,
+ StringViewArray, StructArray,
};
use arrow::buffer::NullBuffer;
use arrow::datatypes::{
@@ -38,6 +40,7 @@ use parquet_variant::{
VariantDecimal16, VariantDecimalType, VariantMetadata,
};
use std::marker::PhantomData;
+use std::sync::Arc;
use uuid::Uuid;
/// Removes all (nested) typed_value columns from a VariantArray by converting
them back to binary
@@ -73,7 +76,12 @@ pub fn unshred_variant(array: &VariantArray) ->
Result<VariantArray> {
if array.is_null(i) {
value_builder.append_null();
} else {
- let metadata = VariantMetadata::new(metadata.value(i));
+ let metadata_bytes = binary_array_value(metadata.as_ref(),
i).ok_or_else(|| {
+ ArrowError::InvalidArgumentError(
+ "metadata field must be a binary-like array".to_string(),
+ )
+ })?;
+ let metadata = VariantMetadata::new(metadata_bytes);
let mut value_builder = value_builder.builder_ext(&metadata);
row_builder.append_row(&mut value_builder, &metadata, i)?;
}
@@ -82,7 +90,7 @@ pub fn unshred_variant(array: &VariantArray) ->
Result<VariantArray> {
let value = value_builder.build()?;
Ok(VariantArray::from_parts(
metadata.clone(),
- Some(value),
+ Some(Arc::new(value)),
None,
nulls.cloned(),
))
@@ -308,11 +316,11 @@ impl<'a> NullUnshredVariantBuilder<'a> {
/// Builder for arrays that only have value column (already unshredded)
struct ValueOnlyUnshredVariantBuilder<'a> {
- value: &'a arrow::array::BinaryViewArray,
+ value: &'a ArrayRef,
}
impl<'a> ValueOnlyUnshredVariantBuilder<'a> {
- fn new(value: &'a BinaryViewArray) -> Self {
+ fn new(value: &'a ArrayRef) -> Self {
Self { value }
}
@@ -325,7 +333,12 @@ impl<'a> ValueOnlyUnshredVariantBuilder<'a> {
if self.value.is_null(index) {
builder.append_null();
} else {
- let variant = Variant::new_with_metadata(metadata.clone(),
self.value.value(index));
+ let value_bytes = binary_array_value(self.value.as_ref(),
index).ok_or_else(|| {
+ ArrowError::InvalidArgumentError(
+ "value field must be a binary-like array".to_string(),
+ )
+ })?;
+ let variant = Variant::new_with_metadata(metadata.clone(),
value_bytes);
builder.append_value(variant);
}
Ok(())
@@ -347,7 +360,17 @@ trait AppendToVariantBuilder: Array {
macro_rules! handle_unshredded_case {
($self:expr, $builder:expr, $metadata:expr, $index:expr,
$partial_shredding:expr) => {{
let value = $self.value.as_ref().filter(|v| v.is_valid($index));
- let value = value.map(|v|
Variant::new_with_metadata($metadata.clone(), v.value($index)));
+ let value = value
+ .map(|v| {
+ let bytes = binary_array_value(v.as_ref(),
$index).ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!(
+ "value field must be a binary-like array, instead got
{}",
+ v.data_type(),
+ ))
+ })?;
+ Result::Ok(Variant::new_with_metadata($metadata.clone(),
bytes))
+ })
+ .transpose()?;
// If typed_value is null, handle unshredded case and return early
if $self.typed_value.is_null($index) {
@@ -372,12 +395,12 @@ macro_rules! handle_unshredded_case {
/// Generic unshred builder that works with any Array implementing
AppendToVariantBuilder
struct UnshredPrimitiveRowBuilder<'a, T> {
- value: Option<&'a BinaryViewArray>,
+ value: Option<&'a ArrayRef>,
typed_value: &'a T,
}
impl<'a, T: AppendToVariantBuilder> UnshredPrimitiveRowBuilder<'a, T> {
- fn new(value: Option<&'a BinaryViewArray>, typed_value: &'a T) -> Self {
+ fn new(value: Option<&'a ArrayRef>, typed_value: &'a T) -> Self {
Self { value, typed_value }
}
@@ -475,17 +498,13 @@ impl TimestampType for TimestampNanosecondType {
/// Generic builder for timestamp types that handles timezone-aware conversion
struct TimestampUnshredRowBuilder<'a, T: TimestampType> {
- value: Option<&'a BinaryViewArray>,
+ value: Option<&'a ArrayRef>,
typed_value: &'a PrimitiveArray<T>,
has_timezone: bool,
}
impl<'a, T: TimestampType> TimestampUnshredRowBuilder<'a, T> {
- fn new(
- value: Option<&'a BinaryViewArray>,
- typed_value: &'a dyn Array,
- has_timezone: bool,
- ) -> Self {
+ fn new(value: Option<&'a ArrayRef>, typed_value: &'a dyn Array,
has_timezone: bool) -> Self {
Self {
value,
typed_value: typed_value.as_primitive(),
@@ -518,7 +537,7 @@ struct DecimalUnshredRowBuilder<'a, A: DecimalType, V>
where
V: VariantDecimalType<Native = A::Native>,
{
- value: Option<&'a BinaryViewArray>,
+ value: Option<&'a ArrayRef>,
typed_value: &'a PrimitiveArray<A>,
scale: i8,
_phantom: PhantomData<V>,
@@ -528,7 +547,7 @@ impl<'a, A: DecimalType, V> DecimalUnshredRowBuilder<'a, A,
V>
where
V: VariantDecimalType<Native = A::Native>,
{
- fn new(value: Option<&'a BinaryViewArray>, typed_value: &'a dyn Array,
scale: i8) -> Self {
+ fn new(value: Option<&'a ArrayRef>, typed_value: &'a dyn Array, scale: i8)
-> Self {
Self {
value,
typed_value: typed_value.as_primitive(),
@@ -554,13 +573,13 @@ where
/// Builder for unshredding struct/object types with nested fields
struct StructUnshredVariantBuilder<'a> {
- value: Option<&'a arrow::array::BinaryViewArray>,
+ value: Option<&'a ArrayRef>,
typed_value: &'a arrow::array::StructArray,
field_unshredders: IndexMap<&'a str, Option<UnshredVariantRowBuilder<'a>>>,
}
impl<'a> StructUnshredVariantBuilder<'a> {
- fn try_new(value: Option<&'a BinaryViewArray>, typed_value: &'a
StructArray) -> Result<Self> {
+ fn try_new(value: Option<&'a ArrayRef>, typed_value: &'a StructArray) ->
Result<Self> {
// Create unshredders for each field in constructor
let mut field_unshredders = IndexMap::new();
for (field, field_array) in
typed_value.fields().iter().zip(typed_value.columns()) {
@@ -626,13 +645,13 @@ impl<'a> StructUnshredVariantBuilder<'a> {
/// Builder for unshredding list/array types with recursive element processing
struct ListUnshredVariantBuilder<'a, L: ListLikeArray> {
- value: Option<&'a BinaryViewArray>,
+ value: Option<&'a ArrayRef>,
typed_value: &'a L,
element_unshredder: Box<UnshredVariantRowBuilder<'a>>,
}
impl<'a, L: ListLikeArray> ListUnshredVariantBuilder<'a, L> {
- fn try_new(value: Option<&'a BinaryViewArray>, typed_value: &'a L) ->
Result<Self> {
+ fn try_new(value: Option<&'a ArrayRef>, typed_value: &'a L) ->
Result<Self> {
// Create a recursive unshredder for the list elements
// The element type comes from the values array of the list
let element_values = typed_value.values();
@@ -684,16 +703,18 @@ impl<'a, L: ListLikeArray> ListUnshredVariantBuilder<'a,
L> {
mod tests {
use crate::VariantArray;
use arrow::array::{
- BinaryArray, BinaryViewArray, LargeBinaryArray, LargeStringArray,
StringViewArray,
+ ArrayRef, BinaryArray, BinaryViewArray, LargeBinaryArray,
LargeStringArray, StringViewArray,
};
use parquet_variant::Variant;
+ use std::sync::Arc;
#[test]
fn test_unshred_utf8view_typed_value() {
let metadata_bytes: &[u8] = &[0x01, 0x00, 0x00];
- let metadata = BinaryViewArray::from_iter_values(vec![metadata_bytes;
3]);
+ let metadata: ArrayRef =
+ Arc::new(BinaryViewArray::from_iter_values(vec![metadata_bytes;
3]));
- let typed_value: arrow::array::ArrayRef =
std::sync::Arc::new(StringViewArray::from(vec![
+ let typed_value: ArrayRef = Arc::new(StringViewArray::from(vec![
Some("hello"),
Some("middle"),
Some("world"),
@@ -712,14 +733,14 @@ mod tests {
#[test]
fn test_unshred_largeutf8_typed_value() {
let metadata_bytes: &[u8] = &[0x01, 0x00, 0x00];
- let metadata = BinaryViewArray::from_iter_values(vec![metadata_bytes;
3]);
+ let metadata: ArrayRef =
+ Arc::new(BinaryViewArray::from_iter_values(vec![metadata_bytes;
3]));
- let typed_value: arrow::array::ArrayRef =
- std::sync::Arc::new(LargeStringArray::from(vec![
- Some("hello"),
- Some("middle"),
- Some("world"),
- ]));
+ let typed_value: ArrayRef = Arc::new(LargeStringArray::from(vec![
+ Some("hello"),
+ Some("middle"),
+ Some("world"),
+ ]));
let variant_array = VariantArray::from_parts(metadata, None,
Some(typed_value), None);
@@ -734,14 +755,14 @@ mod tests {
#[test]
fn test_unshred_binary_typed_value() {
let metadata_bytes: &[u8] = &[0x01, 0x00, 0x00];
- let metadata = BinaryViewArray::from_iter_values(vec![metadata_bytes;
3]);
+ let metadata: ArrayRef =
+ Arc::new(BinaryViewArray::from_iter_values(vec![metadata_bytes;
3]));
- let typed_value: arrow::array::ArrayRef =
- std::sync::Arc::new(BinaryArray::from_iter_values(vec![
- &b"\x00\x01\x02"[..],
- &b"\xff\xaa"[..],
- &b"\xde\xad\xbe\xef"[..],
- ]));
+ let typed_value: ArrayRef =
Arc::new(BinaryArray::from_iter_values(vec![
+ &b"\x00\x01\x02"[..],
+ &b"\xff\xaa"[..],
+ &b"\xde\xad\xbe\xef"[..],
+ ]));
let variant_array = VariantArray::from_parts(metadata, None,
Some(typed_value), None);
@@ -756,14 +777,14 @@ mod tests {
#[test]
fn test_unshred_largebinary_typed_value() {
let metadata_bytes: &[u8] = &[0x01, 0x00, 0x00];
- let metadata = BinaryViewArray::from_iter_values(vec![metadata_bytes;
3]);
-
- let typed_value: arrow::array::ArrayRef =
- std::sync::Arc::new(LargeBinaryArray::from_iter_values(vec![
- &b"\x00\x01\x02"[..],
- &b"\xff\xaa"[..],
- &b"\xde\xad\xbe\xef"[..],
- ]));
+ let metadata: ArrayRef =
+ Arc::new(BinaryViewArray::from_iter_values(vec![metadata_bytes;
3]));
+
+ let typed_value: ArrayRef =
Arc::new(LargeBinaryArray::from_iter_values(vec![
+ &b"\x00\x01\x02"[..],
+ &b"\xff\xaa"[..],
+ &b"\xde\xad\xbe\xef"[..],
+ ]));
let variant_array = VariantArray::from_parts(metadata, None,
Some(typed_value), None);
diff --git a/parquet-variant-compute/src/variant_array.rs
b/parquet-variant-compute/src/variant_array.rs
index 1aad3145a8..d7229bc478 100644
--- a/parquet-variant-compute/src/variant_array.rs
+++ b/parquet-variant-compute/src/variant_array.rs
@@ -22,7 +22,7 @@ use crate::type_conversion::{
generic_conversion_single_value,
generic_conversion_single_value_with_result,
primitive_conversion_single_value,
};
-use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
+use arrow::array::{Array, ArrayRef, AsArray, StructArray};
use arrow::buffer::NullBuffer;
use arrow::compute::cast;
use arrow::datatypes::{
@@ -41,6 +41,39 @@ use parquet_variant::{
use std::borrow::Cow;
use std::sync::Arc;
+/// Returns the raw bytes at the given index from a binary-like array, return
`None` if the array isn't binary-like.
+pub(crate) fn binary_array_value(array: &dyn Array, index: usize) ->
Option<&[u8]> {
+ match array.data_type() {
+ DataType::Binary => Some(array.as_binary::<i32>().value(index)),
+ DataType::LargeBinary => Some(array.as_binary::<i64>().value(index)),
+ DataType::BinaryView => Some(array.as_binary_view().value(index)),
+ _ => None,
+ }
+}
+
+/// Returns a [`Variant`] from a `metadata` and `value` byte arrays, returns
`None`
+/// if one of them is of invalid type.
+pub(crate) fn variant_from_arrays_at<'m, 'v>(
+ metadata: &'m dyn Array,
+ value: &'v dyn Array,
+ index: usize,
+) -> Option<Variant<'m, 'v>> {
+ let metadata = binary_array_value(metadata, index)?;
+ let value = binary_array_value(value, index)?;
+ Some(Variant::new(metadata, value))
+}
+
+/// Validates that an array has a binary-like data type.
+fn validate_binary_array(array: &dyn Array, field_name: &str) -> Result<()> {
+ match array.data_type() {
+ DataType::Binary | DataType::LargeBinary | DataType::BinaryView =>
Ok(()),
+ _ => Err(ArrowError::InvalidArgumentError(format!(
+ "VariantArray '{field_name}' field must be Binary, LargeBinary, or
BinaryView, got {}",
+ array.data_type()
+ ))),
+ }
+}
+
/// Arrow Variant [`ExtensionType`].
///
/// Represents the canonical Arrow Extension Type for storing variants.
@@ -217,13 +250,13 @@ impl ExtensionType for VariantType {
/// assert_eq!(variant_array.value(0), Variant::from("such wow"));
/// ```
///
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone)]
pub struct VariantArray {
/// Reference to the underlying StructArray
inner: StructArray,
- /// The metadata column of this variant
- metadata: BinaryViewArray,
+ /// The metadata column of this variant (Binary, LargeBinary, or
BinaryView)
+ metadata: ArrayRef,
/// how is this variant array shredded?
shredding_state: ShreddingState,
@@ -256,11 +289,9 @@ impl VariantArray {
/// Dictionary-Encoded, preferably (but not required) with an index type of
/// int8.
///
- /// Currently, only [`BinaryViewArray`] are supported.
pub fn try_new(inner: &dyn Array) -> Result<Self> {
- // Workaround lack of support for Binary
- // https://github.com/apache/arrow-rs/issues/8387
- let inner = cast_to_binary_view_arrays(inner)?;
+ // Canonicalize shredded typed_value fields (e.g. decimal narrowing)
+ let inner = canonicalize_shredded_types(inner)?;
let Some(inner) = inner.as_struct_opt() else {
return Err(ArrowError::InvalidArgumentError(
@@ -270,37 +301,31 @@ impl VariantArray {
// Note the specification allows for any order so we must search by
name
- // Ensure the StructArray has a metadata field of BinaryView
- let Some(metadata_field) = inner.column_by_name("metadata") else {
+ // Ensure the StructArray has a metadata field that is a binary type
+ let Some(metadata_col) = inner.column_by_name("metadata") else {
return Err(ArrowError::InvalidArgumentError(
"Invalid VariantArray: StructArray must contain a 'metadata'
field".to_string(),
));
};
- let Some(metadata) = metadata_field.as_binary_view_opt() else {
- return Err(ArrowError::NotYetImplemented(format!(
- "VariantArray 'metadata' field must be BinaryView, got {}",
- metadata_field.data_type()
- )));
- };
+ validate_binary_array(metadata_col.as_ref(), "metadata")?;
// Note these clones are cheap, they just bump the ref count
Ok(Self {
inner: inner.clone(),
- metadata: metadata.clone(),
+ metadata: metadata_col.clone(),
shredding_state: ShreddingState::try_from(inner)?,
})
}
pub(crate) fn from_parts(
- metadata: BinaryViewArray,
- value: Option<BinaryViewArray>,
+ metadata: ArrayRef,
+ value: Option<ArrayRef>,
typed_value: Option<ArrayRef>,
nulls: Option<NullBuffer>,
) -> Self {
- let mut builder =
- StructArrayBuilder::new().with_field("metadata",
Arc::new(metadata.clone()), false);
+ let mut builder = StructArrayBuilder::new().with_field("metadata",
metadata.clone(), false);
if let Some(value) = value.clone() {
- builder = builder.with_field("value", Arc::new(value), true);
+ builder = builder.with_field("value", value, true);
}
if let Some(typed_value) = typed_value.clone() {
builder = builder.with_field("typed_value", typed_value, true);
@@ -378,9 +403,18 @@ impl VariantArray {
typed_value_to_variant(typed_value, value, index)
}
// Otherwise fall back to value, if available
- (_, Some(value)) if value.is_valid(index) => {
- Ok(Variant::new(self.metadata.value(index),
value.value(index)))
- }
+ (_, Some(value)) if value.is_valid(index) =>
variant_from_arrays_at(
+ &self.metadata,
+ value,
+ index,
+ )
+ .ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!(
+ "metadata and value fields must be binary-like arrays,
instead got {} and {}",
+ self.metadata.data_type(),
+ value.data_type()
+ ))
+ }),
// It is technically invalid for neither value nor typed_value
fields to be available,
// but the spec specifically requires readers to return
Variant::Null in this case.
_ => Ok(Variant::Null),
@@ -388,12 +422,12 @@ impl VariantArray {
}
/// Return a reference to the metadata field of the [`StructArray`]
- pub fn metadata_field(&self) -> &BinaryViewArray {
+ pub fn metadata_field(&self) -> &ArrayRef {
&self.metadata
}
/// Return a reference to the value field of the `StructArray`
- pub fn value_field(&self) -> Option<&BinaryViewArray> {
+ pub fn value_field(&self) -> Option<&ArrayRef> {
self.shredding_state.value_field()
}
@@ -457,6 +491,12 @@ impl VariantArray {
}
}
+impl PartialEq for VariantArray {
+ fn eq(&self, other: &Self) -> bool {
+ self.inner == other.inner
+ }
+}
+
impl From<VariantArray> for StructArray {
fn from(variant_array: VariantArray) -> Self {
variant_array.into_inner()
@@ -630,7 +670,6 @@ impl ShreddedVariantFieldArray {
/// 2. An optional field named `typed_value` which can be any primitive
type
/// or be a list, large_list, list_view or struct
///
- /// Currently, only `value` columns of type [`BinaryViewArray`] are
supported.
pub fn try_new(inner: &dyn Array) -> Result<Self> {
let Some(inner_struct) = inner.as_struct_opt() else {
return Err(ArrowError::InvalidArgumentError(
@@ -651,7 +690,7 @@ impl ShreddedVariantFieldArray {
}
/// Return a reference to the value field of the `StructArray`
- pub fn value_field(&self) -> Option<&BinaryViewArray> {
+ pub fn value_field(&self) -> Option<&ArrayRef> {
self.shredding_state.value_field()
}
@@ -666,13 +705,13 @@ impl ShreddedVariantFieldArray {
}
pub(crate) fn from_parts(
- value: Option<BinaryViewArray>,
+ value: Option<ArrayRef>,
typed_value: Option<ArrayRef>,
nulls: Option<NullBuffer>,
) -> Self {
let mut builder = StructArrayBuilder::new();
if let Some(value) = value.clone() {
- builder = builder.with_field("value", Arc::new(value), true);
+ builder = builder.with_field("value", value, true);
}
if let Some(typed_value) = typed_value.clone() {
builder = builder.with_field("typed_value", typed_value, true);
@@ -770,9 +809,9 @@ impl From<ShreddedVariantFieldArray> for StructArray {
/// (partial shredding).
///
/// [Parquet Variant Shredding Spec]:
https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone)]
pub struct ShreddingState {
- value: Option<BinaryViewArray>,
+ value: Option<ArrayRef>,
typed_value: Option<ArrayRef>,
}
@@ -791,12 +830,12 @@ impl ShreddingState {
/// let struct_array: StructArray = get_struct_array();
/// let shredding_state = ShreddingState::try_from(&struct_array).unwrap();
/// ```
- pub fn new(value: Option<BinaryViewArray>, typed_value: Option<ArrayRef>)
-> Self {
+ pub fn new(value: Option<ArrayRef>, typed_value: Option<ArrayRef>) -> Self
{
Self { value, typed_value }
}
/// Return a reference to the value field, if present
- pub fn value_field(&self) -> Option<&BinaryViewArray> {
+ pub fn value_field(&self) -> Option<&ArrayRef> {
self.value.as_ref()
}
@@ -826,7 +865,7 @@ impl ShreddingState {
/// for avoiding clone operations when the caller does not need a
self-standing shredding state.
#[derive(Clone, Debug)]
pub struct BorrowedShreddingState<'a> {
- value: Option<&'a BinaryViewArray>,
+ value: Option<&'a ArrayRef>,
typed_value: Option<&'a ArrayRef>,
}
@@ -845,12 +884,12 @@ impl<'a> BorrowedShreddingState<'a> {
/// let struct_array: StructArray = get_struct_array();
/// let shredding_state =
BorrowedShreddingState::try_from(&struct_array).unwrap();
/// ```
- pub fn new(value: Option<&'a BinaryViewArray>, typed_value: Option<&'a
ArrayRef>) -> Self {
+ pub fn new(value: Option<&'a ArrayRef>, typed_value: Option<&'a ArrayRef>)
-> Self {
Self { value, typed_value }
}
/// Return a reference to the value field, if present
- pub fn value_field(&self) -> Option<&'a BinaryViewArray> {
+ pub fn value_field(&self) -> Option<&'a ArrayRef> {
self.value
}
@@ -864,15 +903,10 @@ impl<'a> TryFrom<&'a StructArray> for
BorrowedShreddingState<'a> {
type Error = ArrowError;
fn try_from(inner_struct: &'a StructArray) -> Result<Self> {
- // The `value` column need not exist, but if it does it must be a
binary view.
+ // The `value` column need not exist, but if it does it must be a
binary type.
let value = if let Some(value_col) =
inner_struct.column_by_name("value") {
- let Some(binary_view) = value_col.as_binary_view_opt() else {
- return Err(ArrowError::NotYetImplemented(format!(
- "VariantArray 'value' field must be BinaryView, got {}",
- value_col.data_type()
- )));
- };
- Some(binary_view)
+ validate_binary_array(value_col.as_ref(), "value")?;
+ Some(value_col)
} else {
None
};
@@ -940,7 +974,7 @@ impl StructArrayBuilder {
/// returns the non-null element at index as a Variant
fn typed_value_to_variant<'a>(
typed_value: &'a ArrayRef,
- value: Option<&BinaryViewArray>,
+ value: Option<&'a ArrayRef>,
index: usize,
) -> Result<Variant<'a, 'a>> {
let data_type = typed_value.data_type();
@@ -961,6 +995,16 @@ fn typed_value_to_variant<'a>(
let value = array.value(index);
Ok(Uuid::from_slice(value).unwrap().into()) // unwrap is safe:
slice is always 16 bytes
}
+ DataType::Binary => {
+ let array = typed_value.as_binary::<i32>();
+ let value = array.value(index);
+ Ok(Variant::from(value))
+ }
+ DataType::LargeBinary => {
+ let array = typed_value.as_binary::<i64>();
+ let value = array.value(index);
+ Ok(Variant::from(value))
+ }
DataType::BinaryView => {
let array = typed_value.as_binary_view();
let value = array.value(index);
@@ -1103,17 +1147,9 @@ fn typed_value_to_variant<'a>(
}
}
-/// Workaround for lack of direct support for BinaryArray
-/// <https://github.com/apache/arrow-rs/issues/8387>
-///
-/// The values are read as
-/// * `StructArray<metadata: Binary, value: Binary>`
-///
-/// but VariantArray needs them as
-/// * `StructArray<metadata: BinaryView, value: BinaryView>`
-///
-/// So cast them to get the right type.
-fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef> {
+/// Canonicalize shredded typed_value fields (e.g. decimal narrowing) and
+/// verify that all data types in the struct are legal for a variant array.
+fn canonicalize_shredded_types(array: &dyn Array) -> Result<ArrayRef> {
let new_type = canonicalize_and_verify_data_type(array.data_type())?;
if let Cow::Borrowed(_) = new_type {
if let Some(array) = array.as_struct_opt() {
@@ -1124,8 +1160,8 @@ fn cast_to_binary_view_arrays(array: &dyn Array) ->
Result<ArrayRef> {
}
/// Recursively visits a data type, ensuring that it only contains data types
that can legally
-/// appear in a (possibly shredded) variant array. It also replaces Binary
fields with BinaryView,
-/// since that's what comes back from the parquet reader and what the variant
code expects to find.
+/// appear in a (possibly shredded) variant array. It also narrows decimal
types to the smallest
+/// valid precision (e.g. Decimal128 -> Decimal32 when the precision fits).
fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result<Cow<'_,
DataType>> {
use DataType::*;
@@ -1176,10 +1212,8 @@ fn canonicalize_and_verify_data_type(data_type:
&DataType) -> Result<Cow<'_, Dat
Date32 | Time64(TimeUnit::Microsecond) => borrow!(),
Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(),
- // Binary and string are allowed. Force Binary/LargeBinary to
BinaryView because that's what the parquet
- // reader returns and what the rest of the variant code expects.
- Binary | LargeBinary => Cow::Owned(BinaryView),
- BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(),
+ // Binary, string, and their view counterparts are allowed.
+ Binary | LargeBinary | BinaryView | Utf8 | LargeUtf8 | Utf8View =>
borrow!(),
// UUID maps to 16-byte fixed-size binary; no other width is allowed
FixedSizeBinary(16) => borrow!(),
@@ -1246,8 +1280,9 @@ mod test {
use super::*;
use arrow::array::{
- BinaryViewArray, Decimal32Array, Decimal64Array, Decimal128Array,
Int32Array, Int64Array,
- LargeListArray, LargeListViewArray, ListArray, ListViewArray,
Time64MicrosecondArray,
+ BinaryArray, BinaryViewArray, Decimal32Array, Decimal64Array,
Decimal128Array, Int32Array,
+ Int64Array, LargeBinaryArray, LargeListArray, LargeListViewArray,
ListArray, ListViewArray,
+ Time64MicrosecondArray,
};
use arrow::buffer::{OffsetBuffer, ScalarBuffer};
use arrow_schema::{Field, Fields};
@@ -1317,7 +1352,7 @@ mod test {
let err = VariantArray::try_new(&array);
assert_eq!(
err.unwrap_err().to_string(),
- "Not yet implemented: VariantArray 'metadata' field must be
BinaryView, got Int32"
+ "Invalid argument error: VariantArray 'metadata' field must be
Binary, LargeBinary, or BinaryView, got Int32"
);
}
@@ -1325,7 +1360,7 @@ mod test {
fn invalid_value_field_type() {
let fields = Fields::from(vec![
Field::new("metadata", DataType::BinaryView, true),
- Field::new("value", DataType::Int32, true), // Not yet supported
+ Field::new("value", DataType::Int32, true),
]);
let array = StructArray::new(
fields,
@@ -1335,7 +1370,7 @@ mod test {
let err = VariantArray::try_new(&array);
assert_eq!(
err.unwrap_err().to_string(),
- "Not yet implemented: VariantArray 'value' field must be
BinaryView, got Int32"
+ "Invalid argument error: VariantArray 'value' field must be
Binary, LargeBinary, or BinaryView, got Int32"
);
}
@@ -1449,27 +1484,28 @@ mod test {
// use Parquet LIST encoding, but those fixtures do not cover
Arrow-specific list container
// variants (`LargeList`, `ListView`, `LargeListView`) accepted by
`VariantArray::try_new`.
let make_item_binary = || Arc::new(Field::new("item",
DataType::Binary, true));
+ let make_large_binary = || Arc::new(Field::new("item",
DataType::LargeBinary, true));
let make_item_binary_view = || Arc::new(Field::new("item",
DataType::BinaryView, true));
let cases = vec![
- (
- DataType::LargeList(make_item_binary()),
- DataType::LargeList(make_item_binary_view()),
- ),
- (
- DataType::ListView(make_item_binary()),
- DataType::ListView(make_item_binary_view()),
- ),
- (
- DataType::LargeListView(make_item_binary()),
- DataType::LargeListView(make_item_binary_view()),
- ),
+ // Binary item
+ DataType::LargeList(make_item_binary()),
+ DataType::ListView(make_item_binary()),
+ DataType::LargeListView(make_item_binary()),
+ // Large binary item
+ DataType::LargeList(make_large_binary()),
+ DataType::ListView(make_large_binary()),
+ DataType::LargeListView(make_large_binary()),
+ // Binary view item
+ DataType::LargeList(make_item_binary_view()),
+ DataType::ListView(make_item_binary_view()),
+ DataType::LargeListView(make_item_binary_view()),
];
- for (input, expected) in cases {
+ for input in cases {
assert_eq!(
canonicalize_and_verify_data_type(&input).unwrap().as_ref(),
- &expected
+ &input
);
}
}
@@ -1670,6 +1706,40 @@ mod test {
}
}
+ #[test]
+ fn binary_typed_value_roundtrips() {
+ // Verify that a shredded variant with Binary typed_value can be read
back
+ let metadata: ArrayRef = Arc::new(BinaryViewArray::from_iter_values([
+ EMPTY_VARIANT_METADATA_BYTES,
+ ]));
+ let typed_value: ArrayRef = Arc::new(BinaryArray::from(vec![b"hello"
as &[u8]]));
+
+ let struct_array = StructArrayBuilder::new()
+ .with_field("metadata", metadata, false)
+ .with_field("typed_value", typed_value, true)
+ .build();
+
+ let variant_array = VariantArray::try_new(&struct_array).unwrap();
+ assert_eq!(variant_array.value(0), Variant::from(b"hello" as &[u8]));
+ }
+
+ #[test]
+ fn large_binary_typed_value_roundtrips() {
+ // Verify that a shredded variant with LargeBinary typed_value can be
read back
+ let metadata: ArrayRef = Arc::new(BinaryViewArray::from_iter_values([
+ EMPTY_VARIANT_METADATA_BYTES,
+ ]));
+ let typed_value: ArrayRef =
Arc::new(LargeBinaryArray::from(vec![b"world" as &[u8]]));
+
+ let struct_array = StructArrayBuilder::new()
+ .with_field("metadata", metadata, false)
+ .with_field("typed_value", typed_value, true)
+ .build();
+
+ let variant_array = VariantArray::try_new(&struct_array).unwrap();
+ assert_eq!(variant_array.value(0), Variant::from(b"world" as &[u8]));
+ }
+
macro_rules! invalid_variant_array_test {
($fn_name: ident, $invalid_typed_value: expr, $error_msg: literal) => {
#[test]
diff --git a/parquet-variant-compute/src/variant_array_builder.rs
b/parquet-variant-compute/src/variant_array_builder.rs
index f6b95c881a..e669277f1d 100644
--- a/parquet-variant-compute/src/variant_array_builder.rs
+++ b/parquet-variant-compute/src/variant_array_builder.rs
@@ -639,7 +639,7 @@ mod test {
let array2 = VariantArray::from_parts(
array.metadata_field().clone(),
- Some(value_builder.build().unwrap()),
+ Some(Arc::new(value_builder.build().unwrap())),
None,
None,
);
diff --git a/parquet-variant-compute/src/variant_get.rs
b/parquet-variant-compute/src/variant_get.rs
index 83df6879aa..1b4b3354e2 100644
--- a/parquet-variant-compute/src/variant_get.rs
+++ b/parquet-variant-compute/src/variant_get.rs
@@ -15,7 +15,8 @@
// specific language governing permissions and limitations
// under the License.
use arrow::{
- array::{self, Array, ArrayRef, BinaryViewArray, StructArray},
+ array::{self, Array, ArrayRef, StructArray},
+ buffer::NullBuffer,
compute::CastOptions,
datatypes::Field,
error::Result,
@@ -114,9 +115,9 @@ fn shredded_get_path(
// Helper that creates a new VariantArray from the given nested value and
typed_value columns,
// properly accounting for accumulated nulls from path traversal
let make_target_variant =
- |value: Option<BinaryViewArray>,
+ |value: Option<ArrayRef>,
typed_value: Option<ArrayRef>,
- accumulated_nulls: Option<arrow::buffer::NullBuffer>| {
+ accumulated_nulls: Option<NullBuffer>| {
let metadata = input.metadata_field().clone();
VariantArray::from_parts(metadata, value, typed_value,
accumulated_nulls)
};
@@ -161,10 +162,8 @@ fn shredded_get_path(
ShreddedPathStep::Success(state) => {
// Union nulls from the typed_value we just accessed
if let Some(typed_value) = shredding_state.typed_value_field()
{
- accumulated_nulls = arrow::buffer::NullBuffer::union(
- accumulated_nulls.as_ref(),
- typed_value.nulls(),
- );
+ accumulated_nulls =
+ NullBuffer::union(accumulated_nulls.as_ref(),
typed_value.nulls());
}
shredding_state = state;
path_index += 1;
@@ -251,6 +250,7 @@ fn try_perfect_shredding(variant_array: &VariantArray,
as_field: &Field) -> Opti
return None;
}
let typed_value = variant_array.typed_value_field()?;
+
if typed_value.data_type() == as_field.data_type()
&& variant_array
.value_field()
@@ -264,6 +264,7 @@ fn try_perfect_shredding(variant_array: &VariantArray,
as_field: &Field) -> Opti
// so we can just return the typed value.
return Some(typed_value.clone());
}
+
None
}
@@ -1048,7 +1049,7 @@ mod test {
EMPTY_VARIANT_METADATA_BYTES,
typed_value.len(),
));
- VariantArray::from_parts(metadata, None, Some(typed_value),
None).into()
+ VariantArray::from_parts(Arc::new(metadata), None,
Some(typed_value), None).into()
}
};
}
@@ -1716,7 +1717,12 @@ mod test {
let metadata =
BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES,
3));
- ArrayRef::from(VariantArray::from_parts(metadata, None, None,
Some(nulls)))
+ ArrayRef::from(VariantArray::from_parts(
+ Arc::new(metadata),
+ None,
+ None,
+ Some(nulls),
+ ))
}
/// This test manually constructs a shredded variant array representing
objects
/// like {"x": 1, "y": "foo"} and {"x": 42} and tests extracting the "x"
field
@@ -1822,8 +1828,8 @@ mod test {
// Create the main VariantArray
ArrayRef::from(VariantArray::from_parts(
- metadata_array,
- Some(value_array),
+ Arc::new(metadata_array),
+ Some(Arc::new(value_array)),
Some(Arc::new(typed_value_struct)),
None,
))
@@ -2199,8 +2205,8 @@ mod test {
// Build final VariantArray
ArrayRef::from(VariantArray::from_parts(
- metadata_array,
- Some(value_array),
+ Arc::new(metadata_array),
+ Some(Arc::new(value_array)),
Some(Arc::new(typed_value_struct)),
None,
))
@@ -2290,7 +2296,7 @@ mod test {
.unwrap(),
) as ArrayRef;
let a_field_shredded = ShreddedVariantFieldArray::from_parts(
- Some(a_value_array),
+ Some(Arc::new(a_value_array)),
Some(a_inner_typed_value),
None,
);
@@ -2310,8 +2316,8 @@ mod test {
// Build final VariantArray
ArrayRef::from(VariantArray::from_parts(
- metadata_array,
- Some(value_array),
+ Arc::new(metadata_array),
+ Some(Arc::new(value_array)),
Some(Arc::new(typed_value_struct)),
None,
))
@@ -2392,7 +2398,7 @@ mod test {
.unwrap(),
) as ArrayRef;
let b_field_shredded = ShreddedVariantFieldArray::from_parts(
- Some(b_value_array),
+ Some(Arc::new(b_value_array)),
Some(b_inner_typed_value),
None,
);
@@ -2421,7 +2427,7 @@ mod test {
.unwrap(),
) as ArrayRef;
let a_field_shredded = ShreddedVariantFieldArray::from_parts(
- Some(a_value_array),
+ Some(Arc::new(a_value_array)),
Some(a_inner_typed_value),
None,
);
@@ -2441,8 +2447,8 @@ mod test {
// Build final VariantArray
ArrayRef::from(VariantArray::from_parts(
- metadata_array,
- Some(value_array),
+ Arc::new(metadata_array),
+ Some(Arc::new(value_array)),
Some(Arc::new(typed_value_struct)),
None,
))
@@ -3208,7 +3214,7 @@ mod test {
// Build final VariantArray with top-level nulls
ArrayRef::from(VariantArray::from_parts(
- metadata_array,
+ Arc::new(metadata_array),
None,
Some(Arc::new(typed_value_struct)),
Some(nulls),
@@ -3267,7 +3273,7 @@ mod test {
false, // row 3: top-level NULL
]);
ArrayRef::from(VariantArray::from_parts(
- metadata_array,
+ Arc::new(metadata_array),
None,
Some(Arc::new(typed_value)),
Some(nulls),
@@ -3336,8 +3342,8 @@ mod test {
// Top-level null is encoded in the main StructArray's null mask
let variant_nulls = NullBuffer::from(vec![true, true, true, false]);
// Row 3 is top-level null
ArrayRef::from(VariantArray::from_parts(
- metadata_array,
- Some(value_array),
+ Arc::new(metadata_array),
+ Some(Arc::new(value_array)),
Some(Arc::new(typed_value_struct)),
Some(variant_nulls),
))
@@ -4014,9 +4020,13 @@ mod test {
EMPTY_VARIANT_METADATA_BYTES,
all_nulls_values.len(),
));
- let variant_array: ArrayRef =
- VariantArray::from_parts(metadata, None,
Some(Arc::new(typed_value_struct)), None)
- .into();
+ let variant_array: ArrayRef = VariantArray::from_parts(
+ Arc::new(metadata),
+ None,
+ Some(Arc::new(typed_value_struct)),
+ None,
+ )
+ .into();
// Case 1: all-null primitive column should reuse the typed_value Arc
directly
let all_nulls_field_ref = FieldRef::from(Field::new("result",
DataType::Int32, true));
diff --git a/parquet-variant-compute/src/variant_to_arrow.rs
b/parquet-variant-compute/src/variant_to_arrow.rs
index dd054a5f7d..6d1626640c 100644
--- a/parquet-variant-compute/src/variant_to_arrow.rs
+++ b/parquet-variant-compute/src/variant_to_arrow.rs
@@ -26,11 +26,10 @@ use crate::type_conversion::{
use crate::variant_array::ShreddedVariantFieldArray;
use crate::{VariantArray, VariantValueArrayBuilder};
use arrow::array::{
- ArrayRef, ArrowNativeTypeOp, BinaryBuilder, BinaryLikeArrayBuilder,
BinaryViewArray,
- BinaryViewBuilder, BooleanBuilder, FixedSizeBinaryBuilder,
GenericListArray,
- GenericListViewArray, LargeBinaryBuilder, LargeStringBuilder, NullArray,
NullBufferBuilder,
- OffsetSizeTrait, PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder,
StringViewBuilder,
- StructArray,
+ ArrayRef, ArrowNativeTypeOp, BinaryBuilder, BinaryLikeArrayBuilder,
BinaryViewBuilder,
+ BooleanBuilder, FixedSizeBinaryBuilder, GenericListArray,
GenericListViewArray,
+ LargeBinaryBuilder, LargeStringBuilder, NullArray, NullBufferBuilder,
OffsetSizeTrait,
+ PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder,
StringViewBuilder, StructArray,
};
use arrow::buffer::{OffsetBuffer, ScalarBuffer};
use arrow::compute::{CastOptions, DecimalCast};
@@ -119,7 +118,7 @@ fn make_typed_variant_to_arrow_row_builder<'a>(
}
pub(crate) fn make_variant_to_arrow_row_builder<'a>(
- metadata: &BinaryViewArray,
+ metadata: &ArrayRef,
path: VariantPath<'a>,
data_type: Option<&'a DataType>,
cast_options: &'a CastOptions,
@@ -924,7 +923,7 @@ impl<'a> ListElementBuilder<'a> {
Self::Shredded(b) => {
let (value, typed_value, nulls) = b.finish()?;
Ok(ArrayRef::from(ShreddedVariantFieldArray::from_parts(
- Some(value),
+ Some(Arc::new(value)),
Some(typed_value),
nulls,
)))
@@ -1052,13 +1051,13 @@ where
/// Builder for creating VariantArray output (for path extraction without type
conversion)
pub(crate) struct VariantToBinaryVariantArrowRowBuilder {
- metadata: BinaryViewArray,
+ metadata: ArrayRef,
builder: VariantValueArrayBuilder,
nulls: NullBufferBuilder,
}
impl VariantToBinaryVariantArrowRowBuilder {
- fn new(metadata: BinaryViewArray, capacity: usize) -> Self {
+ fn new(metadata: ArrayRef, capacity: usize) -> Self {
Self {
metadata,
builder: VariantValueArrayBuilder::new(capacity),
@@ -1083,7 +1082,7 @@ impl VariantToBinaryVariantArrowRowBuilder {
fn finish(mut self) -> Result<ArrayRef> {
let variant_array = VariantArray::from_parts(
self.metadata,
- Some(self.builder.build()?),
+ Some(Arc::new(self.builder.build()?)),
None, // no typed_value column
self.nulls.finish(),
);