scovich commented on code in PR #8673:
URL: https://github.com/apache/arrow-rs/pull/8673#discussion_r2543751339
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -733,6 +736,91 @@ impl From<ShreddedVariantFieldArray> for StructArray {
}
}
+/// A typed array reference that pairs an [`ArrayRef`] with its [`Field`]
metadata.
+///
+/// This struct is used to represent the `typed_value` field in shredded
variant arrays,
+/// where we need to preserve both the array data and its field metadata (such
as field
+/// name, data type, nullability, and extension type information).
+///
+/// The separation of array data and field metadata allows for proper handling
of:
+/// - Field names when working with struct fields
+/// - Nullability information for proper null handling
+/// - Extension type metadata (e.g., UUID extension on FixedSizeBinary)
+/// - Data type information for casting and validation
+#[derive(Debug, Clone)]
+pub struct TypedArrayRef {
+ inner: ArrayRef,
+ field: FieldRef,
+}
+
+impl TypedArrayRef {
+ pub fn inner(&self) -> &ArrayRef {
+ &self.inner
+ }
+
+ pub fn into_inner(self) -> ArrayRef {
+ self.inner
+ }
+
+ pub fn field(&self) -> &FieldRef {
+ &self.field
+ }
+
+ // note: these methods below make me want to impl Array for
TypedArrayRef...
+ pub fn slice(&self, offset: usize, length: usize) -> Self {
Review Comment:
I don't think we could actually leverage `Array::slice`, because it returns
`Arc<dyn Array>`, which cannot be converted to `TypedArrayRef`?
But we could potentially `impl Deref for TypedArrayRef` with `Target = dyn
Array`, to cover e.g. `is_valid` and the various array-casting calls in the
shredding code?
##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -59,21 +59,26 @@ pub(crate) fn follow_shredded_path_element<'a>(
None => ShreddedPathStep::Missing,
};
- let Some(typed_value) = shredding_state.typed_value_field() else {
+ let Some(BorrowedTypedArrayRef {
+ inner: typed_value_array,
+ field: _typed_value_field,
+ }) = shredding_state.typed_value_field()
+ else {
Review Comment:
Again, `Deref` seems like a less invasive option here?
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -797,15 +885,17 @@ impl ShreddingState {
}
/// Return a reference to the typed_value field, if present
- pub fn typed_value_field(&self) -> Option<&ArrayRef> {
+ pub fn typed_value_field(&self) -> Option<&TypedArrayRef> {
self.typed_value.as_ref()
}
/// Returns a borrowed version of this shredding state
pub fn borrow(&self) -> BorrowedShreddingState<'_> {
BorrowedShreddingState {
value: self.value_field(),
- typed_value: self.typed_value_field(),
+ typed_value: self
+ .typed_value_field()
+ .map(|TypedArrayRef { inner, field }| BorrowedTypedArrayRef {
inner, field }),
Review Comment:
Does this merit an `impl From`?
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -935,75 +1063,89 @@ impl StructArrayBuilder {
/// returns the non-null element at index as a Variant
fn typed_value_to_variant<'a>(
- typed_value: &'a ArrayRef,
+ typed_value: &'a TypedArrayRef,
value: Option<&BinaryViewArray>,
index: usize,
) -> Result<Variant<'a, 'a>> {
- let data_type = typed_value.data_type();
+ let TypedArrayRef {
+ inner: typed_value_array,
+ field: typed_value_field,
+ } = typed_value;
+
+ let data_type = typed_value_array.data_type();
Review Comment:
Instead of all this churn, an `impl Deref` could make it transparent?
(See the other comment.)
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -935,75 +1063,89 @@ impl StructArrayBuilder {
/// returns the non-null element at index as a Variant
fn typed_value_to_variant<'a>(
- typed_value: &'a ArrayRef,
+ typed_value: &'a TypedArrayRef,
value: Option<&BinaryViewArray>,
index: usize,
) -> Result<Variant<'a, 'a>> {
- let data_type = typed_value.data_type();
+ let TypedArrayRef {
+ inner: typed_value_array,
+ field: typed_value_field,
+ } = typed_value;
+
+ let data_type = typed_value_array.data_type();
if value.is_some_and(|v| !matches!(data_type, DataType::Struct(_)) &&
v.is_valid(index)) {
// Only a partially shredded struct is allowed to have values for both
columns
panic!("Invalid variant, conflicting value and typed_value");
}
match data_type {
DataType::Null => Ok(Variant::Null),
DataType::Boolean => {
- let boolean_array = typed_value.as_boolean();
+ let boolean_array = typed_value_array.as_boolean();
let value = boolean_array.value(index);
Ok(Variant::from(value))
}
        // 16-byte FixedSizeBinary always corresponds to a UUID; all other
sizes are illegal.
DataType::FixedSizeBinary(16) => {
- let array = typed_value.as_fixed_size_binary();
+ if !matches!(
+ typed_value_field.try_extension_type(),
+ Ok(arrow_schema::extension::Uuid)
Review Comment:
Aside: it seems like we had a TODO somewhere about checking for an extension
type without having to actually allocate it? Or is this check super cheap and
we don't care?
##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -3145,23 +3150,23 @@ mod test {
// For top-level null row, the field still needs valid content (not
null)
let x_field_typed_value = Int32Array::from(vec![Some(1), Some(2),
Some(3), Some(0)]);
let x_field_struct = StructArrayBuilder::new()
- .with_field("typed_value", Arc::new(x_field_typed_value), true)
+ .with_column_name("typed_value", Arc::new(x_field_typed_value),
true)
.build();
let x_field_shredded =
ShreddedVariantFieldArray::try_new(&x_field_struct)
.expect("should create ShreddedVariantFieldArray for x");
// Create main typed_value struct (only contains shredded fields)
let typed_value_struct = StructArrayBuilder::new()
- .with_field("x", ArrayRef::from(x_field_shredded), false)
+ .with_column_name("x", ArrayRef::from(x_field_shredded), false)
.build();
// Build VariantArray with both value and typed_value
(PartiallyShredded)
// Top-level null is encoded in the main StructArray's null mask
let variant_nulls = NullBuffer::from(vec![true, true, true, false]);
// Row 3 is top-level null
let struct_array = StructArrayBuilder::new()
- .with_field("metadata", Arc::new(metadata_array), false)
- .with_field("value", Arc::new(value_array), true)
- .with_field("typed_value", Arc::new(typed_value_struct), true)
+ .with_column_name("metadata", Arc::new(metadata_array), false)
+ .with_column_name("value", Arc::new(value_array), true)
+ .with_column_name("typed_value", Arc::new(typed_value_struct),
true)
Review Comment:
Is it actually bad to keep the original `with_field` name, given that its
signature nearly matches
[Field::new](https://docs.rs/arrow/latest/arrow/datatypes/struct.Field.html#method.new)?
The new method could then keep its (also perfectly descriptive) name
`with_field_ref`.
##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -3145,23 +3150,23 @@ mod test {
// For top-level null row, the field still needs valid content (not
null)
let x_field_typed_value = Int32Array::from(vec![Some(1), Some(2),
Some(3), Some(0)]);
let x_field_struct = StructArrayBuilder::new()
- .with_field("typed_value", Arc::new(x_field_typed_value), true)
+ .with_column_name("typed_value", Arc::new(x_field_typed_value),
true)
.build();
let x_field_shredded =
ShreddedVariantFieldArray::try_new(&x_field_struct)
.expect("should create ShreddedVariantFieldArray for x");
// Create main typed_value struct (only contains shredded fields)
let typed_value_struct = StructArrayBuilder::new()
- .with_field("x", ArrayRef::from(x_field_shredded), false)
+ .with_column_name("x", ArrayRef::from(x_field_shredded), false)
.build();
// Build VariantArray with both value and typed_value
(PartiallyShredded)
// Top-level null is encoded in the main StructArray's null mask
let variant_nulls = NullBuffer::from(vec![true, true, true, false]);
// Row 3 is top-level null
let struct_array = StructArrayBuilder::new()
- .with_field("metadata", Arc::new(metadata_array), false)
- .with_field("value", Arc::new(value_array), true)
- .with_field("typed_value", Arc::new(typed_value_struct), true)
+ .with_column_name("metadata", Arc::new(metadata_array), false)
+ .with_column_name("value", Arc::new(value_array), true)
+ .with_column_name("typed_value", Arc::new(typed_value_struct),
true)
Review Comment:
There's a fair bit of churn from this rename, and I'm not certain the new
name is accurate, given that the method specifies more than just the column name?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]