scovich commented on code in PR #8392:
URL: https://github.com/apache/arrow-rs/pull/8392#discussion_r2364904431
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -368,59 +585,54 @@ impl ShreddedVariantFieldArray {
shredding_state: ShreddingState::new(value, typed_value),
}
}
-}
-impl Array for ShreddedVariantFieldArray {
- fn as_any(&self) -> &dyn Any {
- self
- }
-
- fn to_data(&self) -> ArrayData {
- self.inner.to_data()
- }
-
- fn into_data(self) -> ArrayData {
- self.inner.into_data()
+ /// Returns the inner [`StructArray`], consuming self
+ pub fn into_inner(self) -> StructArray {
+ self.inner
}
- fn data_type(&self) -> &DataType {
+ pub fn data_type(&self) -> &DataType {
self.inner.data_type()
}
- fn slice(&self, offset: usize, length: usize) -> ArrayRef {
- let inner = self.inner.slice(offset, length);
- let shredding_state = self.shredding_state.slice(offset, length);
- Arc::new(Self {
- inner,
- shredding_state,
- })
- }
-
- fn len(&self) -> usize {
+ pub fn len(&self) -> usize {
self.inner.len()
}
- fn is_empty(&self) -> bool {
+ pub fn is_empty(&self) -> bool {
self.inner.is_empty()
}
- fn offset(&self) -> usize {
+ pub fn offset(&self) -> usize {
self.inner.offset()
}
- fn nulls(&self) -> Option<&NullBuffer> {
+ pub fn nulls(&self) -> Option<&NullBuffer> {
// According to the shredding spec, ShreddedVariantFieldArray should be
// physically non-nullable - SQL NULL is inferred by both value and
// typed_value being physically NULL
None
}
+ /// Is the element at index null?
+ pub fn is_null(&self, index: usize) -> bool {
+ self.nulls().map(|n| n.is_null(index)).unwrap_or_default()
Review Comment:
Isn't this just
```suggestion
self.nulls().map_or(false, |n| n.is_null(index))
```
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -18,36 +18,194 @@
//! [`VariantArray`] implementation
use crate::type_conversion::primitive_conversion_single_value;
-use arrow::array::{Array, ArrayData, ArrayRef, AsArray, BinaryViewArray,
StructArray};
+use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
use arrow::buffer::NullBuffer;
+use arrow::compute::cast;
use arrow::datatypes::{
Date32Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type,
Int64Type, Int8Type,
UInt16Type, UInt32Type, UInt64Type, UInt8Type,
};
+use arrow_schema::extension::ExtensionType;
use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields};
use parquet_variant::Uuid;
use parquet_variant::Variant;
-use std::any::Any;
use std::sync::Arc;
+/// Arrow Variant [`ExtensionType`].
+///
+/// Represents the canonical Arrow Extension Type for storing variants.
+/// See [`VariantArray`] for more examples of using this extension type.
+pub struct VariantType;
+
+impl ExtensionType for VariantType {
+ const NAME: &'static str = "parquet.variant";
+
+ // Variants have no extension metadata
+ type Metadata = ();
+
+ fn metadata(&self) -> &Self::Metadata {
+ &()
+ }
+
+ fn serialize_metadata(&self) -> Option<String> {
+ None
+ }
+
+ fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata,
ArrowError> {
+ Ok(())
+ }
+
+ fn supports_data_type(&self, data_type: &DataType) -> Result<(),
ArrowError> {
+ // Note don't check for metadata/value fields here because they may be
+ // absent in shredded variants
Review Comment:
This doesn't sound quite right -- AFAIK the extension type should only
annotate the top-level variant. Any shredded object fields or array elements
nested inside should be identified according to spec, because at that point we
know we're traversing a shredded variant (by virtue of having stepped into the
`typed_value` column of a `VariantArray`). If anything looks out of place when
trying to step into a shredded struct field or array element, that's just plain
an error. No annotations needed to figure that out.
Even if, for some reason, we decide it's important not to require
`metadata`, we should still enforce that the only fields present are some
subset of `metadata`, `value` and `typed_value`, where the first two are
binary. TBD whether we recurse into `typed_value` to verify the rest of the
structure -- we probably should, but that could become expensive if people are
converting from ArrayRef to VariantArray to ArrayRef a lot.
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -647,70 +883,57 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index:
usize) -> Variant<'_, '
}
}
-impl Array for VariantArray {
- fn as_any(&self) -> &dyn Any {
- self
- }
-
- fn to_data(&self) -> ArrayData {
- self.inner.to_data()
- }
-
- fn into_data(self) -> ArrayData {
- self.inner.into_data()
- }
-
- fn data_type(&self) -> &DataType {
- self.inner.data_type()
- }
-
- fn slice(&self, offset: usize, length: usize) -> ArrayRef {
- let inner = self.inner.slice(offset, length);
- let metadata = self.metadata.slice(offset, length);
- let shredding_state = self.shredding_state.slice(offset, length);
- Arc::new(Self {
- inner,
- metadata,
- shredding_state,
- })
- }
-
- fn len(&self) -> usize {
- self.inner.len()
- }
-
- fn is_empty(&self) -> bool {
- self.inner.is_empty()
- }
-
- fn offset(&self) -> usize {
- self.inner.offset()
- }
-
- fn nulls(&self) -> Option<&NullBuffer> {
- self.inner.nulls()
- }
-
- fn get_buffer_memory_size(&self) -> usize {
- self.inner.get_buffer_memory_size()
- }
+/// Workaround for lack of direct support for BinaryArray
+/// <https://github.com/apache/arrow-rs/issues/8387>
+///
+/// The values are read as
+/// * `StructArray<metadata: Binary, value: Binary>`
+///
+/// but VariantArray needs them as
+/// * `StructArray<metadata: BinaryView, value: BinaryView>`
+///
+/// So cast them to get the right type.
+fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef,
ArrowError> {
+ let new_type = map_type(array.data_type());
+ cast(array, &new_type)
+}
- fn get_array_memory_size(&self) -> usize {
- self.inner.get_array_memory_size()
+/// replaces all instances of Binary with BinaryView in a DataType
+fn map_type(data_type: &DataType) -> DataType {
Review Comment:
"map" makes it sound like `DataType::Map`? Could we choose a clearer name?
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -18,36 +18,194 @@
//! [`VariantArray`] implementation
use crate::type_conversion::primitive_conversion_single_value;
-use arrow::array::{Array, ArrayData, ArrayRef, AsArray, BinaryViewArray,
StructArray};
+use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
use arrow::buffer::NullBuffer;
+use arrow::compute::cast;
use arrow::datatypes::{
Date32Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type,
Int64Type, Int8Type,
UInt16Type, UInt32Type, UInt64Type, UInt8Type,
};
+use arrow_schema::extension::ExtensionType;
use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields};
use parquet_variant::Uuid;
use parquet_variant::Variant;
-use std::any::Any;
use std::sync::Arc;
+/// Arrow Variant [`ExtensionType`].
+///
+/// Represents the canonical Arrow Extension Type for storing variants.
+/// See [`VariantArray`] for more examples of using this extension type.
+pub struct VariantType;
+
+impl ExtensionType for VariantType {
+ const NAME: &'static str = "parquet.variant";
+
+ // Variants have no extension metadata
+ type Metadata = ();
+
+ fn metadata(&self) -> &Self::Metadata {
+ &()
+ }
+
+ fn serialize_metadata(&self) -> Option<String> {
+ None
+ }
+
+ fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata,
ArrowError> {
+ Ok(())
+ }
+
+ fn supports_data_type(&self, data_type: &DataType) -> Result<(),
ArrowError> {
+ // Note don't check for metadata/value fields here because they may be
+ // absent in shredded variants
+ if matches!(data_type, DataType::Struct(_)) {
+ Ok(())
+ } else {
+ Err(ArrowError::InvalidArgumentError(format!(
+ "VariantType only supports StructArray, got {}",
+ data_type
+ )))
+ }
+ }
+
+ fn try_new(data_type: &DataType, _metadata: Self::Metadata) ->
Result<Self, ArrowError> {
+ let new_self = Self;
+ new_self.supports_data_type(data_type)?;
+ Ok(new_self)
Review Comment:
I would think this should work?
```suggestion
Self.supports_data_type(data_type)?;
Ok(Self)
```
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -647,70 +883,57 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index:
usize) -> Variant<'_, '
}
}
-impl Array for VariantArray {
- fn as_any(&self) -> &dyn Any {
- self
- }
-
- fn to_data(&self) -> ArrayData {
- self.inner.to_data()
- }
-
- fn into_data(self) -> ArrayData {
- self.inner.into_data()
- }
-
- fn data_type(&self) -> &DataType {
- self.inner.data_type()
- }
-
- fn slice(&self, offset: usize, length: usize) -> ArrayRef {
- let inner = self.inner.slice(offset, length);
- let metadata = self.metadata.slice(offset, length);
- let shredding_state = self.shredding_state.slice(offset, length);
- Arc::new(Self {
- inner,
- metadata,
- shredding_state,
- })
- }
-
- fn len(&self) -> usize {
- self.inner.len()
- }
-
- fn is_empty(&self) -> bool {
- self.inner.is_empty()
- }
-
- fn offset(&self) -> usize {
- self.inner.offset()
- }
-
- fn nulls(&self) -> Option<&NullBuffer> {
- self.inner.nulls()
- }
-
- fn get_buffer_memory_size(&self) -> usize {
- self.inner.get_buffer_memory_size()
- }
+/// Workaround for lack of direct support for BinaryArray
+/// <https://github.com/apache/arrow-rs/issues/8387>
+///
+/// The values are read as
+/// * `StructArray<metadata: Binary, value: Binary>`
+///
+/// but VariantArray needs them as
+/// * `StructArray<metadata: BinaryView, value: BinaryView>`
+///
+/// So cast them to get the right type.
+fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef,
ArrowError> {
+ let new_type = map_type(array.data_type());
+ cast(array, &new_type)
+}
- fn get_array_memory_size(&self) -> usize {
- self.inner.get_array_memory_size()
+/// replaces all instances of Binary with BinaryView in a DataType
+fn map_type(data_type: &DataType) -> DataType {
+ match data_type {
+ DataType::Binary => DataType::BinaryView,
+ DataType::List(field) => {
+ let new_field = field
+ .as_ref()
+ .clone()
+ .with_data_type(map_type(field.data_type()));
+ DataType::List(Arc::new(new_field))
+ }
+ DataType::Struct(fields) => {
+ let new_fields: Fields = fields
+ .iter()
+ .map(|f| {
+ let new_field =
f.as_ref().clone().with_data_type(map_type(f.data_type()));
+ Arc::new(new_field)
+ })
+ .collect();
+ DataType::Struct(new_fields)
Review Comment:
You _might_ be able to cheat `fmt` by:
```suggestion
let new_fields = fields.iter().map(|f| {
Arc::new(f.as_ref().clone().with_data_type(map_type(f.data_type())))
});
DataType::Struct(new_fields.collect())
```
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -242,6 +404,67 @@ impl VariantArray {
pub fn typed_value_field(&self) -> Option<&ArrayRef> {
self.shredding_state.typed_value_field()
}
+
+ /// Return a field to represent this VariantArray in a `Schema` with
+ /// a particular name
+ pub fn field(&self, name: impl Into<String>) -> Field {
+ Field::new(
+ name.into(),
+ self.data_type().clone(),
+ self.inner.is_nullable(),
+ )
+ .with_extension_type(VariantType)
+ }
+
+ /// Returns a new DataType representing this VariantArray's inner type
+ pub fn data_type(&self) -> &DataType {
+ self.inner.data_type()
+ }
+
+ pub fn slice(&self, offset: usize, length: usize) -> Self {
+ let inner = self.inner.slice(offset, length);
+ let metadata = self.metadata.slice(offset, length);
+ let shredding_state = self.shredding_state.slice(offset, length);
+ Self {
+ inner,
+ metadata,
+ shredding_state,
+ }
+ }
+
+ pub fn len(&self) -> usize {
+ self.inner.len()
+ }
+
+ pub fn is_empty(&self) -> bool {
+ self.inner.is_empty()
+ }
+
+ pub fn nulls(&self) -> Option<&NullBuffer> {
+ self.inner.nulls()
+ }
+
+ /// Is the element at index null?
+ pub fn is_null(&self, index: usize) -> bool {
+ self.nulls().map(|n| n.is_null(index)).unwrap_or_default()
Review Comment:
See other comment about `map_or`
##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -87,20 +88,19 @@ pub(crate) fn follow_shredded_path_element<'a>(
return Ok(missing_path_step());
};
- let field = field
- .as_any()
- .downcast_ref::<ShreddedVariantFieldArray>()
- .ok_or_else(|| {
- // TODO: Should we blow up? Or just end the traversal and
let the normal
- // variant pathing code sort out the mess that it must
anyway be
- // prepared to handle?
- ArrowError::InvalidArgumentError(format!(
- "Expected a ShreddedVariantFieldArray, got {:?}
instead",
- field.data_type(),
- ))
- })?;
-
- Ok(ShreddedPathStep::Success(field.shredding_state()))
+ let struct_array = field.as_struct_opt().ok_or_else(|| {
+ // TODO: Should we blow up? Or just end the traversal and let
the normal
+ // variant pathing code sort out the mess that it must anyway
be
+ // prepared to handle?
+ ArrowError::InvalidArgumentError(format!(
+ "Expected Struct array while following path, got {}",
+ field.data_type(),
+ ))
+ })?;
+
+ let shredding_state = ShreddingState::from(struct_array);
+
+ Ok(ShreddedPathStep::Success(shredding_state))
Review Comment:
```suggestion
Ok(ShreddedPathStep::Success(struct_array.into()))
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]