alamb commented on code in PR #8392:
URL: https://github.com/apache/arrow-rs/pull/8392#discussion_r2365564360
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -18,36 +18,194 @@
//! [`VariantArray`] implementation
use crate::type_conversion::primitive_conversion_single_value;
-use arrow::array::{Array, ArrayData, ArrayRef, AsArray, BinaryViewArray,
StructArray};
+use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
use arrow::buffer::NullBuffer;
+use arrow::compute::cast;
use arrow::datatypes::{
Date32Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type,
Int64Type, Int8Type,
UInt16Type, UInt32Type, UInt64Type, UInt8Type,
};
+use arrow_schema::extension::ExtensionType;
use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields};
use parquet_variant::Uuid;
use parquet_variant::Variant;
-use std::any::Any;
use std::sync::Arc;
+/// Arrow Variant [`ExtensionType`].
+///
+/// Represents the canonical Arrow Extension Type for storing variants.
+/// See [`VariantArray`] for more examples of using this extension type.
+pub struct VariantType;
+
+impl ExtensionType for VariantType {
+ const NAME: &'static str = "parquet.variant";
+
+ // Variants have no extension metadata
+ type Metadata = ();
+
+ fn metadata(&self) -> &Self::Metadata {
+ &()
+ }
+
+ fn serialize_metadata(&self) -> Option<String> {
+ None
+ }
+
+ fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata,
ArrowError> {
+ Ok(())
+ }
+
+ fn supports_data_type(&self, data_type: &DataType) -> Result<(),
ArrowError> {
+ // Note don't check for metadata/value fields here because they may be
+ // absent in shredded variants
+ if matches!(data_type, DataType::Struct(_)) {
+ Ok(())
+ } else {
+ Err(ArrowError::InvalidArgumentError(format!(
+ "VariantType only supports StructArray, got {}",
+ data_type
+ )))
+ }
+ }
+
+ fn try_new(data_type: &DataType, _metadata: Self::Metadata) ->
Result<Self, ArrowError> {
+ let new_self = Self;
+ new_self.supports_data_type(data_type)?;
+ Ok(new_self)
Review Comment:
done!
##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -23,15 +23,16 @@ use arrow::{
use arrow_schema::{ArrowError, DataType, FieldRef};
use parquet_variant::{VariantPath, VariantPathElement};
-use crate::variant_array::{ShreddedVariantFieldArray, ShreddingState};
+use crate::variant_array::ShreddingState;
use crate::variant_to_arrow::make_variant_to_arrow_row_builder;
use crate::VariantArray;
+use arrow::array::AsArray;
use std::sync::Arc;
-pub(crate) enum ShreddedPathStep<'a> {
+pub(crate) enum ShreddedPathStep {
/// Path step succeeded, return the new shredding state
- Success(&'a ShreddingState),
+ Success(ShreddingState),
Review Comment:
I think BinaryViewArrays also have a `Vec` that will get cloned. However, I
think it is ok to clone for now, as the path / cloning happens once *per array*
and so is likely amortized over a batch.
I tried option 1 briefly but I couldn't get it to work.
I'll let benchmarking be our guide and if we see problems with this clone,
then perhaps we can pursue options 2 or 3.
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -368,59 +585,54 @@ impl ShreddedVariantFieldArray {
shredding_state: ShreddingState::new(value, typed_value),
}
}
-}
-impl Array for ShreddedVariantFieldArray {
- fn as_any(&self) -> &dyn Any {
- self
- }
-
- fn to_data(&self) -> ArrayData {
- self.inner.to_data()
- }
-
- fn into_data(self) -> ArrayData {
- self.inner.into_data()
+ /// Returns the inner [`StructArray`], consuming self
+ pub fn into_inner(self) -> StructArray {
+ self.inner
}
- fn data_type(&self) -> &DataType {
+ pub fn data_type(&self) -> &DataType {
self.inner.data_type()
}
- fn slice(&self, offset: usize, length: usize) -> ArrayRef {
- let inner = self.inner.slice(offset, length);
- let shredding_state = self.shredding_state.slice(offset, length);
- Arc::new(Self {
- inner,
- shredding_state,
- })
- }
-
- fn len(&self) -> usize {
+ pub fn len(&self) -> usize {
self.inner.len()
}
- fn is_empty(&self) -> bool {
+ pub fn is_empty(&self) -> bool {
self.inner.is_empty()
}
- fn offset(&self) -> usize {
+ pub fn offset(&self) -> usize {
self.inner.offset()
}
- fn nulls(&self) -> Option<&NullBuffer> {
+ pub fn nulls(&self) -> Option<&NullBuffer> {
// According to the shredding spec, ShreddedVariantFieldArray should be
// physically non-nullable - SQL NULL is inferred by both value and
// typed_value being physically NULL
None
}
+ /// Is the element at index null?
+ pub fn is_null(&self, index: usize) -> bool {
+ self.nulls().map(|n| n.is_null(index)).unwrap_or_default()
Review Comment:
Yes, and in fact when I did that clippy told me it could be simplified
further to
```rust
self.nulls().is_some_and(|n| n.is_null(index))
```
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -647,70 +883,57 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index:
usize) -> Variant<'_, '
}
}
-impl Array for VariantArray {
- fn as_any(&self) -> &dyn Any {
- self
- }
-
- fn to_data(&self) -> ArrayData {
- self.inner.to_data()
- }
-
- fn into_data(self) -> ArrayData {
- self.inner.into_data()
- }
-
- fn data_type(&self) -> &DataType {
- self.inner.data_type()
- }
-
- fn slice(&self, offset: usize, length: usize) -> ArrayRef {
- let inner = self.inner.slice(offset, length);
- let metadata = self.metadata.slice(offset, length);
- let shredding_state = self.shredding_state.slice(offset, length);
- Arc::new(Self {
- inner,
- metadata,
- shredding_state,
- })
- }
-
- fn len(&self) -> usize {
- self.inner.len()
- }
-
- fn is_empty(&self) -> bool {
- self.inner.is_empty()
- }
-
- fn offset(&self) -> usize {
- self.inner.offset()
- }
-
- fn nulls(&self) -> Option<&NullBuffer> {
- self.inner.nulls()
- }
-
- fn get_buffer_memory_size(&self) -> usize {
- self.inner.get_buffer_memory_size()
- }
+/// Workaround for lack of direct support for BinaryArray
+/// <https://github.com/apache/arrow-rs/issues/8387>
+///
+/// The values are read as
+/// * `StructArray<metadata: Binary, value: Binary>`
+///
+/// but VariantArray needs them as
+/// * `StructArray<metadata: BinaryView, value: BinaryView>`
+///
+/// So cast them to get the right type.
+fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef,
ArrowError> {
+ let new_type = map_type(array.data_type());
+ cast(array, &new_type)
+}
- fn get_array_memory_size(&self) -> usize {
- self.inner.get_array_memory_size()
+/// replaces all instances of Binary with BinaryView in a DataType
+fn map_type(data_type: &DataType) -> DataType {
+ match data_type {
+ DataType::Binary => DataType::BinaryView,
+ DataType::List(field) => {
+ let new_field = field
+ .as_ref()
+ .clone()
+ .with_data_type(map_type(field.data_type()));
+ DataType::List(Arc::new(new_field))
+ }
+ DataType::Struct(fields) => {
+ let new_fields: Fields = fields
+ .iter()
+ .map(|f| {
+ let new_field =
f.as_ref().clone().with_data_type(map_type(f.data_type()));
+ Arc::new(new_field)
+ })
+ .collect();
+ DataType::Struct(new_fields)
Review Comment:
Good call -- I messed around and found an even better way by factoring this
out into a function.
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -647,70 +883,57 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index:
usize) -> Variant<'_, '
}
}
-impl Array for VariantArray {
- fn as_any(&self) -> &dyn Any {
- self
- }
-
- fn to_data(&self) -> ArrayData {
- self.inner.to_data()
- }
-
- fn into_data(self) -> ArrayData {
- self.inner.into_data()
- }
-
- fn data_type(&self) -> &DataType {
- self.inner.data_type()
- }
-
- fn slice(&self, offset: usize, length: usize) -> ArrayRef {
- let inner = self.inner.slice(offset, length);
- let metadata = self.metadata.slice(offset, length);
- let shredding_state = self.shredding_state.slice(offset, length);
- Arc::new(Self {
- inner,
- metadata,
- shredding_state,
- })
- }
-
- fn len(&self) -> usize {
- self.inner.len()
- }
-
- fn is_empty(&self) -> bool {
- self.inner.is_empty()
- }
-
- fn offset(&self) -> usize {
- self.inner.offset()
- }
-
- fn nulls(&self) -> Option<&NullBuffer> {
- self.inner.nulls()
- }
-
- fn get_buffer_memory_size(&self) -> usize {
- self.inner.get_buffer_memory_size()
- }
+/// Workaround for lack of direct support for BinaryArray
+/// <https://github.com/apache/arrow-rs/issues/8387>
+///
+/// The values are read as
+/// * `StructArray<metadata: Binary, value: Binary>`
+///
+/// but VariantArray needs them as
+/// * `StructArray<metadata: BinaryView, value: BinaryView>`
+///
+/// So cast them to get the right type.
+fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef,
ArrowError> {
+ let new_type = map_type(array.data_type());
+ cast(array, &new_type)
+}
- fn get_array_memory_size(&self) -> usize {
- self.inner.get_array_memory_size()
+/// replaces all instances of Binary with BinaryView in a DataType
+fn map_type(data_type: &DataType) -> DataType {
Review Comment:
Changed to `rewrite_to_view_types`
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -18,36 +18,194 @@
//! [`VariantArray`] implementation
use crate::type_conversion::primitive_conversion_single_value;
-use arrow::array::{Array, ArrayData, ArrayRef, AsArray, BinaryViewArray,
StructArray};
+use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
use arrow::buffer::NullBuffer;
+use arrow::compute::cast;
use arrow::datatypes::{
Date32Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type,
Int64Type, Int8Type,
UInt16Type, UInt32Type, UInt64Type, UInt8Type,
};
+use arrow_schema::extension::ExtensionType;
use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields};
use parquet_variant::Uuid;
use parquet_variant::Variant;
-use std::any::Any;
use std::sync::Arc;
+/// Arrow Variant [`ExtensionType`].
+///
+/// Represents the canonical Arrow Extension Type for storing variants.
+/// See [`VariantArray`] for more examples of using this extension type.
+pub struct VariantType;
+
+impl ExtensionType for VariantType {
+ const NAME: &'static str = "parquet.variant";
+
+ // Variants have no extension metadata
+ type Metadata = ();
+
+ fn metadata(&self) -> &Self::Metadata {
+ &()
+ }
+
+ fn serialize_metadata(&self) -> Option<String> {
+ None
+ }
+
+ fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata,
ArrowError> {
+ Ok(())
+ }
+
+ fn supports_data_type(&self, data_type: &DataType) -> Result<(),
ArrowError> {
+ // Note don't check for metadata/value fields here because they may be
+ // absent in shredded variants
Review Comment:
sounds good -- removed the comments
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]