alamb commented on code in PR #8392:
URL: https://github.com/apache/arrow-rs/pull/8392#discussion_r2363903020


##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -18,36 +18,194 @@
 //! [`VariantArray`] implementation
 
 use crate::type_conversion::primitive_conversion_single_value;
-use arrow::array::{Array, ArrayData, ArrayRef, AsArray, BinaryViewArray, 
StructArray};
+use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
 use arrow::buffer::NullBuffer;
+use arrow::compute::cast;
 use arrow::datatypes::{
     Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, 
Int8Type, UInt16Type,
     UInt32Type, UInt64Type, UInt8Type,
 };
+use arrow_schema::extension::ExtensionType;
 use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields};
 use parquet_variant::Uuid;
 use parquet_variant::Variant;
-use std::any::Any;
 use std::sync::Arc;
 
+/// Arrow Variant [`ExtensionType`].

Review Comment:
   Here is the new canonical variant type



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -18,36 +18,194 @@
 //! [`VariantArray`] implementation
 
 use crate::type_conversion::primitive_conversion_single_value;
-use arrow::array::{Array, ArrayData, ArrayRef, AsArray, BinaryViewArray, 
StructArray};
+use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
 use arrow::buffer::NullBuffer;
+use arrow::compute::cast;
 use arrow::datatypes::{
     Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, 
Int8Type, UInt16Type,
     UInt32Type, UInt64Type, UInt8Type,
 };
+use arrow_schema::extension::ExtensionType;
 use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields};
 use parquet_variant::Uuid;
 use parquet_variant::Variant;
-use std::any::Any;
 use std::sync::Arc;
 
+/// Arrow Variant [`ExtensionType`].
+///
+/// Represents the canonical Arrow Extension Type for storing variants.
+/// See [`VariantArray`] for more examples of using this extension type.
+pub struct VariantType;
+
+impl ExtensionType for VariantType {
+    const NAME: &'static str = "parquet.variant";
+
+    // Variants have no extension metadata
+    type Metadata = ();
+
+    fn metadata(&self) -> &Self::Metadata {
+        &()
+    }
+
+    fn serialize_metadata(&self) -> Option<String> {
+        None
+    }
+
+    fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata, 
ArrowError> {
+        Ok(())
+    }
+
+    fn supports_data_type(&self, data_type: &DataType) -> Result<(), 
ArrowError> {
+        // Note don't check for metadata/value fields here because they may be
+        // absent in shredded variants
+        if matches!(data_type, DataType::Struct(_)) {
+            Ok(())
+        } else {
+            Err(ArrowError::InvalidArgumentError(format!(
+                "VariantType only supports StructArray, got {}",
+                data_type
+            )))
+        }
+    }
+
+    fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> 
Result<Self, ArrowError> {
+        let new_self = Self;
+        new_self.supports_data_type(data_type)?;
+        Ok(new_self)
+    }
+}
+
 /// An array of Parquet [`Variant`] values
 ///

Review Comment:
   I tried to update these docstrings to explain what is going on here and how 
to use the extension types



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -318,17 +539,9 @@ impl ShreddedVariantFieldArray {
             ));
         };
 
-        // Extract value and typed_value fields (metadata is not expected in 
ShreddedVariantFieldArray)
-        let value = inner_struct

Review Comment:
   this code is moved to `ShreddingState`



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -23,15 +23,16 @@ use arrow::{
 use arrow_schema::{ArrowError, DataType, FieldRef};
 use parquet_variant::{VariantPath, VariantPathElement};
 
-use crate::variant_array::{ShreddedVariantFieldArray, ShreddingState};
+use crate::variant_array::ShreddingState;
 use crate::variant_to_arrow::make_variant_to_arrow_row_builder;
 use crate::VariantArray;
 
+use arrow::array::AsArray;
 use std::sync::Arc;
 
-pub(crate) enum ShreddedPathStep<'a> {
+pub(crate) enum ShreddedPathStep {
     /// Path step succeeded, return the new shredding state
-    Success(&'a ShreddingState),
+    Success(ShreddingState),

Review Comment:
   this is the only part I am a little sad about -- that it now requires owing 
the `ShreddingState` which requires clone the state. While this is typically 
just a few `Arc`s it is still unfortunate
   
   However, unless we change VariantArray to be reference based (e.g. 
`VariantArray<'a>`) I can't think of any way around it



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -627,70 +847,57 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: 
usize) -> Variant<'_, '
     }
 }
 
-impl Array for VariantArray {

Review Comment:
   this was moved up in the file



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -246,6 +406,67 @@ impl VariantArray {
     pub fn typed_value_field(&self) -> Option<&ArrayRef> {
         self.shredding_state.typed_value_field()
     }
+
+    /// Return a field to represent this VariantArray in a `Schema` with
+    /// a particular name
+    pub fn field(&self, name: impl Into<String>) -> Field {
+        Field::new(
+            name.into(),
+            self.data_type().clone(),
+            self.inner.is_nullable(),
+        )
+        .with_extension_type(VariantType)
+    }
+
+    /// Returns a new DataType representing this VariantArray's inner type
+    pub fn data_type(&self) -> &DataType {

Review Comment:
   these methods are just moved from the Array impl



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -456,16 +655,26 @@ pub enum ShreddingState {
 }
 
 impl ShreddingState {
-    /// try to create a new `ShreddingState` from the given fields
-    pub fn try_new(
-        value: Option<BinaryViewArray>,
-        typed_value: Option<ArrayRef>,
-    ) -> Result<Self, ArrowError> {
+    /// try to create a new `ShreddingState` from the given `value` and 
`typed_value` fields
+    ///
+    /// Note you can create a `ShreddingState` from a &[`StructArray`] using
+    /// `ShreddingState::try_from(&struct_array)`, for example:
+    ///
+    /// ```no_run
+    /// # use arrow::array::StructArray;
+    /// # use parquet_variant_compute::ShreddingState;
+    /// # fn get_struct_array() -> StructArray {
+    /// #   unimplemented!()
+    /// # }
+    /// let struct_array: StructArray = get_struct_array();
+    /// let shredding_state = ShreddingState::try_from(&struct_array).unwrap();
+    /// ```
+    pub fn new(value: Option<BinaryViewArray>, typed_value: Option<ArrayRef>) 
-> Self {

Review Comment:
   This is now infallable per @scovich 's suggestion 
https://github.com/apache/arrow-rs/pull/8365#discussion_r2362345781



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -88,7 +246,11 @@ impl VariantArray {
     /// int8.
     ///
     /// Currently, only [`BinaryViewArray`] are supported.
-    pub fn try_new(inner: ArrayRef) -> Result<Self, ArrowError> {
+    pub fn try_new(inner: &dyn Array) -> Result<Self, ArrowError> {
+        // Workaround lack of support for Binary
+        // https://github.com/apache/arrow-rs/issues/8387
+        let inner = cast_to_binary_view_arrays(inner)?;

Review Comment:
   This cast is unfortunate, but I think it is the best (simplest) solution 
while we work on the real solution
   - https://github.com/apache/arrow-rs/issues/8387



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -242,19 +242,15 @@ fn shredded_get_path(
 /// quickly become annoying (and inefficient) to call `variant_get` for each 
leaf value in a struct or
 /// list and then try to assemble the results.
 pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result<ArrayRef> {
-    let variant_array: &VariantArray = 
input.as_any().downcast_ref().ok_or_else(|| {
-        ArrowError::InvalidArgumentError(
-            "expected a VariantArray as the input for variant_get".to_owned(),
-        )
-    })?;
+    let variant_array = VariantArray::try_new(input)?;

Review Comment:
   I will admit the new `try_new` is quite a bit more ergonomic



##########
parquet/tests/variant_integration.rs:
##########
@@ -399,57 +396,12 @@ impl VariantTestCase {
             .column_by_name("var")
             .unwrap_or_else(|| panic!("No 'var' column found in parquet file 
{path:?}"));
 
-        // the values are read as

Review Comment:
   this code is moved to `VariantArray::try_new`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to