scovich commented on code in PR #8392:
URL: https://github.com/apache/arrow-rs/pull/8392#discussion_r2364904431


##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -368,59 +585,54 @@ impl ShreddedVariantFieldArray {
             shredding_state: ShreddingState::new(value, typed_value),
         }
     }
-}
 
-impl Array for ShreddedVariantFieldArray {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn to_data(&self) -> ArrayData {
-        self.inner.to_data()
-    }
-
-    fn into_data(self) -> ArrayData {
-        self.inner.into_data()
+    /// Returns the inner [`StructArray`], consuming self
+    pub fn into_inner(self) -> StructArray {
+        self.inner
     }
 
-    fn data_type(&self) -> &DataType {
+    pub fn data_type(&self) -> &DataType {
         self.inner.data_type()
     }
 
-    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
-        let inner = self.inner.slice(offset, length);
-        let shredding_state = self.shredding_state.slice(offset, length);
-        Arc::new(Self {
-            inner,
-            shredding_state,
-        })
-    }
-
-    fn len(&self) -> usize {
+    pub fn len(&self) -> usize {
         self.inner.len()
     }
 
-    fn is_empty(&self) -> bool {
+    pub fn is_empty(&self) -> bool {
         self.inner.is_empty()
     }
 
-    fn offset(&self) -> usize {
+    pub fn offset(&self) -> usize {
         self.inner.offset()
     }
 
-    fn nulls(&self) -> Option<&NullBuffer> {
+    pub fn nulls(&self) -> Option<&NullBuffer> {
         // According to the shredding spec, ShreddedVariantFieldArray should be
         // physically non-nullable - SQL NULL is inferred by both value and
         // typed_value being physically NULL
         None
     }
+    /// Is the element at index null?
+    pub fn is_null(&self, index: usize) -> bool {
+        self.nulls().map(|n| n.is_null(index)).unwrap_or_default()

Review Comment:
   Isn't this just
   ```suggestion
           self.nulls().map_or(false, |n| n.is_null(index))
   ```
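   (As a quick standalone check of the equivalence -- plain `&[bool]` stands in for arrow's `NullBuffer` here, so this is illustrative only:)
   ```rust
   // Illustrative only: `&[bool]` stands in for arrow's `NullBuffer`.
   fn via_map_or(nulls: Option<&[bool]>, index: usize) -> bool {
       // `map_or(false, ..)` makes the "no null buffer" default explicit.
       nulls.map_or(false, |n| n[index])
   }

   fn via_unwrap_or_default(nulls: Option<&[bool]>, index: usize) -> bool {
       // Equivalent, since `bool::default()` is `false`.
       nulls.map(|n| n[index]).unwrap_or_default()
   }

   fn main() {
       let nulls = [true, false];
       assert_eq!(via_map_or(Some(nulls.as_slice()), 0), via_unwrap_or_default(Some(nulls.as_slice()), 0));
       assert_eq!(via_map_or(None, 0), via_unwrap_or_default(None, 0));
   }
   ```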



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -18,36 +18,194 @@
 //! [`VariantArray`] implementation
 
 use crate::type_conversion::primitive_conversion_single_value;
-use arrow::array::{Array, ArrayData, ArrayRef, AsArray, BinaryViewArray, StructArray};
+use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
 use arrow::buffer::NullBuffer;
+use arrow::compute::cast;
 use arrow::datatypes::{
     Date32Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type,
     UInt16Type, UInt32Type, UInt64Type, UInt8Type,
 };
+use arrow_schema::extension::ExtensionType;
 use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields};
 use parquet_variant::Uuid;
 use parquet_variant::Variant;
-use std::any::Any;
 use std::sync::Arc;
 
+/// Arrow Variant [`ExtensionType`].
+///
+/// Represents the canonical Arrow Extension Type for storing variants.
+/// See [`VariantArray`] for more examples of using this extension type.
+pub struct VariantType;
+
+impl ExtensionType for VariantType {
+    const NAME: &'static str = "parquet.variant";
+
+    // Variants have no extension metadata
+    type Metadata = ();
+
+    fn metadata(&self) -> &Self::Metadata {
+        &()
+    }
+
+    fn serialize_metadata(&self) -> Option<String> {
+        None
+    }
+
+    fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
+        Ok(())
+    }
+
+    fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
+        // Note don't check for metadata/value fields here because they may be
+        // absent in shredded variants

Review Comment:
   This doesn't sound quite right -- AFAIK the extension type should only annotate the top-level variant. Any shredded object fields or array elements nested inside should be identified according to the spec, because at that point we know we're traversing a shredded variant (by virtue of having stepped into the `typed_value` column of a `VariantArray`). If anything looks out of place when trying to step into a shredded struct field or array element, that's simply an error. No annotations are needed to figure that out.
   
   Even if, for some reason, we decide it's important not to require `metadata`, we should still enforce that the only fields present are some subset of `metadata`, `value`, and `typed_value`, where the first two are binary. TBD whether we recurse into `typed_value` to verify the rest of the structure -- we probably should, but that could become expensive if people convert from `ArrayRef` to `VariantArray` to `ArrayRef` a lot.
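   For illustration, a minimal sketch of that subset check -- `validate_variant_struct` is a hypothetical free helper, not code from this PR, and whether `BinaryView` should also be accepted is left open here:
   ```rust
   use arrow_schema::{ArrowError, DataType};

   // Hypothetical helper: only some subset of `metadata`, `value`, and
   // `typed_value` may be present, and the first two must be binary.
   fn validate_variant_struct(data_type: &DataType) -> Result<(), ArrowError> {
       let DataType::Struct(fields) = data_type else {
           return Err(ArrowError::InvalidArgumentError(format!(
               "VariantType only supports StructArray, got {data_type}"
           )));
       };
       for field in fields.iter() {
           match (field.name().as_str(), field.data_type()) {
               ("metadata" | "value", DataType::Binary | DataType::BinaryView) => {}
               // TBD: recurse into typed_value to verify the shredded structure.
               ("typed_value", _) => {}
               (name, other) => {
                   return Err(ArrowError::InvalidArgumentError(format!(
                       "unexpected variant field {name}: {other}"
                   )));
               }
           }
       }
       Ok(())
   }
   ```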



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -647,70 +883,57 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, '
     }
 }
 
-impl Array for VariantArray {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn to_data(&self) -> ArrayData {
-        self.inner.to_data()
-    }
-
-    fn into_data(self) -> ArrayData {
-        self.inner.into_data()
-    }
-
-    fn data_type(&self) -> &DataType {
-        self.inner.data_type()
-    }
-
-    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
-        let inner = self.inner.slice(offset, length);
-        let metadata = self.metadata.slice(offset, length);
-        let shredding_state = self.shredding_state.slice(offset, length);
-        Arc::new(Self {
-            inner,
-            metadata,
-            shredding_state,
-        })
-    }
-
-    fn len(&self) -> usize {
-        self.inner.len()
-    }
-
-    fn is_empty(&self) -> bool {
-        self.inner.is_empty()
-    }
-
-    fn offset(&self) -> usize {
-        self.inner.offset()
-    }
-
-    fn nulls(&self) -> Option<&NullBuffer> {
-        self.inner.nulls()
-    }
-
-    fn get_buffer_memory_size(&self) -> usize {
-        self.inner.get_buffer_memory_size()
-    }
+/// Workaround for lack of direct support for BinaryArray
+/// <https://github.com/apache/arrow-rs/issues/8387>
+///
+/// The values are read as
+/// * `StructArray<metadata: Binary, value: Binary>`
+///
+/// but VariantArray needs them as
+/// * `StructArray<metadata: BinaryView, value: BinaryView>`
+///
+/// So cast them to get the right type.
+fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
+    let new_type = map_type(array.data_type());
+    cast(array, &new_type)
+}
 
-    fn get_array_memory_size(&self) -> usize {
-        self.inner.get_array_memory_size()
+/// replaces all instances of Binary with BinaryView in a DataType
+fn map_type(data_type: &DataType) -> DataType {

Review Comment:
   "map" makes it sound like `DataType::Map`? Could we choose a clearer name?



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -18,36 +18,194 @@
 //! [`VariantArray`] implementation
 
 use crate::type_conversion::primitive_conversion_single_value;
-use arrow::array::{Array, ArrayData, ArrayRef, AsArray, BinaryViewArray, StructArray};
+use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
 use arrow::buffer::NullBuffer;
+use arrow::compute::cast;
 use arrow::datatypes::{
     Date32Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type,
     UInt16Type, UInt32Type, UInt64Type, UInt8Type,
 };
+use arrow_schema::extension::ExtensionType;
 use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields};
 use parquet_variant::Uuid;
 use parquet_variant::Variant;
-use std::any::Any;
 use std::sync::Arc;
 
+/// Arrow Variant [`ExtensionType`].
+///
+/// Represents the canonical Arrow Extension Type for storing variants.
+/// See [`VariantArray`] for more examples of using this extension type.
+pub struct VariantType;
+
+impl ExtensionType for VariantType {
+    const NAME: &'static str = "parquet.variant";
+
+    // Variants have no extension metadata
+    type Metadata = ();
+
+    fn metadata(&self) -> &Self::Metadata {
+        &()
+    }
+
+    fn serialize_metadata(&self) -> Option<String> {
+        None
+    }
+
+    fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
+        Ok(())
+    }
+
+    fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
+        // Note don't check for metadata/value fields here because they may be
+        // absent in shredded variants
+        if matches!(data_type, DataType::Struct(_)) {
+            Ok(())
+        } else {
+            Err(ArrowError::InvalidArgumentError(format!(
+                "VariantType only supports StructArray, got {}",
+                data_type
+            )))
+        }
+    }
+
+    fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self, ArrowError> {
+        let new_self = Self;
+        new_self.supports_data_type(data_type)?;
+        Ok(new_self)

Review Comment:
   I would think this should work?
   ```suggestion
           Self.supports_data_type(data_type)?;
           Ok(Self)
   ```
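   (`VariantType` is a unit struct, so `Self` can be used directly as a value, which is why the shorter form works.)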



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -647,70 +883,57 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, '
     }
 }
 
-impl Array for VariantArray {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn to_data(&self) -> ArrayData {
-        self.inner.to_data()
-    }
-
-    fn into_data(self) -> ArrayData {
-        self.inner.into_data()
-    }
-
-    fn data_type(&self) -> &DataType {
-        self.inner.data_type()
-    }
-
-    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
-        let inner = self.inner.slice(offset, length);
-        let metadata = self.metadata.slice(offset, length);
-        let shredding_state = self.shredding_state.slice(offset, length);
-        Arc::new(Self {
-            inner,
-            metadata,
-            shredding_state,
-        })
-    }
-
-    fn len(&self) -> usize {
-        self.inner.len()
-    }
-
-    fn is_empty(&self) -> bool {
-        self.inner.is_empty()
-    }
-
-    fn offset(&self) -> usize {
-        self.inner.offset()
-    }
-
-    fn nulls(&self) -> Option<&NullBuffer> {
-        self.inner.nulls()
-    }
-
-    fn get_buffer_memory_size(&self) -> usize {
-        self.inner.get_buffer_memory_size()
-    }
+/// Workaround for lack of direct support for BinaryArray
+/// <https://github.com/apache/arrow-rs/issues/8387>
+///
+/// The values are read as
+/// * `StructArray<metadata: Binary, value: Binary>`
+///
+/// but VariantArray needs them as
+/// * `StructArray<metadata: BinaryView, value: BinaryView>`
+///
+/// So cast them to get the right type.
+fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
+    let new_type = map_type(array.data_type());
+    cast(array, &new_type)
+}
 
-    fn get_array_memory_size(&self) -> usize {
-        self.inner.get_array_memory_size()
+/// replaces all instances of Binary with BinaryView in a DataType
+fn map_type(data_type: &DataType) -> DataType {
+    match data_type {
+        DataType::Binary => DataType::BinaryView,
+        DataType::List(field) => {
+            let new_field = field
+                .as_ref()
+                .clone()
+                .with_data_type(map_type(field.data_type()));
+            DataType::List(Arc::new(new_field))
+        }
+        DataType::Struct(fields) => {
+            let new_fields: Fields = fields
+                .iter()
+                .map(|f| {
+                    let new_field = f.as_ref().clone().with_data_type(map_type(f.data_type()));
+                    Arc::new(new_field)
+                })
+                .collect();
+            DataType::Struct(new_fields)

Review Comment:
   You _might_ be able to cheat `fmt` by:
   ```suggestion
               let new_fields = fields.iter().map(|f| {
                   Arc::new(f.as_ref().clone().with_data_type(map_type(f.data_type())))
               });
               DataType::Struct(new_fields.collect())
   ```



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -242,6 +404,67 @@ impl VariantArray {
     pub fn typed_value_field(&self) -> Option<&ArrayRef> {
         self.shredding_state.typed_value_field()
     }
+
+    /// Return a field to represent this VariantArray in a `Schema` with
+    /// a particular name
+    pub fn field(&self, name: impl Into<String>) -> Field {
+        Field::new(
+            name.into(),
+            self.data_type().clone(),
+            self.inner.is_nullable(),
+        )
+        .with_extension_type(VariantType)
+    }
+
+    /// Returns a new DataType representing this VariantArray's inner type
+    pub fn data_type(&self) -> &DataType {
+        self.inner.data_type()
+    }
+
+    pub fn slice(&self, offset: usize, length: usize) -> Self {
+        let inner = self.inner.slice(offset, length);
+        let metadata = self.metadata.slice(offset, length);
+        let shredding_state = self.shredding_state.slice(offset, length);
+        Self {
+            inner,
+            metadata,
+            shredding_state,
+        }
+    }
+
+    pub fn len(&self) -> usize {
+        self.inner.len()
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.inner.is_empty()
+    }
+
+    pub fn nulls(&self) -> Option<&NullBuffer> {
+        self.inner.nulls()
+    }
+
+    /// Is the element at index null?
+    pub fn is_null(&self, index: usize) -> bool {
+        self.nulls().map(|n| n.is_null(index)).unwrap_or_default()

Review Comment:
   See other comment about `map_or`



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -87,20 +88,19 @@ pub(crate) fn follow_shredded_path_element<'a>(
                 return Ok(missing_path_step());
             };
 
-            let field = field
-                .as_any()
-                .downcast_ref::<ShreddedVariantFieldArray>()
-                .ok_or_else(|| {
-                    // TODO: Should we blow up? Or just end the traversal and let the normal
-                    // variant pathing code sort out the mess that it must anyway be
-                    // prepared to handle?
-                    ArrowError::InvalidArgumentError(format!(
-                        "Expected a ShreddedVariantFieldArray, got {:?} 
instead",
-                        field.data_type(),
-                    ))
-                })?;
-
-            Ok(ShreddedPathStep::Success(field.shredding_state()))
+            let struct_array = field.as_struct_opt().ok_or_else(|| {
+                // TODO: Should we blow up? Or just end the traversal and let the normal
+                // variant pathing code sort out the mess that it must anyway be
+                // prepared to handle?
+                ArrowError::InvalidArgumentError(format!(
+                    "Expected Struct array while following path, got {}",
+                    field.data_type(),
+                ))
+            })?;
+
+            let shredding_state = ShreddingState::from(struct_array);
+
+            Ok(ShreddedPathStep::Success(shredding_state))

Review Comment:
   ```suggestion
               Ok(ShreddedPathStep::Success(struct_array.into()))
   ```
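   (The replaced code already calls `ShreddingState::from(struct_array)`, so the `From` impl this `.into()` relies on is known to exist.)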



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to