alamb commented on code in PR #8021: URL: https://github.com/apache/arrow-rs/pull/8021#discussion_r2254333332
########## parquet-variant-compute/src/variant_array.rs: ########## @@ -79,12 +143,22 @@ impl VariantArray { /// # Errors: /// - If the `StructArray` does not contain the required fields /// - /// # Current support - /// This structure does not (yet) support the full Arrow Variant Array specification. + /// # Requirements of the `StructArray` + /// + /// 1. A required field named `metadata` which is binary, large_binary, or + /// binary_view /// - /// Only `StructArrays` with `metadata` and `value` fields that are - /// [`BinaryViewArray`] are supported. Shredded values are not currently supported - /// nor are using types other than `BinaryViewArray` + /// 2. An optional field named `value` that is binary, large_binary, or + /// binary_view + /// + /// 3. An optional field named `typed_value` which can be any primitive type + /// or be a list, large_list, list_view or struct + /// + /// NOTE: It is also permissible for the metadata field to be + /// Dictionary-Encoded, preferably (but not required) with an index type of + /// int8. + /// + /// Currently, only [`BinaryViewArray`] are supported. Review Comment: I don't think so -- thank you ########## parquet-variant-compute/src/variant_get/mod.rs: ########## @@ -0,0 +1,431 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +use arrow::{ + array::{Array, ArrayRef}, + compute::CastOptions, + error::Result, +}; +use arrow_schema::{ArrowError, FieldRef}; +use parquet_variant::VariantPath; + +use crate::variant_array::ShreddingState; +use crate::variant_get::output::instantiate_output_builder; +use crate::VariantArray; + +mod output; + +/// Returns an array with the specified path extracted from the variant values. +/// +/// The return array type depends on the `as_type` field of the options parameter +/// 1. `as_type: None`: a VariantArray is returned. The values in this new VariantArray will point +/// to the specified path. +/// 2. `as_type: Some(<specific field>)`: an array of the specified type is returned. +pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result<ArrayRef> { + let variant_array: &VariantArray = input.as_any().downcast_ref().ok_or_else(|| { + ArrowError::InvalidArgumentError( + "expected a VariantArray as the input for variant_get".to_owned(), + ) + })?; + + // Create the output writer based on the specified output options + let output_builder = instantiate_output_builder(options.clone())?; + + // Dispatch based on the shredding state of the input variant array + // TODO make this an enum on VariantArray (e.g ShreddingState) Review Comment: good call. Removed ########## parquet-variant-compute/src/from_json.rs: ########## @@ -69,8 +69,8 @@ mod test { let array_ref: ArrayRef = Arc::new(input); let variant_array = batch_json_string_to_variant(&array_ref).unwrap(); - let metadata_array = variant_array.metadata_field().as_binary_view(); - let value_array = variant_array.value_field().as_binary_view(); + let metadata_array = variant_array.metadata_field(); + let value_array = variant_array.value_field().expect("value field"); Review Comment: You are right that `expect` causes a panic. However I think this code is part of the test `test_batch_json_string_to_variant` so it is probably ok to panic if something changes ########## parquet-variant-compute/src/variant_array.rs: ########## @@ -93,35 +167,64 @@ impl VariantArray { "Invalid VariantArray: requires StructArray as input".to_string(), )); }; - // Ensure the StructArray has a metadata field of BinaryView - let Some(metadata_field) = VariantArray::find_metadata_field(inner) else { + // Note the specification allows for any order so we must search by name + + // Ensure the StructArray has a metadata field of BinaryView + let Some(metadata_field) = inner.column_by_name("metadata") else { return Err(ArrowError::InvalidArgumentError( "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(), )); }; - if metadata_field.data_type() != &DataType::BinaryView { + let Some(metadata) = metadata_field.as_binary_view_opt() else { return Err(ArrowError::NotYetImplemented(format!( "VariantArray 'metadata' field must be BinaryView, got {}", metadata_field.data_type() ))); - } - let Some(value_field) = VariantArray::find_value_field(inner) else { - return Err(ArrowError::InvalidArgumentError( - "Invalid VariantArray: StructArray must contain a 'value' field".to_string(), - )); }; - if value_field.data_type() != &DataType::BinaryView { - return Err(ArrowError::NotYetImplemented(format!( - "VariantArray 'value' field must be BinaryView, got {}", - value_field.data_type() - ))); - } + + // Find the value field, if present + let value_field = inner.column_by_name("value"); + let value = value_field + .map(|v| match v.as_binary_view_opt() { + Some(bv) => Ok(bv), + None => Err(ArrowError::NotYetImplemented(format!( + "VariantArray 'value' field must be BinaryView, got {}", + v.data_type() + ))), + }) + .transpose()?; + + // Find the typed_value field, if present + let typed_value = inner.column_by_name("typed_value"); + + // Note these clones are cheap, they just bump the ref count + let inner = inner.clone(); + let metadata = metadata.clone(); + let value = value.cloned(); + let typed_value = typed_value.cloned(); + + let shredding_state = match (metadata, value, typed_value) { + (metadata, Some(value), Some(typed_value)) => ShreddingState::PartiallyShredded { + metadata, + value, + typed_value, + }, + (metadata, Some(value), None) => ShreddingState::Unshredded { metadata, value }, + (metadata, None, Some(typed_value)) => ShreddingState::FullyShredded { + metadata, + typed_value, + }, + (_metadata_field, None, None) => { + return Err(ArrowError::InvalidArgumentError(String::from( + "VariantArray has neither value nor typed_value field", + ))); + } + }; Review Comment: Good idea -- done in 924155ca68 ########## parquet-variant-compute/src/variant_array.rs: ########## @@ -186,13 +340,11 @@ impl Array for VariantArray { } fn slice(&self, offset: usize, length: usize) -> ArrayRef { - let slice = self.inner.slice(offset, length); - let met = self.metadata_ref.slice(offset, length); - let val = self.value_ref.slice(offset, length); + let inner = self.inner.slice(offset, length); + let shredding_state = self.shredding_state.slice(offset, length); Arc::new(Self { Review Comment: We could definitely add a fast path for slicing the whole array. I am not quite sure what ShreddingState::All and ShreddingState::Mixed is supposed to represent ########## parquet-variant-compute/src/variant_get/output/primitive.rs: ########## @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::variant_get::output::OutputBuilder; +use crate::VariantArray; +use arrow::error::Result; + +use arrow::array::{ + Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, NullBufferBuilder, + PrimitiveArray, +}; +use arrow::compute::{cast_with_options, CastOptions}; +use arrow::datatypes::Int32Type; +use arrow_schema::{ArrowError, FieldRef}; +use parquet_variant::{Variant, VariantPath}; +use std::marker::PhantomData; +use std::sync::Arc; + +/// Trait for Arrow primitive types that can be used in the output builder +/// +/// This just exists to add a generic way to convert from Variant to the primitive type +pub(super) trait ArrowPrimitiveVariant: ArrowPrimitiveType { + /// Try to extract the primitive value from a Variant, returning None if it + /// cannot be converted + /// + /// TODO: figure out how to handle coercion/casting + fn from_variant(variant: &Variant) -> Option<Self::Native>; +} + +/// Outputs Primitive arrays +pub(super) struct PrimitiveOutputBuilder<'a, T: ArrowPrimitiveVariant> { + /// What path to extract + path: VariantPath<'a>, + /// Returned output type + as_type: FieldRef, + /// Controls the casting behavior (e.g. error vs substituting null on cast error). + cast_options: CastOptions<'a>, + /// Phantom data for the primitive type + _phantom: PhantomData<T>, +} + +impl<'a, T: ArrowPrimitiveVariant> PrimitiveOutputBuilder<'a, T> { + pub(super) fn new( + path: VariantPath<'a>, + as_type: FieldRef, + cast_options: CastOptions<'a>, + ) -> Self { + Self { + path, + as_type, + cast_options, + _phantom: PhantomData, + } + } +} + +impl<'a, T: ArrowPrimitiveVariant> OutputBuilder for PrimitiveOutputBuilder<'a, T> { + fn partially_shredded( + &self, + variant_array: &VariantArray, + _metadata: &BinaryViewArray, + _value_field: &BinaryViewArray, + typed_value: &ArrayRef, + ) -> arrow::error::Result<ArrayRef> { + // build up the output array element by element + let mut nulls = NullBufferBuilder::new(variant_array.len()); + let mut values = Vec::with_capacity(variant_array.len()); + let typed_value = + cast_with_options(typed_value, self.as_type.data_type(), &self.cast_options)?; + // downcast to the primitive array (e.g. Int32Array, Float64Array, etc) + let typed_value = typed_value.as_primitive::<T>(); + + for i in 0..variant_array.len() { + if variant_array.is_null(i) { + nulls.append_null(); + values.push(T::default_value()); // not used, placeholder + continue; + } + + // if the typed value is null, decode the variant and extract the value + if typed_value.is_null(i) { + // todo follow path Review Comment: I think you are probably right that the path handling should be handled elsewhere I haven't worked out exactly how extracting paths would be implemented. Maybe that would be a good follow on PR -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org