scovich commented on code in PR #9610:
URL: https://github.com/apache/arrow-rs/pull/9610#discussion_r3064489273


##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -41,6 +41,31 @@ use parquet_variant::{
 use std::borrow::Cow;
 use std::sync::Arc;
 
+/// Returns the raw bytes at the given index from a binary-like array, return 
`None` if the array isn't binary-like.
+pub(crate) fn binary_array_value(array: &dyn Array, index: usize) -> 
Option<&[u8]> {
+    match array.data_type() {
+        DataType::Binary => Some(array.as_binary::<i32>().value(index)),
+        DataType::LargeBinary => Some(array.as_binary::<i64>().value(index)),
+        DataType::BinaryView => Some(array.as_binary_view().value(index)),
+        _ => None,
+    }
+}
+
+/// Validates that an array has a binary-like data type.
+fn validate_binary_array(array: &dyn Array, field_name: &str) -> Result<()> {
+    if matches!(
+        array.data_type(),
+        DataType::Binary | DataType::LargeBinary | DataType::BinaryView
+    ) {
+        Ok(())
+    } else {
+        Err(ArrowError::InvalidArgumentError(format!(
+            "VariantArray '{field_name}' field must be Binary, LargeBinary, or 
BinaryView, got {}",
+            array.data_type()
+        )))
+    }

Review Comment:
   nit: Simpler as an actual `match`?
   
   ```suggestion
       match array.data_type() {
           DataType::Binary | DataType::LargeBinary | DataType::BinaryView => 
Ok(()),
           _ => Err(ArrowError::InvalidArgumentError(format!(
               "VariantArray '{field_name}' field must be Binary, LargeBinary, 
or BinaryView, got {}",
               array.data_type()
           )))
       }
   ```



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -375,7 +392,20 @@ impl VariantArray {
             }
             // Otherwise fall back to value, if available
             (_, Some(value)) if value.is_valid(index) => {
-                Ok(Variant::new(self.metadata.value(index), 
value.value(index)))
+                let metadata =
+                    binary_array_value(self.metadata.as_ref(), 
index).ok_or_else(|| {
+                        ArrowError::InvalidArgumentError(format!(
+                            "metadata field must be a binary-like array, 
instead got {}",
+                            self.metadata.data_type(),
+                        ))
+                    })?;
+                let value = binary_array_value(value.as_ref(), 
index).ok_or_else(|| {
+                    ArrowError::InvalidArgumentError(format!(
+                        "value field must be a binary-like array, instead got 
{}",
+                        value.data_type(),
+                    ))
+                })?;
+                Ok(Variant::new(metadata, value))

Review Comment:
   This matches the pattern I flagged in unit tests earlier. Should we consider 
adding a new `Variant::try_new_from_arrays_at` constructor instead of the test 
helper I had suggested there?



##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -96,7 +96,7 @@ pub fn shred_variant(array: &VariantArray, as_type: 
&DataType) -> Result<Variant
     let (value, typed_value, nulls) = builder.finish()?;
     Ok(VariantArray::from_parts(
         array.metadata_field().clone(),
-        Some(value),
+        Some(Arc::new(value) as ArrayRef),

Review Comment:
   nit: I'm pretty sure the cast is unnecessary (automatic coercion by compiler)
   
   (several more below)



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -268,9 +268,25 @@ fn try_perfect_shredding(variant_array: &VariantArray, 
as_field: &Field) -> Opti
         // 2. If every row in the `value` column is null
 
         // This is a perfect shredding, where the value is entirely shredded 
out,
-        // so we can just return the typed value.
-        return Some(typed_value.clone());
+        // so we can just return the typed value after merging the accumulated 
nulls.

Review Comment:
   Yelp! This is the bug you mentioned finding? We may want to consider merging 
the fix+test as a separate PR, since unrelated to binary-like arrays?



##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -1406,7 +1434,10 @@ mod tests {
         assert!(value.is_valid(3));
         assert!(typed_value.is_null(3));
         assert_eq!(
-            Variant::new(metadata.value(3), value.value(3)),
+            Variant::new(
+                binary_array_value(metadata.as_ref(), 3).unwrap(),
+                binary_array_value(value.as_ref(), 3).unwrap()
+            ),

Review Comment:
   There are a lot of these, spread across several test modules. Do you think 
it would be helpful to define a test helper function that takes a pair of 
arrays (applying deref) plus an index, and encapsulates the binary array 
value+unwrap calls?
   ```rust
   assert_eq!(
       variant_from_arrays_at(metadata, value, 3), 
       Ok(Variant::Null)
   );
   ```



##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -1952,7 +1981,7 @@ mod tests {
                                     assert!(score_value.is_valid(i));
                                     assert_eq!(
                                         expected_score_value,
-                                        get_value(i, metadata, score_value)
+                                        get_value(i, metadata.as_ref(), 
score_value.as_ref())

Review Comment:
   Does deref coercion somehow not kick in here? 
   `metadata: &Arc<dyn Array>` (returned by `VariantArray::metadata_field`) 
should coerce automatically to `&dyn Array`.



##########
parquet-variant-compute/src/unshred_variant.rs:
##########
@@ -347,7 +360,17 @@ trait AppendToVariantBuilder: Array {
 macro_rules! handle_unshredded_case {
     ($self:expr, $builder:expr, $metadata:expr, $index:expr, 
$partial_shredding:expr) => {{
         let value = $self.value.as_ref().filter(|v| v.is_valid($index));
-        let value = value.map(|v| 
Variant::new_with_metadata($metadata.clone(), v.value($index)));
+        let value = value
+            .map(|v| {
+                let bytes = binary_array_value(v.as_ref(), 
$index).ok_or_else(|| {
+                    ArrowError::InvalidArgumentError(format!(
+                        "value field must be a binary-like array, instead got 
{}",
+                        v.data_type(),
+                    ))
+                })?;
+                Result::Ok(Variant::new_with_metadata($metadata.clone(), 
bytes))
+            })
+            .transpose()?;

Review Comment:
   Out of curiosity, why not just `return` the error directly? What does the 
map+transpose pair accomplish?



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -860,15 +895,10 @@ impl<'a> TryFrom<&'a StructArray> for 
BorrowedShreddingState<'a> {
     type Error = ArrowError;
 
     fn try_from(inner_struct: &'a StructArray) -> Result<Self> {
-        // The `value` column need not exist, but if it does it must be a 
binary view.
+        // The `value` column need not exist, but if it does it must be a 
binary type.
         let value = if let Some(value_col) = 
inner_struct.column_by_name("value") {
-            let Some(binary_view) = value_col.as_binary_view_opt() else {
-                return Err(ArrowError::NotYetImplemented(format!(
-                    "VariantArray 'value' field must be BinaryView, got {}",
-                    value_col.data_type()
-                )));
-            };
-            Some(binary_view)
+            validate_binary_array(value_col.as_ref(), "value")?;

Review Comment:
   Nice side effect, seeing this repeated code pattern simplified and 
encapsulated!



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -375,7 +392,20 @@ impl VariantArray {
             }
             // Otherwise fall back to value, if available
             (_, Some(value)) if value.is_valid(index) => {
-                Ok(Variant::new(self.metadata.value(index), 
value.value(index)))
+                let metadata =
+                    binary_array_value(self.metadata.as_ref(), 
index).ok_or_else(|| {
+                        ArrowError::InvalidArgumentError(format!(
+                            "metadata field must be a binary-like array, 
instead got {}",
+                            self.metadata.data_type(),
+                        ))
+                    })?;
+                let value = binary_array_value(value.as_ref(), 
index).ok_or_else(|| {
+                    ArrowError::InvalidArgumentError(format!(
+                        "value field must be a binary-like array, instead got 
{}",
+                        value.data_type(),
+                    ))
+                })?;
+                Ok(Variant::new(metadata, value))

Review Comment:
   Oh... arrow-array is not available in the parquet-variant crate, and we 
can't define new constructors on `Variant` in this parquet-variant-compute 
crate. So we'd need to keep the stand-alone `variant_from_arrays_at` helper 
after all, unless we got fancy with extension traits.



##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -1952,7 +1981,7 @@ mod tests {
                                     assert!(score_value.is_valid(i));
                                     assert_eq!(
                                         expected_score_value,
-                                        get_value(i, metadata, score_value)
+                                        get_value(i, metadata.as_ref(), 
score_value.as_ref())

Review Comment:
   Ah... that only works for concrete types, [not dyn trait 
refs](https://play.rust-lang.org/?version=stable&mode=debug&edition=2024&gist=266c70aaebf3d5e2ad1ebf9fa1bc07ff).
 Blech.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to