klion26 commented on code in PR #8438:
URL: https://github.com/apache/arrow-rs/pull/8438#discussion_r2379473713


##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -891,28 +864,120 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: 
usize) -> Variant<'_, '
 ///
 /// So cast them to get the right type.
 fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef, 
ArrowError> {
-    let new_type = rewrite_to_view_types(array.data_type());
-    cast(array, &new_type)
+    let new_type = canonicalize_and_verify_data_type(array.data_type())?;
+    cast(array, new_type.as_ref())
 }
 
-/// replaces all instances of Binary with BinaryView in a DataType
-fn rewrite_to_view_types(data_type: &DataType) -> DataType {
-    match data_type {
-        DataType::Binary => DataType::BinaryView,
-        DataType::List(field) => DataType::List(rewrite_field_type(field)),
-        DataType::Struct(fields) => {
-            DataType::Struct(fields.iter().map(rewrite_field_type).collect())
-        }
-        _ => data_type.clone(),
+/// Validates whether a given arrow decimal is a valid variant decimal
+///
+/// NOTE: By a strict reading of the "decimal table" in the [shredding spec], 
each decimal type
+/// should have a width-dependent lower bound on precision as well as an upper 
bound (i.e. Decimal16
+/// with precision 5 is invalid because Decimal4 "covers" it). But the variant 
shredding integration
+/// tests specifically expect such cases to succeed, so we only enforce the 
upper bound here.
+///
+/// [shredding spec]: 
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types
+fn is_valid_variant_decimal(p: &u8, s: &i8, max_precision: u8) -> bool {
+    (1..=max_precision).contains(p) && (0..=*p as i8).contains(s)
+}
+
+/// Recursively visits a data type, ensuring that it only contains data types 
that can legally
+/// appear in a (possibly shredded) variant array. It also replaces Binary 
fields with BinaryView,
+/// since that's what comes back from the parquet reader and what the variant 
code expects to find.
+fn canonicalize_and_verify_data_type(
+    data_type: &DataType,
+) -> Result<Cow<'_, DataType>, ArrowError> {
+    use DataType::*;
+
+    // helper macros
+    macro_rules! fail {
+        () => {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Illegal shredded value type: {data_type}"
+            )))
+        };
     }
+    macro_rules! borrow {
+        () => {
+            Cow::Borrowed(data_type)
+        };
+    }
+
+    let new_data_type = match data_type {

Review Comment:
   Nice!



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -796,8 +778,17 @@ impl StructArrayBuilder {
 }
 
 /// returns the non-null element at index as a Variant
-fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, 
'_> {
-    match typed_value.data_type() {
+fn typed_value_to_variant<'a>(
+    typed_value: &'a ArrayRef,
+    value: Option<&BinaryViewArray>,
+    index: usize,
+) -> Variant<'a, 'a> {
+    let data_type = typed_value.data_type();
+    if value.is_some_and(|v| !matches!(data_type, DataType::Struct(_)) && 
v.is_valid(index)) {

Review Comment:
   We'll panic here if (`data_type` is not `DataType::Struct(_)`) and 
(`v.is_valid(index)`), do we need to panic if `data_type` is `DataType::Struct` 
and `v.is_valid(index)` here?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to