Re: [PR] Variant integration fixes [arrow-rs]

via GitHub Wed, 24 Sep 2025 16:26:10 -0700


scovich commented on code in PR #8438:
URL: https://github.com/apache/arrow-rs/pull/8438#discussion_r2377283417



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -891,28 +864,116 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: 
usize) -> Variant<'_, '
 ///
 /// So cast them to get the right type.
 fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef, 
ArrowError> {
-    let new_type = rewrite_to_view_types(array.data_type());
-    cast(array, &new_type)
+    let new_type = canonicalize_and_verify_data_type(array.data_type())?;
+    cast(array, new_type.as_ref())
+}
+
+/// Validates whether a given arrow decimal is a valid variant decimal
+///
+/// NOTE: By a strict reading of the "decimal table" in the [shredding spec], 
each decimal type
+/// should have a lower bound on precision as well as an upper bound (i.e. 
Decimal16 with precision
+/// 5 is invalid because Decimal4 can cover it). But the variant shredding 
integration tests
+/// specifically expect such cases to succeed, so we only enforce the upper 
bound here.
+///
+/// [shredding spec]: 
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types
+fn is_valid_variant_decimal(p: &u8, s: &i8, max_precision: u8) -> bool {
+    *p <= max_precision && (0..=*p as i8).contains(s)
 }
 
 /// replaces all instances of Binary with BinaryView in a DataType
-fn rewrite_to_view_types(data_type: &DataType) -> DataType {
-    match data_type {
-        DataType::Binary => DataType::BinaryView,
-        DataType::List(field) => DataType::List(rewrite_field_type(field)),
-        DataType::Struct(fields) => {
-            DataType::Struct(fields.iter().map(rewrite_field_type).collect())
-        }
-        _ => data_type.clone(),
+fn canonicalize_and_verify_data_type(
+    data_type: &DataType,
+) -> Result<Cow<'_, DataType>, ArrowError> {
+    use DataType::*;
+
+    // helper macros
+    macro_rules! fail {
+        () => {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Illegal shredded value type: {data_type}"
+            )))
+        };
     }
+    macro_rules! borrow {
+        () => {
+            Cow::Borrowed(data_type)
+        };
+    }
+
+    let new_data_type = match data_type {
+        // Primitive arrow types that have a direct variant counterpart are 
allowed
+        Null | Boolean => borrow!(),
+        Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => borrow!(),
+
+        // Unsigned integers are not allowed at all
+        UInt8 | UInt16 | UInt32 | UInt64 | Float16 => fail!(),
+
+        // Most decimal types are allowed, with restrictions on precision and 
scale
+        Decimal32(p, s) if is_valid_variant_decimal(p, s, 9) => borrow!(),
+        Decimal64(p, s) if is_valid_variant_decimal(p, s, 18) => borrow!(),
+        Decimal128(p, s) if is_valid_variant_decimal(p, s, 38) => borrow!(),
+        Decimal32(..) | Decimal64(..) | Decimal128(..) | Decimal256(..) => 
fail!(),
+
+        // Only micro and nano timestamps are supported
+        Timestamp(TimeUnit::Microsecond | TimeUnit::Nanosecond, _) => 
borrow!(),
+        Timestamp(TimeUnit::Millisecond | TimeUnit::Second, _) => fail!(),
+
+        // Only 32-bit dates and 64-bit microsecond time are supported.
+        Date32 | Time64(TimeUnit::Microsecond) => borrow!(),
+        Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(),
+
+        // Binary and string are allowed.
+        // NOTE: We force Binary to BinaryView because that's what the parquet 
reader returns.
+        Binary => Cow::Owned(DataType::BinaryView),
+        BinaryView | Utf8 => borrow!(),
+
+        // UUID maps to 16-byte fixed-size binary; no other width is allowed
+        FixedSizeBinary(16) => borrow!(),
+        FixedSizeBinary(_) | FixedSizeList(..) => fail!(),
+
+        // We can _possibly_ support (some of) these some day?
+        LargeBinary | LargeUtf8 | Utf8View | ListView(_) | LargeList(_) | 
LargeListView(_) => {

Review Comment:
   It's unclear to me what leeway writers have in producing different physical 
forms of the same logical data. Not just large vs. normal offsets vs. view, but 
also layout optimizations like dictionary or run-end coding?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Variant integration fixes [arrow-rs]

Reply via email to