klion26 commented on code in PR #8438:
URL: https://github.com/apache/arrow-rs/pull/8438#discussion_r2379473713
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -891,28 +864,120 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, '
///
/// So cast them to get the right type.
fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
- let new_type = rewrite_to_view_types(array.data_type());
- cast(array, &new_type)
+ let new_type = canonicalize_and_verify_data_type(array.data_type())?;
+ cast(array, new_type.as_ref())
}
-/// replaces all instances of Binary with BinaryView in a DataType
-fn rewrite_to_view_types(data_type: &DataType) -> DataType {
- match data_type {
- DataType::Binary => DataType::BinaryView,
- DataType::List(field) => DataType::List(rewrite_field_type(field)),
- DataType::Struct(fields) => {
- DataType::Struct(fields.iter().map(rewrite_field_type).collect())
- }
- _ => data_type.clone(),
+/// Validates whether a given arrow decimal is a valid variant decimal
+///
+/// NOTE: By a strict reading of the "decimal table" in the [shredding spec], each decimal type
+/// should have a width-dependent lower bound on precision as well as an upper bound (i.e. Decimal16
+/// with precision 5 is invalid because Decimal4 "covers" it). But the variant shredding integration
+/// tests specifically expect such cases to succeed, so we only enforce the upper bound here.
+///
+/// [shredding spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types
+fn is_valid_variant_decimal(p: &u8, s: &i8, max_precision: u8) -> bool {
+ (1..=max_precision).contains(p) && (0..=*p as i8).contains(s)
+}
+
+/// Recursively visits a data type, ensuring that it only contains data types that can legally
+/// appear in a (possibly shredded) variant array. It also replaces Binary fields with BinaryView,
+/// since that's what comes back from the parquet reader and what the variant code expects to find.
+fn canonicalize_and_verify_data_type(
+ data_type: &DataType,
+) -> Result<Cow<'_, DataType>, ArrowError> {
+ use DataType::*;
+
+ // helper macros
+ macro_rules! fail {
+ () => {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Illegal shredded value type: {data_type}"
+ )))
+ };
}
+ macro_rules! borrow {
+ () => {
+ Cow::Borrowed(data_type)
+ };
+ }
+
+ let new_data_type = match data_type {
Review Comment:
Nice!
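
For context, here is a small self-contained sketch (not part of the PR diff) that copies the `is_valid_variant_decimal` helper from the hunk above and exercises the cases the NOTE describes; the maximum precision of 38 for Decimal16 / Arrow `Decimal128` is an assumption based on the usual Arrow limits.

```rust
/// Copied from the hunk above so the sketch compiles on its own.
fn is_valid_variant_decimal(p: &u8, s: &i8, max_precision: u8) -> bool {
    (1..=max_precision).contains(p) && (0..=*p as i8).contains(s)
}

fn main() {
    // Precision 5 would also fit a narrower decimal, but per the NOTE only
    // the upper bound is enforced, so it is accepted at max_precision = 38.
    assert!(is_valid_variant_decimal(&5, &2, 38));
    // Precision above the width's maximum is rejected.
    assert!(!is_valid_variant_decimal(&40, &2, 38));
    // Scale must lie within 0..=precision.
    assert!(!is_valid_variant_decimal(&10, &-1, 38));
    assert!(!is_valid_variant_decimal(&10, &11, 38));
}
```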
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -796,8 +778,17 @@ impl StructArrayBuilder {
}
/// returns the non-null element at index as a Variant
-fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, '_> {
- match typed_value.data_type() {
+fn typed_value_to_variant<'a>(
+ typed_value: &'a ArrayRef,
+ value: Option<&BinaryViewArray>,
+ index: usize,
+) -> Variant<'a, 'a> {
+ let data_type = typed_value.data_type();
+ if value.is_some_and(|v| !matches!(data_type, DataType::Struct(_)) && v.is_valid(index)) {
Review Comment:
We'll panic here when `data_type` is not `DataType::Struct(_)` and `v.is_valid(index)` is true. Should we also panic when `data_type` is `DataType::Struct(_)` and `v.is_valid(index)` is true?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]