alamb commented on code in PR #8438:
URL: https://github.com/apache/arrow-rs/pull/8438#discussion_r2376876343
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -843,18 +826,6 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index:
usize) -> Variant<'_, '
DataType::Int64 => {
primitive_conversion_single_value!(Int64Type, typed_value, index)
}
- DataType::UInt8 => {
Review Comment:
If I understand this correctly, the point is that since the Variant spec has
no unsigned types, it wouldn't be permissible to shred out such arrow types
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -898,6 +869,14 @@ fn cast_to_binary_view_arrays(array: &dyn Array) ->
Result<ArrayRef, ArrowError>
/// replaces all instances of Binary with BinaryView in a DataType
fn rewrite_to_view_types(data_type: &DataType) -> DataType {
match data_type {
+ // Unsigned integers are not allowed at all
+ DataType::UInt8 | DataType::UInt16 | DataType::UInt32 |
DataType::UInt64 => {
+ panic!("Illegal shredded value type: {data_type:?}");
Review Comment:
this would be a good place to return errors I think
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -898,6 +871,14 @@ fn cast_to_binary_view_arrays(array: &dyn Array) ->
Result<ArrayRef, ArrowError>
/// replaces all instances of Binary with BinaryView in a DataType
fn rewrite_to_view_types(data_type: &DataType) -> DataType {
Review Comment:
YesI agree checking the types up front as part of construction is 💯 and
avoids the potential for errors later on in `value` methods
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -809,17 +800,11 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index:
usize) -> Variant<'_, '
let date = Date32Type::to_naive_date(value);
Variant::from(date)
}
- DataType::FixedSizeBinary(binary_len) => {
+ // 16-byte FixedSizeBinary is always UUID; all others illegal.
+ DataType::FixedSizeBinary(16) => {
let array = typed_value.as_fixed_size_binary();
- // Try to treat 16 byte FixedSizeBinary as UUID
- let value = array.value(index);
- if *binary_len == 16 {
- if let Ok(uuid) = Uuid::from_slice(value) {
- return Variant::from(uuid);
- }
- }
let value = array.value(index);
- Variant::from(value)
+ Uuid::from_slice(value).map_or(Variant::Null, Variant::from)
Review Comment:
Yes, this would be an appropriate use of `unwrap` I think
##########
parquet/tests/variant_integration.rs:
##########
@@ -238,21 +236,19 @@ variant_test_case!(124);
variant_test_case!(125, "Unsupported typed_value type: Struct");
variant_test_case!(126, "Unsupported typed_value type: List(");
// Is an error case (should be failing as the expected error message indicates)
-variant_test_case!(
- 127,
- "Invalid variant data: InvalidArgumentError(\"Received empty bytes\")"
-);
+variant_test_case!(127, "Illegal shredded value type: UInt32");
// Is an error case (should be failing as the expected error message indicates)
+// TODO: Once structs are supported, expect "Invalid variant, non-object value
with shredded fields"
variant_test_case!(128, "Unsupported typed_value type: Struct(");
-variant_test_case!(129, "Invalid variant data: InvalidArgumentError(");
Review Comment:
Confirmed the test says this case should return Variant::Null 👍
https://github.com/apache/parquet-testing/blob/a3d96a65e11e2bbca7d22a894e8313ede90a33a3/shredded_variant/cases.json#L764-L768
##########
parquet/tests/variant_integration.rs:
##########
@@ -238,21 +236,19 @@ variant_test_case!(124);
variant_test_case!(125, "Unsupported typed_value type: Struct");
variant_test_case!(126, "Unsupported typed_value type: List(");
// Is an error case (should be failing as the expected error message indicates)
-variant_test_case!(
- 127,
- "Invalid variant data: InvalidArgumentError(\"Received empty bytes\")"
-);
+variant_test_case!(127, "Illegal shredded value type: UInt32");
// Is an error case (should be failing as the expected error message indicates)
+// TODO: Once structs are supported, expect "Invalid variant, non-object value
with shredded fields"
variant_test_case!(128, "Unsupported typed_value type: Struct(");
-variant_test_case!(129, "Invalid variant data: InvalidArgumentError(");
+variant_test_case!(129);
variant_test_case!(130, "Unsupported typed_value type: Struct(");
variant_test_case!(131);
variant_test_case!(132, "Unsupported typed_value type: Struct(");
variant_test_case!(133, "Unsupported typed_value type: Struct(");
variant_test_case!(134, "Unsupported typed_value type: Struct(");
variant_test_case!(135);
variant_test_case!(136, "Unsupported typed_value type: List(");
-variant_test_case!(137, "Invalid variant data: InvalidArgumentError(");
Review Comment:
The new error seems more like the expected error in cases:
https://github.com/apache/parquet-testing/blob/a3d96a65e11e2bbca7d22a894e8313ede90a33a3/shredded_variant/cases.json#L812-L815
##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -438,26 +438,6 @@ mod test {
numeric_partially_shredded_test!(i64,
partially_shredded_int64_variant_array);
}
- #[test]
- fn get_variant_partially_shredded_uint8_as_variant() {
Review Comment:
I don't think we need to worry too much about it. Let's just makes sure each
error path is hit
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -796,8 +778,17 @@ impl StructArrayBuilder {
}
/// returns the non-null element at index as a Variant
-fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_,
'_> {
- match typed_value.data_type() {
+fn typed_value_to_variant<'a>(
+ typed_value: &'a ArrayRef,
+ value: Option<&BinaryViewArray>,
+ index: usize,
+) -> Variant<'a, 'a> {
+ let data_type = typed_value.data_type();
+ if value.is_some_and(|v| !matches!(data_type, DataType::Struct(_)) &&
v.is_valid(index)) {
+ // Only a partially shredded struct is allowed to have values for both
columns
+ panic!("Invalid variant, conflicting value and typed_value");
Review Comment:
I think adding `try_value` sounds like a good idea to me
However, it seems to me that most of these checks can be done once per array
(e.g. this check for `value` and compare to the datatype doesn't change row by
row, so paying the cost to do the validation on each row feels wasteful to me)
Can we perhaps move this check into the constructor of VariantArray 🤔
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]