This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 4676c06686 [Variant] Take top-level nulls into consideration when
extracting perfectly shredded children (#9702)
4676c06686 is described below
commit 4676c06686a5be476b5cd632493a73c803d64a04
Author: Adam Gutglick <[email protected]>
AuthorDate: Thu Apr 16 14:03:03 2026 +0100
[Variant] Take top-level nulls into consideration when extracting perfectly
shredded children (#9702)
# Which issue does this PR close?
- Closes #9701
# Rationale for this change
Fixes a correctness issue, where top-level nullability will be dropped
in these cases.
Its important to note that due to the current canonicalization behavior,
some types (like `Binary`) actually do behave correctly, this will be
fully addressed in #9610 where we can support more underlying types,
which simplifies it significantly.
# What changes are included in this PR?
Union the nullability buffers of perfectly shredded variant children
with the array's top-level nullability.
# Are these changes tested?
In addition to existing tests, add tests that verify that the nulls are
applied, both when the child is has no-nulls and when it does.
# Are there any user-facing changes?
Fixes incorrect behavior
---
parquet-variant-compute/src/variant_get.rs | 155 ++++++++++++++++++++++++++++-
1 file changed, 151 insertions(+), 4 deletions(-)
diff --git a/parquet-variant-compute/src/variant_get.rs
b/parquet-variant-compute/src/variant_get.rs
index 1b4b3354e2..e95c774f29 100644
--- a/parquet-variant-compute/src/variant_get.rs
+++ b/parquet-variant-compute/src/variant_get.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
use arrow::{
- array::{self, Array, ArrayRef, StructArray},
+ array::{self, Array, ArrayRef, StructArray, make_array},
buffer::NullBuffer,
compute::CastOptions,
datatypes::Field,
@@ -261,8 +261,24 @@ fn try_perfect_shredding(variant_array: &VariantArray,
as_field: &Field) -> Opti
// 2. If every row in the `value` column is null
// This is a perfect shredding, where the value is entirely shredded
out,
- // so we can just return the typed value.
- return Some(typed_value.clone());
+ // so we can just return the typed value after merging the accumulated
nulls.
+ let parent_nulls = variant_array.nulls();
+
+ // If we have no nulls OR the shredded array is `Null`, which doesn't
support external nulls.
+ let target_array = if parent_nulls.is_none() ||
typed_value.data_type().is_null() {
+ typed_value.clone()
+ } else {
+ let merged_nulls = NullBuffer::union(parent_nulls,
typed_value.nulls());
+ let data = typed_value
+ .to_data()
+ .into_builder()
+ .nulls(merged_nulls)
+ .build()
+ .ok()?;
+ make_array(data)
+ };
+
+ return Some(target_array);
}
None
@@ -347,7 +363,7 @@ mod test {
Date64Array, Decimal32Array, Decimal64Array, Decimal128Array,
Decimal256Array,
Float32Array, Float64Array, Int8Array, Int16Array, Int32Array,
Int64Array,
LargeBinaryArray, LargeListArray, LargeListViewArray,
LargeStringArray, ListArray,
- ListViewArray, NullBuilder, StringArray, StringViewArray, StructArray,
+ ListViewArray, NullArray, NullBuilder, StringArray, StringViewArray,
StructArray,
Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray,
Time64NanosecondArray,
};
use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
@@ -4331,4 +4347,135 @@ mod test {
);
}
}
+
+ macro_rules! perfectly_shredded_preserves_top_level_nulls_test {
+ ($name:ident, $result_type:expr, $typed_value:expr,
$expected_array:expr) => {
+ perfectly_shredded_preserves_top_level_nulls_test!(
+ $name,
+ $result_type,
+ $typed_value,
+ Some(NullBuffer::from(vec![true, false, true])),
+ $expected_array
+ );
+ };
+ ($name:ident, $result_type:expr, $typed_value:expr,
$parent_nulls:expr, $expected_array:expr) => {
+ #[test]
+ fn $name() {
+ let metadata =
Arc::new(BinaryViewArray::from_iter_values(std::iter::repeat_n(
+ EMPTY_VARIANT_METADATA_BYTES,
+ 3,
+ )));
+ let typed_value: ArrayRef = Arc::new($typed_value);
+ let variant_array: ArrayRef =
+ VariantArray::from_parts(metadata, None,
Some(typed_value), $parent_nulls)
+ .into();
+
+ let result = variant_get(
+ &variant_array,
+
GetOptions::new().with_as_type(Some(FieldRef::from(Field::new(
+ "result",
+ $result_type,
+ true,
+ )))),
+ )
+ .unwrap();
+
+ let expected_array: ArrayRef = Arc::new($expected_array);
+ assert_eq!(&result, &expected_array);
+ }
+ };
+ }
+
+ perfectly_shredded_preserves_top_level_nulls_test!(
+ test_variant_get_perfectly_shredded_integer_preserves_top_level_nulls,
+ DataType::Int32,
+ Int32Array::from(vec![Some(0_i32), Some(1_i32), Some(2_i32)]),
+ Int32Array::from(vec![Some(0_i32), None, Some(2_i32)])
+ );
+
+ perfectly_shredded_preserves_top_level_nulls_test!(
+
test_variant_get_perfectly_shredded_integer_unions_child_and_top_level_nulls,
+ DataType::Int32,
+ Int32Array::from(vec![None, Some(1_i32), Some(2_i32)]),
+ Some(NullBuffer::from(vec![true, false, true])),
+ Int32Array::from(vec![None, None, Some(2_i32)])
+ );
+
+ perfectly_shredded_preserves_top_level_nulls_test!(
+ test_variant_get_perfectly_shredded_null_preserves_top_level_nulls,
+ DataType::Null,
+ NullArray::new(3),
+ NullArray::new(3)
+ );
+
+ perfectly_shredded_preserves_top_level_nulls_test!(
+
test_variant_get_perfectly_shredded_binary_view_preserves_top_level_nulls,
+ DataType::BinaryView,
+ BinaryViewArray::from(vec![
+ Some(b"Apache" as &[u8]),
+ Some(b"masked-null" as &[u8]),
+ Some(b"Parquet-variant" as &[u8]),
+ ]),
+ BinaryViewArray::from(vec![
+ Some(b"Apache" as &[u8]),
+ None,
+ Some(b"Parquet-variant" as &[u8]),
+ ])
+ );
+
+ perfectly_shredded_preserves_top_level_nulls_test!(
+ test_variant_get_perfectly_shredded_binary_preserves_top_level_nulls,
+ DataType::Binary,
+ BinaryArray::from(vec![
+ Some(b"Apache" as &[u8]),
+ Some(b"masked-null" as &[u8]),
+ Some(b"Parquet-variant" as &[u8]),
+ ]),
+ BinaryArray::from(vec![
+ Some(b"Apache" as &[u8]),
+ None,
+ Some(b"Parquet-variant" as &[u8]),
+ ])
+ );
+
+ perfectly_shredded_preserves_top_level_nulls_test!(
+ test_variant_get_perfectly_shredded_decimal4_preserves_top_level_nulls,
+ DataType::Decimal32(5, 2),
+ Decimal32Array::from(vec![Some(12345), Some(23400), Some(-12342)])
+ .with_precision_and_scale(5, 2)
+ .unwrap(),
+ Decimal32Array::from(vec![Some(12345), None, Some(-12342)])
+ .with_precision_and_scale(5, 2)
+ .unwrap()
+ );
+
+ perfectly_shredded_preserves_top_level_nulls_test!(
+ test_variant_get_perfectly_shredded_decimal8_preserves_top_level_nulls,
+ DataType::Decimal64(10, 1),
+ Decimal64Array::from(vec![Some(1234567809), Some(1456787000),
Some(-1234561203)])
+ .with_precision_and_scale(10, 1)
+ .unwrap(),
+ Decimal64Array::from(vec![Some(1234567809), None, Some(-1234561203)])
+ .with_precision_and_scale(10, 1)
+ .unwrap()
+ );
+
+ perfectly_shredded_preserves_top_level_nulls_test!(
+
test_variant_get_perfectly_shredded_decimal16_preserves_top_level_nulls,
+ DataType::Decimal128(20, 3),
+ Decimal128Array::from(vec![
+ Some(i128::from_str("12345678901234567899").unwrap()),
+ Some(i128::from_str("23445677483748324300").unwrap()),
+ Some(i128::from_str("-12345678901234567899").unwrap()),
+ ])
+ .with_precision_and_scale(20, 3)
+ .unwrap(),
+ Decimal128Array::from(vec![
+ Some(i128::from_str("12345678901234567899").unwrap()),
+ None,
+ Some(i128::from_str("-12345678901234567899").unwrap()),
+ ])
+ .with_precision_and_scale(20, 3)
+ .unwrap()
+ );
}