This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 83946594d6 [Variant] Fix NULL handling for shredded object fields
(#8395)
83946594d6 is described below
commit 83946594d6ebfb4f1d62353e6caa7bd5566c01b4
Author: Ryan Johnson <[email protected]>
AuthorDate: Fri Sep 19 18:18:32 2025 -0600
[Variant] Fix NULL handling for shredded object fields (#8395)
# Which issue does this PR close?
- Fast-follow for https://github.com/apache/arrow-rs/pull/8366
- Related to https://github.com/apache/arrow-rs/pull/8392
# Rationale for this change
Somehow, https://github.com/apache/arrow-rs/pull/8392 exposes a latent
bug in https://github.com/apache/arrow-rs/pull/8366, which has improper
NULL handling for shredded object fields. The shredding PR originally
attempted to handle this case, but somehow the test did not trigger the
bug and so the (admittedly incomplete) code was removed. See
https://github.com/apache/arrow-rs/pull/8366#discussion_r2357552451. To
be honest, I have no idea how the original ever worked correctly, nor
why the new PR is able to expose the problem.
# What changes are included in this PR?
When used as a top-level builder,
`VariantToShreddedVariantRowBuilder::append_null` must append NULL to
its own `NullBufferBuilder`; but when used as a shredded object field
builder, it must append non-NULL. Plumb a new `top_level` parameter
through the various functions and into the two sub-builders it relies
on, so they can implement the correct semantics.
# Are these changes tested?
In theory, yes (I don't know how the object shredding test ever passed).
And it fixes the breakage in
https://github.com/apache/arrow-rs/pull/8392.
# Are there any user-facing changes?
No
---
parquet-variant-compute/src/shred_variant.rs | 40 ++++++++++++++++++++++------
1 file changed, 32 insertions(+), 8 deletions(-)
diff --git a/parquet-variant-compute/src/shred_variant.rs
b/parquet-variant-compute/src/shred_variant.rs
index 9b517c0346..aea36266e8 100644
--- a/parquet-variant-compute/src/shred_variant.rs
+++ b/parquet-variant-compute/src/shred_variant.rs
@@ -76,8 +76,12 @@ pub fn shred_variant(array: &VariantArray, as_type:
&DataType) -> Result<Variant
};
let cast_options = CastOptions::default();
- let mut builder =
- make_variant_to_shredded_variant_arrow_row_builder(as_type,
&cast_options, array.len())?;
+ let mut builder = make_variant_to_shredded_variant_arrow_row_builder(
+ as_type,
+ &cast_options,
+ array.len(),
+ true,
+ )?;
for i in 0..array.len() {
if array.is_null(i) {
builder.append_null()?;
@@ -98,11 +102,16 @@ pub(crate) fn
make_variant_to_shredded_variant_arrow_row_builder<'a>(
data_type: &'a DataType,
cast_options: &'a CastOptions,
capacity: usize,
+ top_level: bool,
) -> Result<VariantToShreddedVariantRowBuilder<'a>> {
let builder = match data_type {
DataType::Struct(fields) => {
- let typed_value_builder =
- VariantToShreddedObjectVariantRowBuilder::try_new(fields,
cast_options, capacity)?;
+ let typed_value_builder =
VariantToShreddedObjectVariantRowBuilder::try_new(
+ fields,
+ cast_options,
+ capacity,
+ top_level,
+ )?;
VariantToShreddedVariantRowBuilder::Object(typed_value_builder)
}
DataType::List(_)
@@ -118,7 +127,7 @@ pub(crate) fn
make_variant_to_shredded_variant_arrow_row_builder<'a>(
let builder =
make_primitive_variant_to_arrow_row_builder(data_type,
cast_options, capacity)?;
let typed_value_builder =
- VariantToShreddedPrimitiveVariantRowBuilder::new(builder,
capacity);
+ VariantToShreddedPrimitiveVariantRowBuilder::new(builder,
capacity, top_level);
VariantToShreddedVariantRowBuilder::Primitive(typed_value_builder)
}
};
@@ -160,21 +169,26 @@ pub(crate) struct
VariantToShreddedPrimitiveVariantRowBuilder<'a> {
value_builder: VariantValueArrayBuilder,
typed_value_builder: PrimitiveVariantToArrowRowBuilder<'a>,
nulls: NullBufferBuilder,
+ top_level: bool,
}
impl<'a> VariantToShreddedPrimitiveVariantRowBuilder<'a> {
pub(crate) fn new(
typed_value_builder: PrimitiveVariantToArrowRowBuilder<'a>,
capacity: usize,
+ top_level: bool,
) -> Self {
Self {
value_builder: VariantValueArrayBuilder::new(capacity),
typed_value_builder,
nulls: NullBufferBuilder::new(capacity),
+ top_level,
}
}
fn append_null(&mut self) -> Result<()> {
- self.nulls.append_null();
+ // Only the top-level struct that represents the variant can be
nullable; object fields and
+ // array elements are non-nullable.
+ self.nulls.append(!self.top_level);
self.value_builder.append_null();
self.typed_value_builder.append_null()
}
@@ -201,15 +215,22 @@ pub(crate) struct
VariantToShreddedObjectVariantRowBuilder<'a> {
typed_value_builders: IndexMap<&'a str,
VariantToShreddedVariantRowBuilder<'a>>,
typed_value_nulls: NullBufferBuilder,
nulls: NullBufferBuilder,
+ top_level: bool,
}
impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
- fn try_new(fields: &'a Fields, cast_options: &'a CastOptions, capacity:
usize) -> Result<Self> {
+ fn try_new(
+ fields: &'a Fields,
+ cast_options: &'a CastOptions,
+ capacity: usize,
+ top_level: bool,
+ ) -> Result<Self> {
let typed_value_builders = fields.iter().map(|field| {
let builder = make_variant_to_shredded_variant_arrow_row_builder(
field.data_type(),
cast_options,
capacity,
+ false,
)?;
Ok((field.name().as_str(), builder))
});
@@ -218,11 +239,14 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
typed_value_builders: typed_value_builders.collect::<Result<_>>()?,
typed_value_nulls: NullBufferBuilder::new(capacity),
nulls: NullBufferBuilder::new(capacity),
+ top_level,
})
}
fn append_null(&mut self) -> Result<()> {
- self.nulls.append_null();
+ // Only the top-level struct that represents the variant can be
nullable; object fields and
+ // array elements are non-nullable.
+ self.nulls.append(!self.top_level);
self.value_builder.append_null();
self.typed_value_nulls.append_null();
for (_, typed_value_builder) in &mut self.typed_value_builders {