This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new e9cbabdd4c feat(parquet): batch consecutive null/empty rows in
`write_list` (#9752)
e9cbabdd4c is described below
commit e9cbabdd4ce191c4057f6f3fe2366275d931e02c
Author: Hippolyte Barraud <[email protected]>
AuthorDate: Wed Apr 22 10:34:07 2026 -0400
feat(parquet): batch consecutive null/empty rows in `write_list` (#9752)
# Which issue does this PR close?
- Spawn off from #9653
- Contributes to #9731
# Rationale for this change
See #9731
# What changes are included in this PR?
Restructure `write_list()` to accumulate consecutive null and empty rows
and flush them in a single `visit_leaves()` call using
`extend(repeat_n(...))`, instead of calling `visit_leaves()` per row.
With sparse data (99% nulls), a 4096-row batch previously triggered
~4000 individual tree traversals, each pushing a single value per leaf.
Now consecutive null/empty runs are collapsed into one traversal that
extends all leaf level buffers in bulk.
This follows the same pattern already used by `write_struct()`. The
`write_non_null_slice` path is unchanged since each non-null row has
different offsets and cannot be batched.
# Are these changes tested?
All tests passing; existing tests give 100% coverage.
# Are there any user-facing changes?
N/A
Signed-off-by: Hippolyte Barraud <[email protected]>
---
parquet/src/arrow/arrow_writer/levels.rs | 68 +++++++++++++++++++++++---------
1 file changed, 49 insertions(+), 19 deletions(-)
diff --git a/parquet/src/arrow/arrow_writer/levels.rs
b/parquet/src/arrow/arrow_writer/levels.rs
index 8374e905e1..d23873278e 100644
--- a/parquet/src/arrow/arrow_writer/levels.rs
+++ b/parquet/src/arrow/arrow_writer/levels.rs
@@ -336,51 +336,81 @@ impl LevelInfoBuilder {
})
};
- let write_empty_slice = |child: &mut LevelInfoBuilder| {
- child.visit_leaves(|leaf| {
- let rep_levels = leaf.rep_levels.as_mut().unwrap();
- rep_levels.push(ctx.rep_level - 1);
- let def_levels = leaf.def_levels.as_mut().unwrap();
- def_levels.push(ctx.def_level - 1);
- })
+ let write_null_run = |child: &mut LevelInfoBuilder, count: usize| {
+ if count > 0 {
+ child.visit_leaves(|leaf| {
+ leaf.rep_levels
+ .as_mut()
+ .unwrap()
+ .extend(std::iter::repeat_n(ctx.rep_level - 1, count));
+ leaf.def_levels
+ .as_mut()
+ .unwrap()
+ .extend(std::iter::repeat_n(ctx.def_level - 2, count));
+ });
+ }
};
- let write_null_slice = |child: &mut LevelInfoBuilder| {
- child.visit_leaves(|leaf| {
- let rep_levels = leaf.rep_levels.as_mut().unwrap();
- rep_levels.push(ctx.rep_level - 1);
- let def_levels = leaf.def_levels.as_mut().unwrap();
- def_levels.push(ctx.def_level - 2);
- })
+ let write_empty_run = |child: &mut LevelInfoBuilder, count: usize| {
+ if count > 0 {
+ child.visit_leaves(|leaf| {
+ leaf.rep_levels
+ .as_mut()
+ .unwrap()
+ .extend(std::iter::repeat_n(ctx.rep_level - 1, count));
+ leaf.def_levels
+ .as_mut()
+ .unwrap()
+ .extend(std::iter::repeat_n(ctx.def_level - 1, count));
+ });
+ }
};
match nulls {
Some(nulls) => {
let null_offset = range.start;
+ let mut pending_nulls: usize = 0;
+ let mut pending_empties: usize = 0;
+
// TODO: Faster bitmask iteration (#1757)
for (idx, w) in offsets.windows(2).enumerate() {
let is_valid = nulls.is_valid(idx + null_offset);
let start_idx = w[0].as_usize();
let end_idx = w[1].as_usize();
+
if !is_valid {
- write_null_slice(child)
+ write_empty_run(child, pending_empties);
+ pending_empties = 0;
+ pending_nulls += 1;
} else if start_idx == end_idx {
- write_empty_slice(child)
+ write_null_run(child, pending_nulls);
+ pending_nulls = 0;
+ pending_empties += 1;
} else {
- write_non_null_slice(child, start_idx, end_idx)
+ write_null_run(child, pending_nulls);
+ pending_nulls = 0;
+ write_empty_run(child, pending_empties);
+ pending_empties = 0;
+ write_non_null_slice(child, start_idx, end_idx);
}
}
+ write_null_run(child, pending_nulls);
+ write_empty_run(child, pending_empties);
}
None => {
+ let mut pending_empties: usize = 0;
for w in offsets.windows(2) {
let start_idx = w[0].as_usize();
let end_idx = w[1].as_usize();
if start_idx == end_idx {
- write_empty_slice(child)
+ pending_empties += 1;
} else {
- write_non_null_slice(child, start_idx, end_idx)
+ write_empty_run(child, pending_empties);
+ pending_empties = 0;
+ write_non_null_slice(child, start_idx, end_idx);
}
}
+ write_empty_run(child, pending_empties);
}
}
}