This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new e9cbabdd4c feat(parquet): batch consecutive null/empty rows in 
`write_list` (#9752)
e9cbabdd4c is described below

commit e9cbabdd4ce191c4057f6f3fe2366275d931e02c
Author: Hippolyte Barraud <[email protected]>
AuthorDate: Wed Apr 22 10:34:07 2026 -0400

    feat(parquet): batch consecutive null/empty rows in `write_list` (#9752)
    
    # Which issue does this PR close?
    
    - Spawn off from #9653
    - Contributes to #9731
    
    # Rationale for this change
    
    See #9731
    
    # What changes are included in this PR?
    
    Restructure `write_list()` to accumulate consecutive null and empty rows
    and flush them in a single `visit_leaves()` call using
    `extend(repeat_n(...))`, instead of calling `visit_leaves()` per row.
    
    With sparse data (99% nulls), a 4096-row batch previously triggered
    ~4000 individual tree traversals, each pushing a single value per leaf.
    Now consecutive null/empty runs are collapsed into one traversal that
    extends all leaf level buffers in bulk.
    
    This follows the same pattern already used by `write_struct()`. The
    `write_non_null_slice` path is unchanged since each non-null row has
    different offsets and cannot be batched.
    
    # Are these changes tested?
    
    All tests passing; existing tests give 100% coverage.
    
    # Are there any user-facing changes?
    
    N/A
    
    Signed-off-by: Hippolyte Barraud <[email protected]>
---
 parquet/src/arrow/arrow_writer/levels.rs | 68 +++++++++++++++++++++++---------
 1 file changed, 49 insertions(+), 19 deletions(-)

diff --git a/parquet/src/arrow/arrow_writer/levels.rs 
b/parquet/src/arrow/arrow_writer/levels.rs
index 8374e905e1..d23873278e 100644
--- a/parquet/src/arrow/arrow_writer/levels.rs
+++ b/parquet/src/arrow/arrow_writer/levels.rs
@@ -336,51 +336,81 @@ impl LevelInfoBuilder {
                 })
             };
 
-        let write_empty_slice = |child: &mut LevelInfoBuilder| {
-            child.visit_leaves(|leaf| {
-                let rep_levels = leaf.rep_levels.as_mut().unwrap();
-                rep_levels.push(ctx.rep_level - 1);
-                let def_levels = leaf.def_levels.as_mut().unwrap();
-                def_levels.push(ctx.def_level - 1);
-            })
+        let write_null_run = |child: &mut LevelInfoBuilder, count: usize| {
+            if count > 0 {
+                child.visit_leaves(|leaf| {
+                    leaf.rep_levels
+                        .as_mut()
+                        .unwrap()
+                        .extend(std::iter::repeat_n(ctx.rep_level - 1, count));
+                    leaf.def_levels
+                        .as_mut()
+                        .unwrap()
+                        .extend(std::iter::repeat_n(ctx.def_level - 2, count));
+                });
+            }
         };
 
-        let write_null_slice = |child: &mut LevelInfoBuilder| {
-            child.visit_leaves(|leaf| {
-                let rep_levels = leaf.rep_levels.as_mut().unwrap();
-                rep_levels.push(ctx.rep_level - 1);
-                let def_levels = leaf.def_levels.as_mut().unwrap();
-                def_levels.push(ctx.def_level - 2);
-            })
+        let write_empty_run = |child: &mut LevelInfoBuilder, count: usize| {
+            if count > 0 {
+                child.visit_leaves(|leaf| {
+                    leaf.rep_levels
+                        .as_mut()
+                        .unwrap()
+                        .extend(std::iter::repeat_n(ctx.rep_level - 1, count));
+                    leaf.def_levels
+                        .as_mut()
+                        .unwrap()
+                        .extend(std::iter::repeat_n(ctx.def_level - 1, count));
+                });
+            }
         };
 
         match nulls {
             Some(nulls) => {
                 let null_offset = range.start;
+                let mut pending_nulls: usize = 0;
+                let mut pending_empties: usize = 0;
+
                 // TODO: Faster bitmask iteration (#1757)
                 for (idx, w) in offsets.windows(2).enumerate() {
                     let is_valid = nulls.is_valid(idx + null_offset);
                     let start_idx = w[0].as_usize();
                     let end_idx = w[1].as_usize();
+
                     if !is_valid {
-                        write_null_slice(child)
+                        write_empty_run(child, pending_empties);
+                        pending_empties = 0;
+                        pending_nulls += 1;
                     } else if start_idx == end_idx {
-                        write_empty_slice(child)
+                        write_null_run(child, pending_nulls);
+                        pending_nulls = 0;
+                        pending_empties += 1;
                     } else {
-                        write_non_null_slice(child, start_idx, end_idx)
+                        write_null_run(child, pending_nulls);
+                        pending_nulls = 0;
+                        write_empty_run(child, pending_empties);
+                        pending_empties = 0;
+                        write_non_null_slice(child, start_idx, end_idx);
                     }
                 }
+                write_null_run(child, pending_nulls);
+                write_empty_run(child, pending_empties);
             }
             None => {
+                let mut pending_empties: usize = 0;
                 for w in offsets.windows(2) {
                     let start_idx = w[0].as_usize();
                     let end_idx = w[1].as_usize();
                     if start_idx == end_idx {
-                        write_empty_slice(child)
+                        pending_empties += 1;
                     } else {
-                        write_non_null_slice(child, start_idx, end_idx)
+                        write_empty_run(child, pending_empties);
+                        pending_empties = 0;
+                        write_non_null_slice(child, start_idx, end_idx);
                     }
                 }
+                write_empty_run(child, pending_empties);
             }
         }
     }

Reply via email to