mapleFU commented on code in PR #10037:
URL: https://github.com/apache/arrow-rs/pull/10037#discussion_r3326039783
##########
parquet/src/arrow/arrow_writer/levels.rs:
##########
@@ -427,6 +449,115 @@ impl LevelInfoBuilder {
}
}
+ /// Optimized write path for lists whose child has no nested repetition
levels.
+ ///
+ /// When the child is a leaf (or a struct of leaves), each child element
maps to
+ /// exactly one rep_level entry. This lets us batch contiguous non-empty
list
+ /// slots into a single `child.write()` call, then stamp the list-start
markers
+ /// at positions computed directly from offsets — avoiding per-slot
`write` +
+ /// reverse-scan overhead.
+ fn write_list_last_level<O: OffsetSizeTrait>(
+ child: &mut LevelInfoBuilder,
+ ctx: &LevelContext,
+ offsets: &[O],
+ nulls: Option<&NullBuffer>,
+ range: Range<usize>,
+ ) {
+ let null_offset = range.start;
+ let offsets = &offsets[range.start..range.end + 1];
+ let list_start_rep = ctx.rep_level - 1;
+
+ // Emit `count` null list slots (list itself is absent)
+ let emit_nulls = |child: &mut LevelInfoBuilder, count: usize| {
+ child.visit_leaves(|leaf| {
+ leaf.append_rep_level_run(list_start_rep, count);
+ leaf.append_def_level_run(ctx.def_level - 2, count);
+ });
+ };
+
+ // Emit `count` empty list slots (list present but has zero elements)
+ let emit_empties = |child: &mut LevelInfoBuilder, count: usize| {
+ child.visit_leaves(|leaf| {
+ leaf.append_rep_level_run(list_start_rep, count);
+ leaf.append_def_level_run(ctx.def_level - 1, count);
+ });
+ };
+
+ // Write a batched run of contiguous non-empty list slots.
+ // `run_offsets` = &offsets[run_first_slot..=run_last_slot+1], i.e. one
+ // offset per slot boundary: [o0, o1, ..., oN] for N slots.
+ let emit_non_empty_run = |child: &mut LevelInfoBuilder, run_offsets:
&[O]| {
+ debug_assert!(run_offsets.len() >= 2);
+ let values_start = run_offsets[0].as_usize();
+ let values_end = run_offsets[run_offsets.len() - 1].as_usize();
+ debug_assert!(values_end > values_start);
+
+ // Write all leaf values in one batch. Since the child has no
nested
+ // rep, this emits (values_end - values_start) rep_levels all equal
+ // to ctx.rep_level (= "continuation within list").
+ child.write(values_start..values_end);
+
+ // Fix up: the first element of each list slot needs rep_level =
+ // list_start_rep to mark a new list boundary. Because there's a
1:1
+ // mapping between child elements and rep_level entries, the
position
+ // of each slot's first element is directly computable from
offsets.
+ child.visit_leaves(|leaf| {
+ let rep_levels = leaf.rep_levels.materialize_mut().unwrap();
+ let batch_len = values_end - values_start;
+ let batch_base = rep_levels.len() - batch_len;
+
+ for slot_offset in run_offsets.iter().take(run_offsets.len() -
1) {
+ let list_start_pos = batch_base + (slot_offset.as_usize()
- values_start);
+ rep_levels[list_start_pos] = list_start_rep;
+ }
+ });
+ };
+
+ // Classify each slot then detect run boundaries. On each transition
+ // (or end of iteration), flush the completed run.
+ #[derive(Clone, Copy, PartialEq)]
Review Comment:
oops, so this path can be optimized
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]