alamb commented on code in PR #7463:
URL: https://github.com/apache/arrow-rs/pull/7463#discussion_r2073584836
##########
arrow-select/src/filter.rs:
##########
@@ -656,29 +653,44 @@ where
(start, end, len)
}
- /// Extends the in-progress array by the indexes in the provided iterator
- fn extend_idx(&mut self, iter: impl Iterator<Item = usize>) {
+ fn extend_offsets_idx(&mut self, iter: impl Iterator<Item = usize>) {
self.dst_offsets.extend(iter.map(|idx| {
let start = self.src_offsets[idx].as_usize();
let end = self.src_offsets[idx + 1].as_usize();
let len = OffsetSize::from_usize(end - start).expect("illegal
offset range");
self.cur_offset += len;
- self.dst_values
- .extend_from_slice(&self.src_values[start..end]);
+
self.cur_offset
}));
+ self.dst_values.reserve_exact(self.cur_offset.as_usize());
}
- /// Extends the in-progress array by the ranges in the provided iterator
- fn extend_slices(&mut self, iter: impl Iterator<Item = (usize, usize)>) {
+ /// Extends the in-progress array by the indexes in the provided iterator
+ fn extend_idx(&mut self, iter: impl Iterator<Item = usize>) {
+ for idx in iter {
+ let start = self.src_offsets[idx].as_usize();
+ let end = self.src_offsets[idx + 1].as_usize();
+ self.dst_values
+ .extend_from_slice(&self.src_values[start..end]);
+ }
+ }
+
+ fn extend_offsets_slices(&mut self, iter: impl Iterator<Item = (usize,
usize)>, count: usize) {
+ self.dst_offsets.reserve_exact(count);
for (start, end) in iter {
// These can only fail if `array` contains invalid data
for idx in start..end {
let (_, _, len) = self.get_value_range(idx);
self.cur_offset += len;
- self.dst_offsets.push(self.cur_offset); // push_unchecked?
+ self.dst_offsets.push(self.cur_offset);
}
+ }
+ self.dst_values.reserve_exact(self.cur_offset.as_usize());
Review Comment:
I think it would make sense for this call to reserve_exact to be a part of
`extend_idx` (which is what actually extends the values). Otherwise I think it
is quite tricky to understand that the performance of `extend_idx` is relying
on calling `extend_offsts_slices` first
Since they are now always called in a pair `extend_offsets_slices` and
`extend_slices` I don't think this would change anything functionally but it
would make the code easier for me to understand (even though you would also
have to pass `count` to the extend_idx was well
Another alternative might be to make a function `fn reserve_exact` that to
make the allocations explicit
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]