This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git

The following commit(s) were added to refs/heads/master by this push:
     new 9acc9fa0b8 Minor API adjustments for StringViewBuilder (#6047)
9acc9fa0b8 is described below

commit 9acc9fa0b84755c5caffb8acf806fa4fd21928bb
Author: Xiangpeng Hao <haoxiangpeng...@gmail.com>
AuthorDate: Mon Jul 15 05:52:37 2024 -0400

    Minor API adjustments for StringViewBuilder (#6047)

    * minor update

    * add memory accounting

    * Update arrow-buffer/src/builder/null.rs

    Co-authored-by: Andrew Lamb <and...@nerdnetworks.org>

    * Update arrow-array/src/builder/generic_bytes_view_builder.rs

    Co-authored-by: Andrew Lamb <and...@nerdnetworks.org>

    * update comments

    ---------

    Co-authored-by: Andrew Lamb <and...@nerdnetworks.org>
---
 arrow-array/src/array/byte_view_array.rs              |  6 ++++--
 arrow-array/src/builder/generic_bytes_view_builder.rs | 16 +++++++++++++++-
 arrow-buffer/src/builder/null.rs                      |  8 ++++++++
 3 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs
index a00bf7271b..5ce150a7a3 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -324,9 +324,11 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
     /// Note that it will copy the array regardless of whether the original array is compact.
     /// Use with caution as this can be an expensive operation, only use it when you are sure that the view
     /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing.
+    ///
+    /// Note: this function does not attempt to canonicalize / deduplicate values. For this
+    /// feature see [`GenericByteViewBuilder::with_deduplicate_strings`].
     pub fn gc(&self) -> Self {
-        let mut builder =
-            GenericByteViewBuilder::<T>::with_capacity(self.len()).with_deduplicate_strings();
+        let mut builder = GenericByteViewBuilder::<T>::with_capacity(self.len());
 
         for v in self.iter() {
             builder.append_option(v);
diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs
index dda5535456..587255cc6b 100644
--- a/arrow-array/src/builder/generic_bytes_view_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_view_builder.rs
@@ -201,7 +201,8 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
 
     /// Returns the value at the given index
     /// Useful if we want to know what value has been inserted to the builder
-    fn get_value(&self, index: usize) -> &[u8] {
+    /// The index has to be smaller than `self.len()`, otherwise it will panic
+    pub fn get_value(&self, index: usize) -> &[u8] {
         let view = self.views_builder.as_slice().get(index).unwrap();
         let len = *view as u32;
         if len <= 12 {
@@ -337,6 +338,19 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
     pub fn validity_slice(&self) -> Option<&[u8]> {
         self.null_buffer_builder.as_slice()
     }
+
+    /// Return the allocated size of this builder in bytes, useful for memory accounting.
+    pub fn allocated_size(&self) -> usize {
+        let views = self.views_builder.capacity() * std::mem::size_of::<u128>();
+        let null = self.null_buffer_builder.allocated_size();
+        let buffer_size = self.completed.iter().map(|b| b.capacity()).sum::<usize>();
+        let in_progress = self.in_progress.capacity();
+        let tracker = match &self.string_tracker {
+            Some((ht, _)) => ht.capacity() * std::mem::size_of::<usize>(),
+            None => 0,
+        };
+        buffer_size + in_progress + tracker + views + null
+    }
 }
 
 impl<T: ByteViewType + ?Sized> Default for GenericByteViewBuilder<T> {
diff --git a/arrow-buffer/src/builder/null.rs b/arrow-buffer/src/builder/null.rs
index 55b3303c9e..a1cea6ef2c 100644
--- a/arrow-buffer/src/builder/null.rs
+++ b/arrow-buffer/src/builder/null.rs
@@ -161,6 +161,14 @@ impl NullBufferBuilder {
     pub fn as_slice_mut(&mut self) -> Option<&mut [u8]> {
         self.bitmap_builder.as_mut().map(|b| b.as_slice_mut())
     }
+
+    /// Return the allocated size of this builder, in bytes, useful for memory accounting.
+    pub fn allocated_size(&self) -> usize {
+        self.bitmap_builder
+            .as_ref()
+            .map(|b| b.capacity())
+            .unwrap_or(0)
+    }
 }
 
 impl NullBufferBuilder {