This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new bd1e76b085 Implement exponential block size growing strategy for
`StringViewBuilder` (#6136)
bd1e76b085 is described below
commit bd1e76b0857fc1c4fcbf8ba51aa55698a0f527ab
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Mon Jul 29 14:22:38 2024 -0400
Implement exponential block size growing strategy for `StringViewBuilder`
(#6136)
* new block size growing strategy
* Update arrow-array/src/builder/generic_bytes_view_builder.rs
Co-authored-by: Andrew Lamb <[email protected]>
* update function name, deprecate old function
* update comments
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow-array/src/array/byte_view_array.rs | 4 +-
.../src/builder/generic_bytes_view_builder.rs | 104 +++++++++++++++++++--
arrow-cast/src/cast/mod.rs | 8 +-
3 files changed, 104 insertions(+), 12 deletions(-)
diff --git a/arrow-array/src/array/byte_view_array.rs
b/arrow-array/src/array/byte_view_array.rs
index 63b9fe30ed..a9aed95318 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -757,7 +757,7 @@ mod tests {
fn test_in_progress_recreation() {
let array = {
// make a builder with small block size.
- let mut builder = StringViewBuilder::new().with_block_size(14);
+ let mut builder =
StringViewBuilder::new().with_fixed_block_size(14);
builder.append_value("large payload over 12 bytes");
builder.append_option(Some("another large payload over 12 bytes
that double than the first one, so that we can trigger the in_progress in
builder re-created"));
builder.finish()
@@ -848,7 +848,7 @@ mod tests {
];
let array = {
- let mut builder = StringViewBuilder::new().with_block_size(8); //
create multiple buffers
+ let mut builder =
StringViewBuilder::new().with_fixed_block_size(8); // create multiple buffers
test_data.into_iter().for_each(|v| builder.append_option(v));
builder.finish()
};
diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs
b/arrow-array/src/builder/generic_bytes_view_builder.rs
index 7726ee3524..4f19204b86 100644
--- a/arrow-array/src/builder/generic_bytes_view_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_view_builder.rs
@@ -30,7 +30,30 @@ use crate::types::bytes::ByteArrayNativeType;
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
use crate::{ArrayRef, GenericByteViewArray};
-const DEFAULT_BLOCK_SIZE: u32 = 8 * 1024;
+const STARTING_BLOCK_SIZE: u32 = 8 * 1024; // 8KiB
+const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024; // 2MiB
+
+enum BlockSizeGrowthStrategy {
+ Fixed { size: u32 },
+ Exponential { current_size: u32 },
+}
+
+impl BlockSizeGrowthStrategy {
+ fn next_size(&mut self) -> u32 {
+ match self {
+ Self::Fixed { size } => *size,
+ Self::Exponential { current_size } => {
+ if *current_size < MAX_BLOCK_SIZE {
+ // we have fixed start/end block sizes, so we can't
overflow
+ *current_size = current_size.saturating_mul(2);
+ *current_size
+ } else {
+ MAX_BLOCK_SIZE
+ }
+ }
+ }
+ }
+}
/// A builder for [`GenericByteViewArray`]
///
@@ -58,7 +81,7 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
null_buffer_builder: NullBufferBuilder,
completed: Vec<Buffer>,
in_progress: Vec<u8>,
- block_size: u32,
+ block_size: BlockSizeGrowthStrategy,
/// Some if deduplicating strings
/// map `<string hash> -> <index to the views>`
string_tracker: Option<(HashTable<usize>, ahash::RandomState)>,
@@ -78,15 +101,42 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
null_buffer_builder: NullBufferBuilder::new(capacity),
completed: vec![],
in_progress: vec![],
- block_size: DEFAULT_BLOCK_SIZE,
+ block_size: BlockSizeGrowthStrategy::Exponential {
+ current_size: STARTING_BLOCK_SIZE,
+ },
string_tracker: None,
phantom: Default::default(),
}
}
+ /// Set a fixed buffer size for variable length strings
+ ///
+ /// The block size is the size of the buffer used to store values greater
+ /// than 12 bytes. The builder allocates new buffers when the current
+ /// buffer is full.
+ ///
+ /// By default the builder balances buffer size and buffer count by
+ /// growing buffer size exponentially from 8KB up to 2MB. The
+ /// first buffer allocated is 8KB, then 16KB, then 32KB, etc up to 2MB.
+ ///
+ /// If this method is used, any new buffers allocated are
+ /// exactly this size. This can be useful for advanced users
+ /// that want to control the memory usage and buffer count.
+ ///
+ /// See <https://github.com/apache/arrow-rs/issues/6094> for more details
on the implications.
+ pub fn with_fixed_block_size(self, block_size: u32) -> Self {
+ debug_assert!(block_size > 0, "Block size must be greater than 0");
+ Self {
+ block_size: BlockSizeGrowthStrategy::Fixed { size: block_size },
+ ..self
+ }
+ }
+
/// Override the size of buffers to allocate for holding string data
+ /// Use `with_fixed_block_size` instead.
+ #[deprecated(note = "Use `with_fixed_block_size` instead")]
pub fn with_block_size(self, block_size: u32) -> Self {
- Self { block_size, ..self }
+ self.with_fixed_block_size(block_size)
}
/// Deduplicate strings while building the array
@@ -277,7 +327,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
let required_cap = self.in_progress.len() + v.len();
if self.in_progress.capacity() < required_cap {
self.flush_in_progress();
- let to_reserve = v.len().max(self.block_size as usize);
+ let to_reserve = v.len().max(self.block_size.next_size() as usize);
self.in_progress.reserve(to_reserve);
};
let offset = self.in_progress.len() as u32;
@@ -478,7 +528,7 @@ mod tests {
let mut builder = StringViewBuilder::new()
.with_deduplicate_strings()
- .with_block_size(value_1.len() as u32 * 2); // so that we will
have multiple buffers
+ .with_fixed_block_size(value_1.len() as u32 * 2); // so that we
will have multiple buffers
let values = vec![
Some(value_1),
@@ -585,4 +635,46 @@ mod tests {
"Invalid argument error: No block found with index 5"
);
}
+
+ #[test]
+ fn test_string_view_with_block_size_growth() {
+ let mut exp_builder = StringViewBuilder::new();
+ let mut fixed_builder =
StringViewBuilder::new().with_fixed_block_size(STARTING_BLOCK_SIZE);
+
+ let long_string = String::from_utf8(vec![b'a'; STARTING_BLOCK_SIZE as
usize]).unwrap();
+
+ for i in 0..9 {
+ // 8k, 16k, 32k, 64k, 128k, 256k, 512k, 1M, 2M
+ for _ in 0..(2_u32.pow(i)) {
+ exp_builder.append_value(&long_string);
+ fixed_builder.append_value(&long_string);
+ }
+ exp_builder.flush_in_progress();
+ fixed_builder.flush_in_progress();
+
+ // Every step only add one buffer, but the buffer size is much
larger
+ assert_eq!(exp_builder.completed.len(), i as usize + 1);
+ assert_eq!(
+ exp_builder.completed[i as usize].len(),
+ STARTING_BLOCK_SIZE as usize * 2_usize.pow(i)
+ );
+
+ // This step we added 2^i blocks, the sum of blocks should be
2^(i+1) - 1
+ assert_eq!(fixed_builder.completed.len(), 2_usize.pow(i + 1) - 1);
+
+ // Every buffer is fixed size
+ assert!(fixed_builder
+ .completed
+ .iter()
+ .all(|b| b.len() == STARTING_BLOCK_SIZE as usize));
+ }
+
+ // Add one more value, and the buffer stop growing.
+ exp_builder.append_value(&long_string);
+ exp_builder.flush_in_progress();
+ assert_eq!(
+ exp_builder.completed.last().unwrap().capacity(),
+ MAX_BLOCK_SIZE as usize
+ );
+ }
}
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index 5f72debcda..f6103cb841 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -5321,7 +5321,7 @@ mod tests {
let typed_dict =
string_dict_array.downcast_dict::<StringArray>().unwrap();
let string_view_array = {
- let mut builder = StringViewBuilder::new().with_block_size(8); //
multiple buffers.
+ let mut builder =
StringViewBuilder::new().with_fixed_block_size(8); // multiple buffers.
for v in typed_dict.into_iter() {
builder.append_option(v);
}
@@ -5338,7 +5338,7 @@ mod tests {
let typed_binary_dict =
binary_dict_array.downcast_dict::<BinaryArray>().unwrap();
let binary_view_array = {
- let mut builder = BinaryViewBuilder::new().with_block_size(8); //
multiple buffers.
+ let mut builder =
BinaryViewBuilder::new().with_fixed_block_size(8); // multiple buffers.
for v in typed_binary_dict.into_iter() {
builder.append_option(v);
}
@@ -5381,7 +5381,7 @@ mod tests {
O: OffsetSizeTrait,
{
let view_array = {
- let mut builder = StringViewBuilder::new().with_block_size(8); //
multiple buffers.
+ let mut builder =
StringViewBuilder::new().with_fixed_block_size(8); // multiple buffers.
for s in VIEW_TEST_DATA.iter() {
builder.append_option(*s);
}
@@ -5410,7 +5410,7 @@ mod tests {
O: OffsetSizeTrait,
{
let view_array = {
- let mut builder = BinaryViewBuilder::new().with_block_size(8); //
multiple buffers.
+ let mut builder =
BinaryViewBuilder::new().with_fixed_block_size(8); // multiple buffers.
for s in VIEW_TEST_DATA.iter() {
builder.append_option(*s);
}