alamb commented on code in PR #8990:
URL: https://github.com/apache/arrow-rs/pull/8990#discussion_r2620563121
##########
arrow-array/src/builder/generic_bytes_view_builder.rs:
##########
@@ -334,35 +350,41 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
// Deduplication if:
// (1) deduplication is enabled.
- // (2) len > 12
- if let Some((mut ht, hasher)) = self.string_tracker.take() {
- let hash_val = hasher.hash_one(v);
- let hasher_fn = |v: &_| hasher.hash_one(v);
-
- let entry = ht.entry(
- hash_val,
- |idx| {
- let stored_value = self.get_value(*idx);
- v == stored_value
- },
- hasher_fn,
- );
- match entry {
- Entry::Occupied(occupied) => {
- // If the string already exists, we will directly use the
view
- let idx = occupied.get();
- self.views_buffer.push(self.views_buffer[*idx]);
- self.null_buffer_builder.append_non_null();
- self.string_tracker = Some((ht, hasher));
- return Ok(());
- }
- Entry::Vacant(vacant) => {
- // o.w. we insert the (string hash -> view index)
- // the idx is current length of views_builder, as we are
inserting a new view
- vacant.insert(self.views_buffer.len());
+ // (2) len > `MAX_INLINE_VIEW_LEN` and len < `max_deduplication_len`
+ let can_deduplicate = match self.max_deduplication_len {
+ Some(max_deduplication_len) => length <= max_deduplication_len,
+ None => true,
+ };
+ if can_deduplicate {
Review Comment:
It is minor, but as written this code will check the deduplicate length on
each row, even if we are not deduplicating
How about only checking after the call to `string_tracker.take()`?
something like
```rust
// Deduplication if:
// (1) deduplication is enabled.
// (2) len > `MAX_INLINE_VIEW_LEN` and len < `max_deduplication_len`
if let Some((mut ht, hasher)) = self.string_tracker.take() {
if self
.max_deduplication_len
.map(|max_len| length > max_len)
.unwrap_or(false)
{
let hash_val = hasher.hash_one(v);
let hasher_fn = |v: &_| hasher.hash_one(v);
let entry = ht.entry(
hash_val,
|idx| {
let stored_value = self.get_value(*idx);
v == stored_value
},
hasher_fn,
);
match entry {
Entry::Occupied(occupied) => {
// If the string already exists, we will directly
use the view
let idx = occupied.get();
self.views_buffer.push(self.views_buffer[*idx]);
self.null_buffer_builder.append_non_null();
self.string_tracker = Some((ht, hasher));
return Ok(());
}
Entry::Vacant(vacant) => {
// o.w. we insert the (string hash -> view index)
// the idx is current length of views_builder, as we
are inserting a new view
vacant.insert(self.views_buffer.len());
}
}
}
self.string_tracker = Some((ht, hasher));
}
```
##########
arrow-array/src/builder/generic_bytes_view_builder.rs:
##########
@@ -107,10 +108,25 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
current_size: STARTING_BLOCK_SIZE,
},
string_tracker: None,
+ max_deduplication_len: None,
phantom: Default::default(),
}
}
+ /// Configure max deduplication length when deduplicating strings while
building the array.
+ /// Default is [`MAX_INLINE_VIEW_LEN`] bytes.
+ /// See <https://github.com/apache/arrow-rs/issues/7187> for more details
on the implications.
Review Comment:
I think it would be nice to inline the implications here inline. Here is a
suggestion
```suggestion
///
/// When [`Self::with_deduplicate_strings`] is enabled, the builder
attempts to deduplicate
/// any strings longer than 12 bytes. However, since it takes time
proportional to the length of the string
/// to deduplicate, setting this option limits the CPU overhead for this
option.
```
##########
arrow-array/src/builder/generic_bytes_view_builder.rs:
##########
@@ -636,8 +658,58 @@ pub fn make_view(data: &[u8], block_id: u32, offset: u32)
-> u128 {
mod tests {
use core::str;
+ use arrow_buffer::ArrowNativeType;
+
use super::*;
+ #[test]
+ fn test_string_max_deduplication_len() {
+ let value_1 = "short";
+ let value_2 = "not so similar string but long";
+ let value_3 = "1234567890123";
+
+ let mut builder = StringViewBuilder::new()
+ .with_deduplicate_strings()
+ .with_max_deduplication_len(MAX_INLINE_VIEW_LEN * 2);
+ // safe to unwrap
+ let max_deduplication_len = builder.max_deduplication_len.unwrap();
+ assert!(builder.string_tracker.is_some());
+ assert!(max_deduplication_len > MAX_INLINE_VIEW_LEN);
+ assert!(value_1.len() < MAX_INLINE_VIEW_LEN.as_usize());
+ assert!(value_2.len() > max_deduplication_len.as_usize());
+ assert!(
+ value_3.len() > MAX_INLINE_VIEW_LEN.as_usize()
+ && value_3.len() < max_deduplication_len.as_usize()
+ );
+
+ let value_checker = |v: &[u8], builder: &StringViewBuilder| {
Review Comment:
I guess I was thinking something like this which verifies the output (that
the strings are not deduplicated) rather than the internal state of the builder:
```rust
#[test]
fn test_string_max_deduplication_len() {
let value_1 = "short";
let value_2 = "not so similar string but long";
let value_3 = "1234567890123";
let max_deduplication_len = MAX_INLINE_VIEW_LEN * 2;
let mut builder = StringViewBuilder::new()
.with_deduplicate_strings()
.with_max_deduplication_len(max_deduplication_len);
assert!(value_1.len() < MAX_INLINE_VIEW_LEN.as_usize());
assert!(value_2.len() > max_deduplication_len.as_usize());
assert!(
value_3.len() > MAX_INLINE_VIEW_LEN.as_usize()
&& value_3.len() < max_deduplication_len.as_usize()
);
// append value1 (short), expect it is inlined and deduplicated
builder.append_value(value_1); // view 0
builder.append_value(value_1); // view 1
// append value2, expect second copy is not deduplicated as it
exceeds max_deduplication_len
builder.append_value(value_2); // view 2
builder.append_value(value_2); // view 3
// append value3, expect second copy is deduplicated
builder.append_value(value_3); // view 4
builder.append_value(value_3); // view 5
let array = builder.finish();
// verify
let v2 = ByteView::from(array.views()[2]);
let v3 = ByteView::from(array.views()[3]);
assert_eq!(v2.buffer_index, v3.buffer_index); // stored in same
buffer
assert_ne!(v2.offset, v3.offset); // different offsets --> not
deduplicated
let v4 = ByteView::from(array.views()[4]);
let v5 = ByteView::from(array.views()[5]);
assert_eq!(v4.buffer_index, v5.buffer_index); // stored in same
buffer
assert_eq!(v4.offset, v5.offset); // same offsets --> deduplicated
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]