alamb commented on code in PR #8990:
URL: https://github.com/apache/arrow-rs/pull/8990#discussion_r2623426383
##########
arrow-array/src/builder/generic_bytes_view_builder.rs:
##########
@@ -334,35 +350,41 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
// Deduplication if:
// (1) deduplication is enabled.
- // (2) len > 12
- if let Some((mut ht, hasher)) = self.string_tracker.take() {
- let hash_val = hasher.hash_one(v);
- let hasher_fn = |v: &_| hasher.hash_one(v);
-
- let entry = ht.entry(
- hash_val,
- |idx| {
- let stored_value = self.get_value(*idx);
- v == stored_value
- },
- hasher_fn,
- );
- match entry {
- Entry::Occupied(occupied) => {
- // If the string already exists, we will directly use the
view
- let idx = occupied.get();
- self.views_buffer.push(self.views_buffer[*idx]);
- self.null_buffer_builder.append_non_null();
- self.string_tracker = Some((ht, hasher));
- return Ok(());
- }
- Entry::Vacant(vacant) => {
- // o.w. we insert the (string hash -> view index)
- // the idx is current length of views_builder, as we are
inserting a new view
- vacant.insert(self.views_buffer.len());
+ // (2) len > `MAX_INLINE_VIEW_LEN` and len < `max_deduplication_len`
+ let can_deduplicate = match self.max_deduplication_len {
+ Some(max_deduplication_len) => length <= max_deduplication_len,
+ None => true,
+ };
+ if can_deduplicate {
Review Comment:
what I am worried about is slowing down the case when string deduplication
is not enabled. I will run some benchmarks to make sure this change doesn't
affect performance
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]