ClSlaid commented on code in PR #8990:
URL: https://github.com/apache/arrow-rs/pull/8990#discussion_r2616814939
##########
arrow-array/src/builder/generic_bytes_view_builder.rs:
##########
@@ -334,35 +350,37 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
// Deduplication if:
// (1) deduplication is enabled.
- // (2) len > 12
- if let Some((mut ht, hasher)) = self.string_tracker.take() {
- let hash_val = hasher.hash_one(v);
- let hasher_fn = |v: &_| hasher.hash_one(v);
-
- let entry = ht.entry(
- hash_val,
- |idx| {
- let stored_value = self.get_value(*idx);
- v == stored_value
- },
- hasher_fn,
- );
- match entry {
- Entry::Occupied(occupied) => {
- // If the string already exists, we will directly use the
view
- let idx = occupied.get();
- self.views_buffer.push(self.views_buffer[*idx]);
- self.null_buffer_builder.append_non_null();
- self.string_tracker = Some((ht, hasher));
- return Ok(());
- }
- Entry::Vacant(vacant) => {
- // o.w. we insert the (string hash -> view index)
- // the idx is current length of views_builder, as we are
inserting a new view
- vacant.insert(self.views_buffer.len());
+ // (2) len > `max_deduplication_len`
+ if length > self.max_deduplication_len {
Review Comment:
> In practice I suspect most strings that make sense to deduplicate are
relatively short, e.g. <64 bytes.
Pardon, I guess here comes some confusion? The code behaves like
`min_deduplication_len`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]