alamb commented on code in PR #8990:
URL: https://github.com/apache/arrow-rs/pull/8990#discussion_r2620563121


##########
arrow-array/src/builder/generic_bytes_view_builder.rs:
##########
@@ -334,35 +350,41 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
 
         // Deduplication if:
         // (1) deduplication is enabled.
-        // (2) len > 12
-        if let Some((mut ht, hasher)) = self.string_tracker.take() {
-            let hash_val = hasher.hash_one(v);
-            let hasher_fn = |v: &_| hasher.hash_one(v);
-
-            let entry = ht.entry(
-                hash_val,
-                |idx| {
-                    let stored_value = self.get_value(*idx);
-                    v == stored_value
-                },
-                hasher_fn,
-            );
-            match entry {
-                Entry::Occupied(occupied) => {
-                    // If the string already exists, we will directly use the 
view
-                    let idx = occupied.get();
-                    self.views_buffer.push(self.views_buffer[*idx]);
-                    self.null_buffer_builder.append_non_null();
-                    self.string_tracker = Some((ht, hasher));
-                    return Ok(());
-                }
-                Entry::Vacant(vacant) => {
-                    // o.w. we insert the (string hash -> view index)
-                    // the idx is current length of views_builder, as we are 
inserting a new view
-                    vacant.insert(self.views_buffer.len());
+        // (2) len > `MAX_INLINE_VIEW_LEN` and len < `max_deduplication_len`
+        let can_deduplicate = match self.max_deduplication_len {
+            Some(max_deduplication_len) => length <= max_deduplication_len,
+            None => true,
+        };
+        if can_deduplicate {

Review Comment:
   It is minor, but as written this code will check the deduplicate length on 
each row, even if we are not deduplicating
   
   How about only checking after the call to `string_tracker.take()`?
   
   something like
   
   ```rust
           // Deduplication if:
           // (1) deduplication is enabled.
           // (2) len > `MAX_INLINE_VIEW_LEN` and len < `max_deduplication_len`
           if let Some((mut ht, hasher)) = self.string_tracker.take() {
               if self
                   .max_deduplication_len
                   .map(|max_len| length > max_len)
                   .unwrap_or(false)
               {
                   let hash_val = hasher.hash_one(v);
                   let hasher_fn = |v: &_| hasher.hash_one(v);
   
                   let entry = ht.entry(
                       hash_val,
                       |idx| {
                           let stored_value = self.get_value(*idx);
                           v == stored_value
                       },
                       hasher_fn,
                   );
                   match entry {
                       Entry::Occupied(occupied) => {
                           // If the string already exists, we will directly 
use the view
                           let idx = occupied.get();
                           self.views_buffer.push(self.views_buffer[*idx]);
                           self.null_buffer_builder.append_non_null();
                           self.string_tracker = Some((ht, hasher));
                           return Ok(());
                       }
                       Entry::Vacant(vacant) => {
                           // o.w. we insert the (string hash -> view index)
                           // the idx is current length of views_builder, as we 
are inserting a new view
                           vacant.insert(self.views_buffer.len());
                       }
                   }
               }
               self.string_tracker = Some((ht, hasher));
           }
   ```



##########
arrow-array/src/builder/generic_bytes_view_builder.rs:
##########
@@ -107,10 +108,25 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
                 current_size: STARTING_BLOCK_SIZE,
             },
             string_tracker: None,
+            max_deduplication_len: None,
             phantom: Default::default(),
         }
     }
 
+    /// Configure max deduplication length when deduplicating strings while 
building the array.
+    /// Default is [`MAX_INLINE_VIEW_LEN`] bytes.
+    /// See <https://github.com/apache/arrow-rs/issues/7187> for more details 
on the implications.

Review Comment:
   I think it would be nice to inline the implications here inline. Here is a 
suggestion
   
   ```suggestion
       ///
       /// When [`Self::with_deduplicate_strings`] is enabled, the builder 
attempts to deduplicate 
       /// any strings longer than 12 bytes. However, since it takes time 
proportional to the length of the string
       /// to deduplicate, setting this option limits the CPU overhead for this 
option. 
   ```



##########
arrow-array/src/builder/generic_bytes_view_builder.rs:
##########
@@ -636,8 +658,58 @@ pub fn make_view(data: &[u8], block_id: u32, offset: u32) 
-> u128 {
 mod tests {
     use core::str;
 
+    use arrow_buffer::ArrowNativeType;
+
     use super::*;
 
+    #[test]
+    fn test_string_max_deduplication_len() {
+        let value_1 = "short";
+        let value_2 = "not so similar string but long";
+        let value_3 = "1234567890123";
+
+        let mut builder = StringViewBuilder::new()
+            .with_deduplicate_strings()
+            .with_max_deduplication_len(MAX_INLINE_VIEW_LEN * 2);
+        // safe to unwrap
+        let max_deduplication_len = builder.max_deduplication_len.unwrap();
+        assert!(builder.string_tracker.is_some());
+        assert!(max_deduplication_len > MAX_INLINE_VIEW_LEN);
+        assert!(value_1.len() < MAX_INLINE_VIEW_LEN.as_usize());
+        assert!(value_2.len() > max_deduplication_len.as_usize());
+        assert!(
+            value_3.len() > MAX_INLINE_VIEW_LEN.as_usize()
+                && value_3.len() < max_deduplication_len.as_usize()
+        );
+
+        let value_checker = |v: &[u8], builder: &StringViewBuilder| {

Review Comment:
   I guess I was thinking something like this which verifies the output (that 
the strings are not deduplicated) rather than the internal state of the builder:
   
   ```rust
       #[test]
       fn test_string_max_deduplication_len() {
           let value_1 = "short";
           let value_2 = "not so similar string but long";
           let value_3 = "1234567890123";
   
           let max_deduplication_len = MAX_INLINE_VIEW_LEN * 2;
   
           let mut builder = StringViewBuilder::new()
               .with_deduplicate_strings()
               .with_max_deduplication_len(max_deduplication_len);
   
           assert!(value_1.len() < MAX_INLINE_VIEW_LEN.as_usize());
           assert!(value_2.len() > max_deduplication_len.as_usize());
           assert!(
               value_3.len() > MAX_INLINE_VIEW_LEN.as_usize()
                   && value_3.len() < max_deduplication_len.as_usize()
           );
   
           // append value1 (short), expect it is inlined and deduplicated
           builder.append_value(value_1); // view 0
           builder.append_value(value_1); // view 1
           // append value2, expect second copy is not deduplicated as it 
exceeds max_deduplication_len
           builder.append_value(value_2); // view 2
           builder.append_value(value_2); // view 3
           // append value3, expect second copy is deduplicated
           builder.append_value(value_3); // view 4
           builder.append_value(value_3); // view 5
   
           let array = builder.finish();
   
           // verify
           let v2 = ByteView::from(array.views()[2]);
           let v3 = ByteView::from(array.views()[3]);
           assert_eq!(v2.buffer_index, v3.buffer_index); // stored in same 
buffer
           assert_ne!(v2.offset, v3.offset); // different offsets --> not 
deduplicated
   
           let v4 = ByteView::from(array.views()[4]);
           let v5 = ByteView::from(array.views()[5]);
           assert_eq!(v4.buffer_index, v5.buffer_index); // stored in same 
buffer
           assert_eq!(v4.offset, v5.offset); // same offsets --> deduplicated
       }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to