This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 0ded0ce1be Account for child `Bucket` size in OrderPreservingInterner 
(#4646)
0ded0ce1be is described below

commit 0ded0ce1be928ac8a74ce8e791febda95e01c05e
Author: Andrew Lamb <and...@nerdnetworks.org>
AuthorDate: Tue Aug 8 15:35:19 2023 -0500

    Account for child `Bucket` size in OrderPreservingInterner (#4646)
    
    * Account for child buckets in OrderPreservingInterner
    
    * Add a test
    
    * Tweak
    
    * clipy
---
 arrow-row/src/interner.rs | 93 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)

diff --git a/arrow-row/src/interner.rs b/arrow-row/src/interner.rs
index 1c71b6a552..fde9251952 100644
--- a/arrow-row/src/interner.rs
+++ b/arrow-row/src/interner.rs
@@ -343,8 +343,19 @@ impl Bucket {
     fn size(&self) -> usize {
         std::mem::size_of::<Self>()
             + self.slots.capacity() * std::mem::size_of::<Slot>()
+        // and account for the size of any embedded buckets in the slots
+            + self.slot_child_bucket_size()
             + self.next.as_ref().map(|x| x.size()).unwrap_or_default()
     }
+
+    /// returns the total size of any recursively allocated `Bucket`s
+    /// in self.slots. This does not include the size of the child Slot itself
+    fn slot_child_bucket_size(&self) -> usize {
+        self.slots
+            .iter()
+            .map(|slot| slot.child.as_ref().map(|x| 
x.size()).unwrap_or_default())
+            .sum()
+    }
 }
 
 #[cfg(test)]
@@ -427,4 +438,86 @@ mod tests {
             interner.normalized_key(interned[3]) < 
interner.normalized_key(interned[2])
         );
     }
+
+    #[test]
+    fn test_intern_sizes() {
+        let mut interner = OrderPreservingInterner::default();
+
+        // Intern a 1K values each 8 bytes large
+        let num_items = 1000;
+        let mut values: Vec<usize> = (0..num_items).collect();
+        values.reverse();
+
+        // intern these values 1 at a time (otherwise the interner
+        // will sort them first);
+        for v in values {
+            interner.intern([Some(v.to_be_bytes())]);
+        }
+
+        let reported = interner.size();
+
+        // Figure out the expected size (this is a second
+        // implementation of size()) as a double check
+        let min_expected = BucketWalker::new()
+            .visit_bucket(interner.bucket.as_ref())
+            .memory_estimate()
+            // hash table  size
+            + interner.lookup.capacity() *  std::mem::size_of::<Interned>()
+            // key/value storage
+            + interner.keys.buffer_size()
+            + interner.values.buffer_size();
+
+        assert!(
+            reported > min_expected,
+            "reported size {reported} not larger than min expected size: 
{min_expected}"
+        )
+    }
+
+    // Walks over the buckets / slots counting counting them all
+    struct BucketWalker {
+        num_buckets: usize,
+        num_slots: usize,
+    }
+
+    impl BucketWalker {
+        fn new() -> Self {
+            Self {
+                num_buckets: 0,
+                num_slots: 0,
+            }
+        }
+
+        // recursively visit the bucket and any slots/buckets contained
+        fn visit_bucket(mut self, bucket: &Bucket) -> Self {
+            self.num_buckets += 1;
+            let acc = bucket
+                .slots
+                .iter()
+                .fold(self, |acc, slot| acc.visit_slot(slot));
+
+            if let Some(next) = bucket.next.as_ref() {
+                acc.visit_bucket(next.as_ref())
+            } else {
+                acc
+            }
+        }
+
+        // recursively visit slot and any slots/buckets
+        fn visit_slot(mut self, slot: &Slot) -> Self {
+            self.num_slots += 1;
+            if let Some(child) = slot.child.as_ref() {
+                self.visit_bucket(child.as_ref())
+            } else {
+                self
+            }
+        }
+
+        // estimate how much memory is used just for Buckets / Slots
+        // (an underestimate of the total memory used for the
+        // interner as it doesn't contain any actual values)
+        fn memory_estimate(self) -> usize {
+            self.num_buckets * std::mem::size_of::<Bucket>()
+                + self.num_slots * std::mem::size_of::<Slot>()
+        }
+    }
 }

Reply via email to