This is an automated email from the ASF dual-hosted git repository. alamb pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push: new 0ded0ce1be Account for child `Bucket` size in OrderPreservingInterner (#4646) 0ded0ce1be is described below commit 0ded0ce1be928ac8a74ce8e791febda95e01c05e Author: Andrew Lamb <and...@nerdnetworks.org> AuthorDate: Tue Aug 8 15:35:19 2023 -0500 Account for child `Bucket` size in OrderPreservingInterner (#4646) * Account for child buckets in OrderPreservingInterner * Add a test * Tweak * clipy --- arrow-row/src/interner.rs | 93 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/arrow-row/src/interner.rs b/arrow-row/src/interner.rs index 1c71b6a552..fde9251952 100644 --- a/arrow-row/src/interner.rs +++ b/arrow-row/src/interner.rs @@ -343,8 +343,19 @@ impl Bucket { fn size(&self) -> usize { std::mem::size_of::<Self>() + self.slots.capacity() * std::mem::size_of::<Slot>() + // and account for the size of any embedded buckets in the slots + + self.slot_child_bucket_size() + self.next.as_ref().map(|x| x.size()).unwrap_or_default() } + + /// returns the total size of any recursively allocated `Bucket`s + /// in self.slots. This does not include the size of the child Slot itself + fn slot_child_bucket_size(&self) -> usize { + self.slots + .iter() + .map(|slot| slot.child.as_ref().map(|x| x.size()).unwrap_or_default()) + .sum() + } } #[cfg(test)] @@ -427,4 +438,86 @@ mod tests { interner.normalized_key(interned[3]) < interner.normalized_key(interned[2]) ); } + + #[test] + fn test_intern_sizes() { + let mut interner = OrderPreservingInterner::default(); + + // Intern a 1K values each 8 bytes large + let num_items = 1000; + let mut values: Vec<usize> = (0..num_items).collect(); + values.reverse(); + + // intern these values 1 at a time (otherwise the interner + // will sort them first); + for v in values { + interner.intern([Some(v.to_be_bytes())]); + } + + let reported = interner.size(); + + // Figure out the expected size (this is a second + // implementation of size()) as a double check + let min_expected = BucketWalker::new() + .visit_bucket(interner.bucket.as_ref()) + .memory_estimate() + // hash table size + + interner.lookup.capacity() * std::mem::size_of::<Interned>() + // key/value storage + + interner.keys.buffer_size() + + interner.values.buffer_size(); + + assert!( + reported > min_expected, + "reported size {reported} not larger than min expected size: {min_expected}" + ) + } + + // Walks over the buckets / slots counting counting them all + struct BucketWalker { + num_buckets: usize, + num_slots: usize, + } + + impl BucketWalker { + fn new() -> Self { + Self { + num_buckets: 0, + num_slots: 0, + } + } + + // recursively visit the bucket and any slots/buckets contained + fn visit_bucket(mut self, bucket: &Bucket) -> Self { + self.num_buckets += 1; + let acc = bucket + .slots + .iter() + .fold(self, |acc, slot| acc.visit_slot(slot)); + + if let Some(next) = bucket.next.as_ref() { + acc.visit_bucket(next.as_ref()) + } else { + acc + } + } + + // recursively visit slot and any slots/buckets + fn visit_slot(mut self, slot: &Slot) -> Self { + self.num_slots += 1; + if let Some(child) = slot.child.as_ref() { + self.visit_bucket(child.as_ref()) + } else { + self + } + } + + // estimate how much memory is used just for Buckets / Slots + // (an underestimate of the total memory used for the + // interner as it doesn't contain any actual values) + fn memory_estimate(self) -> usize { + self.num_buckets * std::mem::size_of::<Bucket>() + + self.num_slots * std::mem::size_of::<Slot>() + } + } }