adamreeve commented on code in PR #8671:
URL: https://github.com/apache/arrow-rs/pull/8671#discussion_r2547710902
##########
parquet/src/file/metadata/memory.rs:
##########
@@ -50,9 +51,60 @@ impl<T: HeapSize> HeapSize for Vec<T> {
}
}
+impl<K: HeapSize, V: HeapSize> HeapSize for HashMap<K, V> {
+ fn heap_size(&self) -> usize {
+ let capacity = self.capacity();
+ if capacity == 0 {
+ return 0;
+ }
+
+ // HashMap doesn't provide a way to get its heap size, so this is an approximation based on
+ // the behavior of hashbrown::HashMap as at version 0.16.0, and may become inaccurate
+ // if the implementation changes.
+ let key_val_size = std::mem::size_of::<(K, V)>();
+ // Overhead for the control tags group, which may be smaller depending on architecture
+ let group_size = 16;
+ // 1 byte of metadata stored per bucket.
+ let metadata_size = 1;
+
+ // Compute the number of buckets for the capacity. Based on hashbrown's capacity_to_buckets
+ let buckets = if capacity < 15 {
+ let min_cap = match key_val_size {
+ 0..=1 => 14,
+ 2..=3 => 7,
+ _ => 3,
+ };
+ let cap = min_cap.max(capacity);
+ if cap < 4 {
+ 4
+ } else if cap < 8 {
+ 8
+ } else {
+ 16
+ }
+ } else {
+ (capacity.saturating_mul(8) / 7).next_power_of_two()
+ };
+
+ group_size
+ + (buckets * (key_val_size + metadata_size))
+ + self.keys().map(|k| k.heap_size()).sum::<usize>()
+ + self.values().map(|v| v.heap_size()).sum::<usize>()
+ }
+}
+
impl<T: HeapSize> HeapSize for Arc<T> {
fn heap_size(&self) -> usize {
- self.as_ref().heap_size()
+ // Arc stores weak and strong counts on the heap alongside an instance of T
+ 2 * std::mem::size_of::<usize>() + std::mem::size_of::<T>() +
self.as_ref().heap_size()
Review Comment:
I used `heaptrack` to measure allocations and the results I got from that
exactly matched what was being reported by the `ParquetMetaData.memory_size`
method after my changes. So I think the 33% increase you see is accurate.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]