alamb commented on code in PR #16789:
URL: https://github.com/apache/datafusion/pull/16789#discussion_r2208548427


##########
datafusion/common/src/scalar/mod.rs:
##########
@@ -854,6 +854,140 @@ pub fn get_dict_value<K: ArrowDictionaryKeyType>(
     Ok((dict_array.values(), dict_array.key(index)))
 }
 
+/// Cache for dictionary key arrays to avoid repeated allocations

Review Comment:
   I think it is important to also point out that the cached keys are always 
zero.
   
   So, something like "cache for dictionary keys for single-valued dictionary 
arrays".



##########
datafusion/common/src/scalar/mod.rs:
##########
@@ -854,6 +854,140 @@ pub fn get_dict_value<K: ArrowDictionaryKeyType>(
     Ok((dict_array.values(), dict_array.key(index)))
 }
 
+/// Cache for dictionary key arrays to avoid repeated allocations
+/// when the same size is used frequently.
+///
+/// Similar to PartitionColumnProjector's ZeroBufferGenerators, this cache
+/// stores key arrays for different dictionary key types. The cache is
+/// limited to 1 entry per type (the last size used).
+#[derive(Debug)]
+struct KeyArrayCache<K: ArrowDictionaryKeyType> {
+    cache: Option<(usize, bool, PrimitiveArray<K>)>, // (size, is_null, 
key_array)
+}
+
+impl<K: ArrowDictionaryKeyType> Default for KeyArrayCache<K> {
+    fn default() -> Self {
+        Self { cache: None }
+    }
+}
+
+impl<K: ArrowDictionaryKeyType> KeyArrayCache<K> {
+    fn get_or_create(&mut self, size: usize, is_null: bool) -> 
PrimitiveArray<K> {
+        match &self.cache {
+            Some((cached_size, cached_is_null, cached_array))
+                if *cached_size == size && *cached_is_null == is_null =>
+            {
+                // Cache hit: reuse existing array if same size and null status
+                cached_array.clone()
+            }
+            _ => {
+                // Cache miss: create new array and cache it
+                let key_array: PrimitiveArray<K> = repeat_n(
+                    if is_null {
+                        None
+                    } else {
+                        Some(K::default_value())
+                    },
+                    size,
+                )
+                .collect();
+
+                self.cache = Some((size, is_null, key_array.clone()));
+                key_array
+            }
+        }
+    }
+}
+
+/// Cache for null arrays to avoid repeated allocations

Review Comment:
   ```suggestion
   /// Cache for null arrays to avoid repeated allocations
   /// of all zeros 
   ```



##########
datafusion/common/src/scalar/mod.rs:
##########
@@ -854,6 +854,140 @@ pub fn get_dict_value<K: ArrowDictionaryKeyType>(
     Ok((dict_array.values(), dict_array.key(index)))
 }
 
+/// Cache for dictionary key arrays to avoid repeated allocations
+/// when the same size is used frequently.
+///
+/// Similar to PartitionColumnProjector's ZeroBufferGenerators, this cache
+/// stores key arrays for different dictionary key types. The cache is

Review Comment:
   ```suggestion
   /// stores key arrays with all `0` values for different dictionary key 
types. The cache is
   ```



##########
datafusion/common/src/scalar/mod.rs:
##########
@@ -854,6 +854,140 @@ pub fn get_dict_value<K: ArrowDictionaryKeyType>(
     Ok((dict_array.values(), dict_array.key(index)))
 }
 
+/// Cache for dictionary key arrays to avoid repeated allocations
+/// when the same size is used frequently.
+///
+/// Similar to PartitionColumnProjector's ZeroBufferGenerators, this cache
+/// stores key arrays for different dictionary key types. The cache is
+/// limited to 1 entry per type (the last size used).
+#[derive(Debug)]
+struct KeyArrayCache<K: ArrowDictionaryKeyType> {
+    cache: Option<(usize, bool, PrimitiveArray<K>)>, // (size, is_null, 
key_array)
+}
+
+impl<K: ArrowDictionaryKeyType> Default for KeyArrayCache<K> {
+    fn default() -> Self {
+        Self { cache: None }
+    }
+}
+
+impl<K: ArrowDictionaryKeyType> KeyArrayCache<K> {
+    fn get_or_create(&mut self, size: usize, is_null: bool) -> 
PrimitiveArray<K> {
+        match &self.cache {
+            Some((cached_size, cached_is_null, cached_array))
+                if *cached_size == size && *cached_is_null == is_null =>
+            {
+                // Cache hit: reuse existing array if same size and null status
+                cached_array.clone()
+            }
+            _ => {
+                // Cache miss: create new array and cache it
+                let key_array: PrimitiveArray<K> = repeat_n(
+                    if is_null {
+                        None
+                    } else {
+                        Some(K::default_value())
+                    },
+                    size,
+                )
+                .collect();
+
+                self.cache = Some((size, is_null, key_array.clone()));
+                key_array
+            }
+        }
+    }
+}
+
+/// Cache for null arrays to avoid repeated allocations
+/// when the same size is used frequently.
+#[derive(Debug, Default)]
+struct NullArrayCache {
+    cache: Option<(usize, ArrayRef)>, // (size, null_array)
+}
+
+impl NullArrayCache {
+    fn get_or_create(&mut self, size: usize) -> ArrayRef {
+        match &self.cache {
+            Some((cached_size, cached_array)) if *cached_size == size => {
+                // Cache hit: reuse existing array if same size
+                Arc::clone(cached_array)
+            }
+            _ => {
+                // Cache miss: create new array and cache it
+                let null_array = new_null_array(&DataType::Null, size);
+                self.cache = Some((size, Arc::clone(&null_array)));
+                null_array
+            }
+        }
+    }
+}
+
+/// Global cache for dictionary key arrays and null arrays
+#[derive(Debug, Default)]
+struct ArrayCaches {
+    cache_i8: KeyArrayCache<Int8Type>,
+    cache_i16: KeyArrayCache<Int16Type>,
+    cache_i32: KeyArrayCache<Int32Type>,
+    cache_i64: KeyArrayCache<Int64Type>,
+    cache_u8: KeyArrayCache<UInt8Type>,
+    cache_u16: KeyArrayCache<UInt16Type>,
+    cache_u32: KeyArrayCache<UInt32Type>,
+    cache_u64: KeyArrayCache<UInt64Type>,
+    null_cache: NullArrayCache,
+}
+
+static ARRAY_CACHES: LazyLock<Mutex<ArrayCaches>> =
+    LazyLock::new(|| Mutex::new(ArrayCaches::default()));
+
+/// Get the global cache for arrays
+fn get_array_caches() -> &'static Mutex<ArrayCaches> {
+    &ARRAY_CACHES
+}
+
+/// Get cached null array for the given size
+fn get_cached_null_array(size: usize) -> ArrayRef {
+    let cache = get_array_caches();
+    let mut caches = cache.lock().unwrap();
+    caches.null_cache.get_or_create(size)
+}
+
+/// Get cached key array for a specific key type
+fn get_cached_key_array<K: ArrowDictionaryKeyType>(

Review Comment:
   ```suggestion
   fn get_or_create_cached_key_array<K: ArrowDictionaryKeyType>(
   ```



##########
datafusion/common/src/scalar/mod.rs:
##########
@@ -854,6 +854,140 @@ pub fn get_dict_value<K: ArrowDictionaryKeyType>(
     Ok((dict_array.values(), dict_array.key(index)))
 }
 
+/// Cache for dictionary key arrays to avoid repeated allocations
+/// when the same size is used frequently.
+///
+/// Similar to PartitionColumnProjector's ZeroBufferGenerators, this cache
+/// stores key arrays for different dictionary key types. The cache is
+/// limited to 1 entry per type (the last size used).
+#[derive(Debug)]
+struct KeyArrayCache<K: ArrowDictionaryKeyType> {
+    cache: Option<(usize, bool, PrimitiveArray<K>)>, // (size, is_null, 
key_array)
+}
+
+impl<K: ArrowDictionaryKeyType> Default for KeyArrayCache<K> {
+    fn default() -> Self {
+        Self { cache: None }
+    }
+}
+
+impl<K: ArrowDictionaryKeyType> KeyArrayCache<K> {
+    fn get_or_create(&mut self, size: usize, is_null: bool) -> 
PrimitiveArray<K> {

Review Comment:
   nit: I would use `num_rows` here rather than `size`, as `size` might be 
confused with the key size (e.g. `Int16Type` could be read as having size 16).



##########
datafusion/common/src/scalar/mod.rs:
##########
@@ -854,6 +854,140 @@ pub fn get_dict_value<K: ArrowDictionaryKeyType>(
     Ok((dict_array.values(), dict_array.key(index)))
 }
 
+/// Cache for dictionary key arrays to avoid repeated allocations
+/// when the same size is used frequently.
+///
+/// Similar to PartitionColumnProjector's ZeroBufferGenerators, this cache
+/// stores key arrays for different dictionary key types. The cache is
+/// limited to 1 entry per type (the last size used).
+#[derive(Debug)]
+struct KeyArrayCache<K: ArrowDictionaryKeyType> {
+    cache: Option<(usize, bool, PrimitiveArray<K>)>, // (size, is_null, 
key_array)
+}
+
+impl<K: ArrowDictionaryKeyType> Default for KeyArrayCache<K> {
+    fn default() -> Self {
+        Self { cache: None }
+    }
+}
+
+impl<K: ArrowDictionaryKeyType> KeyArrayCache<K> {
+    fn get_or_create(&mut self, size: usize, is_null: bool) -> 
PrimitiveArray<K> {
+        match &self.cache {
+            Some((cached_size, cached_is_null, cached_array))
+                if *cached_size == size && *cached_is_null == is_null =>
+            {
+                // Cache hit: reuse existing array if same size and null status
+                cached_array.clone()
+            }
+            _ => {
+                // Cache miss: create new array and cache it
+                let key_array: PrimitiveArray<K> = repeat_n(
+                    if is_null {
+                        None
+                    } else {
+                        Some(K::default_value())
+                    },
+                    size,
+                )
+                .collect();
+
+                self.cache = Some((size, is_null, key_array.clone()));
+                key_array
+            }
+        }
+    }
+}
+
+/// Cache for null arrays to avoid repeated allocations
+/// when the same size is used frequently.
+#[derive(Debug, Default)]
+struct NullArrayCache {
+    cache: Option<(usize, ArrayRef)>, // (size, null_array)
+}
+
+impl NullArrayCache {
+    fn get_or_create(&mut self, size: usize) -> ArrayRef {
+        match &self.cache {
+            Some((cached_size, cached_array)) if *cached_size == size => {
+                // Cache hit: reuse existing array if same size
+                Arc::clone(cached_array)
+            }
+            _ => {
+                // Cache miss: create new array and cache it
+                let null_array = new_null_array(&DataType::Null, size);
+                self.cache = Some((size, Arc::clone(&null_array)));
+                null_array
+            }
+        }
+    }
+}
+
+/// Global cache for dictionary key arrays and null arrays
+#[derive(Debug, Default)]
+struct ArrayCaches {
+    cache_i8: KeyArrayCache<Int8Type>,
+    cache_i16: KeyArrayCache<Int16Type>,
+    cache_i32: KeyArrayCache<Int32Type>,
+    cache_i64: KeyArrayCache<Int64Type>,
+    cache_u8: KeyArrayCache<UInt8Type>,
+    cache_u16: KeyArrayCache<UInt16Type>,
+    cache_u32: KeyArrayCache<UInt32Type>,
+    cache_u64: KeyArrayCache<UInt64Type>,
+    null_cache: NullArrayCache,
+}
+
+static ARRAY_CACHES: LazyLock<Mutex<ArrayCaches>> =
+    LazyLock::new(|| Mutex::new(ArrayCaches::default()));
+
+/// Get the global cache for arrays
+fn get_array_caches() -> &'static Mutex<ArrayCaches> {
+    &ARRAY_CACHES
+}
+
+/// Get cached null array for the given size
+fn get_cached_null_array(size: usize) -> ArrayRef {

Review Comment:
   ```suggestion
   fn get_or_create_cached_null_array(size: usize) -> ArrayRef {
   ```



##########
datafusion/common/src/scalar/mod.rs:
##########
@@ -854,6 +854,140 @@ pub fn get_dict_value<K: ArrowDictionaryKeyType>(
     Ok((dict_array.values(), dict_array.key(index)))
 }
 
+/// Cache for dictionary key arrays to avoid repeated allocations
+/// when the same size is used frequently.
+///
+/// Similar to PartitionColumnProjector's ZeroBufferGenerators, this cache
+/// stores key arrays for different dictionary key types. The cache is
+/// limited to 1 entry per type (the last size used).
+#[derive(Debug)]
+struct KeyArrayCache<K: ArrowDictionaryKeyType> {
+    cache: Option<(usize, bool, PrimitiveArray<K>)>, // (size, is_null, 
key_array)

Review Comment:
   ```suggestion
       cache: Option<(usize, bool, PrimitiveArray<K>)>, // (num_rows, is_null, 
key_array)
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to