alamb commented on code in PR #16789: URL: https://github.com/apache/datafusion/pull/16789#discussion_r2208548427
########## datafusion/common/src/scalar/mod.rs: ########## @@ -854,6 +854,140 @@ pub fn get_dict_value<K: ArrowDictionaryKeyType>( Ok((dict_array.values(), dict_array.key(index))) } +/// Cache for dictionary key arrays to avoid repeated allocations Review Comment: I think it is important to also point out that the cached keys are always zero SO like "cache for dictionary keys for single valued dictionary arrays" or something ########## datafusion/common/src/scalar/mod.rs: ########## @@ -854,6 +854,140 @@ pub fn get_dict_value<K: ArrowDictionaryKeyType>( Ok((dict_array.values(), dict_array.key(index))) } +/// Cache for dictionary key arrays to avoid repeated allocations +/// when the same size is used frequently. +/// +/// Similar to PartitionColumnProjector's ZeroBufferGenerators, this cache +/// stores key arrays for different dictionary key types. The cache is +/// limited to 1 entry per type (the last size used). +#[derive(Debug)] +struct KeyArrayCache<K: ArrowDictionaryKeyType> { + cache: Option<(usize, bool, PrimitiveArray<K>)>, // (size, is_null, key_array) +} + +impl<K: ArrowDictionaryKeyType> Default for KeyArrayCache<K> { + fn default() -> Self { + Self { cache: None } + } +} + +impl<K: ArrowDictionaryKeyType> KeyArrayCache<K> { + fn get_or_create(&mut self, size: usize, is_null: bool) -> PrimitiveArray<K> { + match &self.cache { + Some((cached_size, cached_is_null, cached_array)) + if *cached_size == size && *cached_is_null == is_null => + { + // Cache hit: reuse existing array if same size and null status + cached_array.clone() + } + _ => { + // Cache miss: create new array and cache it + let key_array: PrimitiveArray<K> = repeat_n( + if is_null { + None + } else { + Some(K::default_value()) + }, + size, + ) + .collect(); + + self.cache = Some((size, is_null, key_array.clone())); + key_array + } + } + } +} + +/// Cache for null arrays to avoid repeated allocations Review Comment: ```suggestion /// Cache for null arrays to avoid repeated allocations /// of all zeros ``` ########## datafusion/common/src/scalar/mod.rs: ########## @@ -854,6 +854,140 @@ pub fn get_dict_value<K: ArrowDictionaryKeyType>( Ok((dict_array.values(), dict_array.key(index))) } +/// Cache for dictionary key arrays to avoid repeated allocations +/// when the same size is used frequently. +/// +/// Similar to PartitionColumnProjector's ZeroBufferGenerators, this cache +/// stores key arrays for different dictionary key types. The cache is Review Comment: ```suggestion /// stores key arrays with all `0` values for different dictionary key types. The cache is ``` ########## datafusion/common/src/scalar/mod.rs: ########## @@ -854,6 +854,140 @@ pub fn get_dict_value<K: ArrowDictionaryKeyType>( Ok((dict_array.values(), dict_array.key(index))) } +/// Cache for dictionary key arrays to avoid repeated allocations +/// when the same size is used frequently. +/// +/// Similar to PartitionColumnProjector's ZeroBufferGenerators, this cache +/// stores key arrays for different dictionary key types. The cache is +/// limited to 1 entry per type (the last size used). +#[derive(Debug)] +struct KeyArrayCache<K: ArrowDictionaryKeyType> { + cache: Option<(usize, bool, PrimitiveArray<K>)>, // (size, is_null, key_array) +} + +impl<K: ArrowDictionaryKeyType> Default for KeyArrayCache<K> { + fn default() -> Self { + Self { cache: None } + } +} + +impl<K: ArrowDictionaryKeyType> KeyArrayCache<K> { + fn get_or_create(&mut self, size: usize, is_null: bool) -> PrimitiveArray<K> { + match &self.cache { + Some((cached_size, cached_is_null, cached_array)) + if *cached_size == size && *cached_is_null == is_null => + { + // Cache hit: reuse existing array if same size and null status + cached_array.clone() + } + _ => { + // Cache miss: create new array and cache it + let key_array: PrimitiveArray<K> = repeat_n( + if is_null { + None + } else { + Some(K::default_value()) + }, + size, + ) + .collect(); + + self.cache = Some((size, is_null, key_array.clone())); + key_array + } + } + } +} + +/// Cache for null arrays to avoid repeated allocations +/// when the same size is used frequently. +#[derive(Debug, Default)] +struct NullArrayCache { + cache: Option<(usize, ArrayRef)>, // (size, null_array) +} + +impl NullArrayCache { + fn get_or_create(&mut self, size: usize) -> ArrayRef { + match &self.cache { + Some((cached_size, cached_array)) if *cached_size == size => { + // Cache hit: reuse existing array if same size + Arc::clone(cached_array) + } + _ => { + // Cache miss: create new array and cache it + let null_array = new_null_array(&DataType::Null, size); + self.cache = Some((size, Arc::clone(&null_array))); + null_array + } + } + } +} + +/// Global cache for dictionary key arrays and null arrays +#[derive(Debug, Default)] +struct ArrayCaches { + cache_i8: KeyArrayCache<Int8Type>, + cache_i16: KeyArrayCache<Int16Type>, + cache_i32: KeyArrayCache<Int32Type>, + cache_i64: KeyArrayCache<Int64Type>, + cache_u8: KeyArrayCache<UInt8Type>, + cache_u16: KeyArrayCache<UInt16Type>, + cache_u32: KeyArrayCache<UInt32Type>, + cache_u64: KeyArrayCache<UInt64Type>, + null_cache: NullArrayCache, +} + +static ARRAY_CACHES: LazyLock<Mutex<ArrayCaches>> = + LazyLock::new(|| Mutex::new(ArrayCaches::default())); + +/// Get the global cache for arrays +fn get_array_caches() -> &'static Mutex<ArrayCaches> { + &ARRAY_CACHES +} + +/// Get cached null array for the given size +fn get_cached_null_array(size: usize) -> ArrayRef { + let cache = get_array_caches(); + let mut caches = cache.lock().unwrap(); + caches.null_cache.get_or_create(size) +} + +/// Get cached key array for a specific key type +fn get_cached_key_array<K: ArrowDictionaryKeyType>( Review Comment: ```suggestion fn get_or_create_cached_key_array<K: ArrowDictionaryKeyType>( ``` ########## datafusion/common/src/scalar/mod.rs: ########## @@ -854,6 +854,140 @@ pub fn get_dict_value<K: ArrowDictionaryKeyType>( Ok((dict_array.values(), dict_array.key(index))) } +/// Cache for dictionary key arrays to avoid repeated allocations +/// when the same size is used frequently. +/// +/// Similar to PartitionColumnProjector's ZeroBufferGenerators, this cache +/// stores key arrays for different dictionary key types. The cache is +/// limited to 1 entry per type (the last size used). +#[derive(Debug)] +struct KeyArrayCache<K: ArrowDictionaryKeyType> { + cache: Option<(usize, bool, PrimitiveArray<K>)>, // (size, is_null, key_array) +} + +impl<K: ArrowDictionaryKeyType> Default for KeyArrayCache<K> { + fn default() -> Self { + Self { cache: None } + } +} + +impl<K: ArrowDictionaryKeyType> KeyArrayCache<K> { + fn get_or_create(&mut self, size: usize, is_null: bool) -> PrimitiveArray<K> { Review Comment: nit is I would use `num_rows` here rather than size as the size might be confused with the key size (eg `Int16Type` would have size 16??) ########## datafusion/common/src/scalar/mod.rs: ########## @@ -854,6 +854,140 @@ pub fn get_dict_value<K: ArrowDictionaryKeyType>( Ok((dict_array.values(), dict_array.key(index))) } +/// Cache for dictionary key arrays to avoid repeated allocations +/// when the same size is used frequently. +/// +/// Similar to PartitionColumnProjector's ZeroBufferGenerators, this cache +/// stores key arrays for different dictionary key types. The cache is +/// limited to 1 entry per type (the last size used). +#[derive(Debug)] +struct KeyArrayCache<K: ArrowDictionaryKeyType> { + cache: Option<(usize, bool, PrimitiveArray<K>)>, // (size, is_null, key_array) +} + +impl<K: ArrowDictionaryKeyType> Default for KeyArrayCache<K> { + fn default() -> Self { + Self { cache: None } + } +} + +impl<K: ArrowDictionaryKeyType> KeyArrayCache<K> { + fn get_or_create(&mut self, size: usize, is_null: bool) -> PrimitiveArray<K> { + match &self.cache { + Some((cached_size, cached_is_null, cached_array)) + if *cached_size == size && *cached_is_null == is_null => + { + // Cache hit: reuse existing array if same size and null status + cached_array.clone() + } + _ => { + // Cache miss: create new array and cache it + let key_array: PrimitiveArray<K> = repeat_n( + if is_null { + None + } else { + Some(K::default_value()) + }, + size, + ) + .collect(); + + self.cache = Some((size, is_null, key_array.clone())); + key_array + } + } + } +} + +/// Cache for null arrays to avoid repeated allocations +/// when the same size is used frequently. +#[derive(Debug, Default)] +struct NullArrayCache { + cache: Option<(usize, ArrayRef)>, // (size, null_array) +} + +impl NullArrayCache { + fn get_or_create(&mut self, size: usize) -> ArrayRef { + match &self.cache { + Some((cached_size, cached_array)) if *cached_size == size => { + // Cache hit: reuse existing array if same size + Arc::clone(cached_array) + } + _ => { + // Cache miss: create new array and cache it + let null_array = new_null_array(&DataType::Null, size); + self.cache = Some((size, Arc::clone(&null_array))); + null_array + } + } + } +} + +/// Global cache for dictionary key arrays and null arrays +#[derive(Debug, Default)] +struct ArrayCaches { + cache_i8: KeyArrayCache<Int8Type>, + cache_i16: KeyArrayCache<Int16Type>, + cache_i32: KeyArrayCache<Int32Type>, + cache_i64: KeyArrayCache<Int64Type>, + cache_u8: KeyArrayCache<UInt8Type>, + cache_u16: KeyArrayCache<UInt16Type>, + cache_u32: KeyArrayCache<UInt32Type>, + cache_u64: KeyArrayCache<UInt64Type>, + null_cache: NullArrayCache, +} + +static ARRAY_CACHES: LazyLock<Mutex<ArrayCaches>> = + LazyLock::new(|| Mutex::new(ArrayCaches::default())); + +/// Get the global cache for arrays +fn get_array_caches() -> &'static Mutex<ArrayCaches> { + &ARRAY_CACHES +} + +/// Get cached null array for the given size +fn get_cached_null_array(size: usize) -> ArrayRef { Review Comment: ```suggestion fn get_or_create_cached_null_array(size: usize) -> ArrayRef { ``` ########## datafusion/common/src/scalar/mod.rs: ########## @@ -854,6 +854,140 @@ pub fn get_dict_value<K: ArrowDictionaryKeyType>( Ok((dict_array.values(), dict_array.key(index))) } +/// Cache for dictionary key arrays to avoid repeated allocations +/// when the same size is used frequently. +/// +/// Similar to PartitionColumnProjector's ZeroBufferGenerators, this cache +/// stores key arrays for different dictionary key types. The cache is +/// limited to 1 entry per type (the last size used). +#[derive(Debug)] +struct KeyArrayCache<K: ArrowDictionaryKeyType> { + cache: Option<(usize, bool, PrimitiveArray<K>)>, // (size, is_null, key_array) Review Comment: ```suggestion cache: Option<(usize, bool, PrimitiveArray<K>)>, // (num_rows, is_null, key_array) ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org