alamb commented on code in PR #7716: URL: https://github.com/apache/arrow-rs/pull/7716#discussion_r2197774129
########## arrow-select/src/dictionary.rs: ########## @@ -23,10 +28,63 @@ use arrow_array::types::{ LargeUtf8Type, Utf8Type, }; use arrow_array::{cast::AsArray, downcast_primitive}; -use arrow_array::{Array, ArrayRef, DictionaryArray, GenericByteArray, PrimitiveArray}; +use arrow_array::{ + downcast_dictionary_array, AnyDictionaryArray, Array, ArrayRef, ArrowNativeTypeOp, + BooleanArray, DictionaryArray, GenericByteArray, PrimitiveArray, +}; use arrow_buffer::{ArrowNativeType, BooleanBuffer, ScalarBuffer, ToByteSlice}; use arrow_schema::{ArrowError, DataType}; +/// Garbage collects a [DictionaryArray] by removing unreferenced values. Review Comment: ```suggestion /// Garbage collects a [DictionaryArray] by removing unreferenced values. /// /// Returns a new [DictionaryArray] such that there are no values /// that are not referenced by at least one key. There may still be duplicate /// values. /// /// See also [`garbage_collect_any_dictionary`] if you need to handle multiple dictionary types ``` ########## arrow-select/src/dictionary.rs: ########## @@ -78,7 +136,7 @@ impl<'a, V> Interner<'a, V> { } } -pub struct MergedDictionaries<K: ArrowDictionaryKeyType> { +pub(crate) struct MergedDictionaries<K: ArrowDictionaryKeyType> { Review Comment: Is this change needed? I didn't see `MergedDictionaries` used anywhere else ########## arrow-select/src/dictionary.rs: ########## @@ -23,10 +28,63 @@ use arrow_array::types::{ LargeUtf8Type, Utf8Type, }; use arrow_array::{cast::AsArray, downcast_primitive}; -use arrow_array::{Array, ArrayRef, DictionaryArray, GenericByteArray, PrimitiveArray}; +use arrow_array::{ + downcast_dictionary_array, AnyDictionaryArray, Array, ArrayRef, ArrowNativeTypeOp, + BooleanArray, DictionaryArray, GenericByteArray, PrimitiveArray, +}; use arrow_buffer::{ArrowNativeType, BooleanBuffer, ScalarBuffer, ToByteSlice}; use arrow_schema::{ArrowError, DataType}; +/// Garbage collects a [DictionaryArray] by removing unreferenced values. +pub fn garbage_collect_dictionary<K: ArrowDictionaryKeyType>( + dictionary: &DictionaryArray<K>, +) -> Result<DictionaryArray<K>, ArrowError> { + let keys = dictionary.keys(); + let values = dictionary.values(); + + let mask = dictionary.occupancy(); + + // If no work to do, return the original dictionary + if mask.count_set_bits() == values.len() { + return Ok(dictionary.clone()); + } + + // Create a mapping from the old keys to the new keys, use a Vec for easy indexing + let mut key_remap = vec![K::Native::ZERO; values.len()]; + for (new_idx, old_idx) in mask.set_indices().enumerate() { + key_remap[old_idx] = K::Native::from_usize(new_idx) + .expect("new index should fit in K::Native, as old index was in range"); + } + + // ... and then build the new keys array + let new_keys = keys.unary(|key| { + key_remap + .get(key.as_usize()) + .copied() + // nulls may be present in the keys, and they will have arbitrary value; we don't care + // and can safely return zero + .unwrap_or(K::Native::ZERO) + }); + + // Create a new values array by filtering using the mask + let values = filter(dictionary.values(), &BooleanArray::new(mask, None))?; + + Ok(DictionaryArray::new(new_keys, values)) +} + +/// Equivalent to [`garbage_collect_dictionary`] but without requiring casting to a specific key type. +pub fn garbage_collect_any_dictionary( + dictionary: &dyn AnyDictionaryArray, +) -> Result<ArrayRef, ArrowError> { + // FIXME: this is a workaround for MSRV Rust versions below 1.86 where trait upcasting is not stable. + // From 1.86 onward, `&dyn AnyDictionaryArray` can be directly passed to `downcast_dictionary_array!`. + let dictionary = &*dictionary.slice(0, dictionary.len()); Review Comment: It seems fine to me ########## arrow-select/src/dictionary.rs: ########## @@ -110,7 +168,7 @@ type PtrEq = fn(&dyn Array, &dyn Array) -> bool; /// some return over the naive approach used by MutableArrayData /// /// `len` is the total length of the merged output -pub fn should_merge_dictionary_values<K: ArrowDictionaryKeyType>( +pub(crate) fn should_merge_dictionary_values<K: ArrowDictionaryKeyType>( Review Comment: Iis this change still needed? I didn't see `should_merge_dictionary_values` used anyhwere in this PR -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org