alamb commented on code in PR #7716:
URL: https://github.com/apache/arrow-rs/pull/7716#discussion_r2197774129


##########
arrow-select/src/dictionary.rs:
##########
@@ -23,10 +28,63 @@ use arrow_array::types::{
     LargeUtf8Type, Utf8Type,
 };
 use arrow_array::{cast::AsArray, downcast_primitive};
-use arrow_array::{Array, ArrayRef, DictionaryArray, GenericByteArray, 
PrimitiveArray};
+use arrow_array::{
+    downcast_dictionary_array, AnyDictionaryArray, Array, ArrayRef, 
ArrowNativeTypeOp,
+    BooleanArray, DictionaryArray, GenericByteArray, PrimitiveArray,
+};
 use arrow_buffer::{ArrowNativeType, BooleanBuffer, ScalarBuffer, ToByteSlice};
 use arrow_schema::{ArrowError, DataType};
 
+/// Garbage collects a [DictionaryArray] by removing unreferenced values.

Review Comment:
   ```suggestion
   /// Garbage collects a [DictionaryArray] by removing unreferenced values.
   ///
   /// Returns a new [DictionaryArray] such that there are no values 
   /// that are not referenced by at least one key. There may still be 
duplicate 
   /// values. 
   ///
   /// See also [`garbage_collect_any_dictionary`] if you need to handle 
multiple dictionary types
   ```



##########
arrow-select/src/dictionary.rs:
##########
@@ -78,7 +136,7 @@ impl<'a, V> Interner<'a, V> {
     }
 }
 
-pub struct MergedDictionaries<K: ArrowDictionaryKeyType> {
+pub(crate) struct MergedDictionaries<K: ArrowDictionaryKeyType> {

Review Comment:
   Is this change needed? I didn't see `MergedDictionaries` used  anywhere else



##########
arrow-select/src/dictionary.rs:
##########
@@ -23,10 +28,63 @@ use arrow_array::types::{
     LargeUtf8Type, Utf8Type,
 };
 use arrow_array::{cast::AsArray, downcast_primitive};
-use arrow_array::{Array, ArrayRef, DictionaryArray, GenericByteArray, 
PrimitiveArray};
+use arrow_array::{
+    downcast_dictionary_array, AnyDictionaryArray, Array, ArrayRef, 
ArrowNativeTypeOp,
+    BooleanArray, DictionaryArray, GenericByteArray, PrimitiveArray,
+};
 use arrow_buffer::{ArrowNativeType, BooleanBuffer, ScalarBuffer, ToByteSlice};
 use arrow_schema::{ArrowError, DataType};
 
+/// Garbage collects a [DictionaryArray] by removing unreferenced values.
+pub fn garbage_collect_dictionary<K: ArrowDictionaryKeyType>(
+    dictionary: &DictionaryArray<K>,
+) -> Result<DictionaryArray<K>, ArrowError> {
+    let keys = dictionary.keys();
+    let values = dictionary.values();
+
+    let mask = dictionary.occupancy();
+
+    // If no work to do, return the original dictionary
+    if mask.count_set_bits() == values.len() {
+        return Ok(dictionary.clone());
+    }
+
+    // Create a mapping from the old keys to the new keys, use a Vec for easy 
indexing
+    let mut key_remap = vec![K::Native::ZERO; values.len()];
+    for (new_idx, old_idx) in mask.set_indices().enumerate() {
+        key_remap[old_idx] = K::Native::from_usize(new_idx)
+            .expect("new index should fit in K::Native, as old index was in 
range");
+    }
+
+    // ... and then build the new keys array
+    let new_keys = keys.unary(|key| {
+        key_remap
+            .get(key.as_usize())
+            .copied()
+            // nulls may be present in the keys, and they will have arbitrary 
value; we don't care
+            // and can safely return zero
+            .unwrap_or(K::Native::ZERO)
+    });
+
+    // Create a new values array by filtering using the mask
+    let values = filter(dictionary.values(), &BooleanArray::new(mask, None))?;
+
+    Ok(DictionaryArray::new(new_keys, values))
+}
+
+/// Equivalent to [`garbage_collect_dictionary`] but without requiring casting 
to a specific key type.
+pub fn garbage_collect_any_dictionary(
+    dictionary: &dyn AnyDictionaryArray,
+) -> Result<ArrayRef, ArrowError> {
+    // FIXME: this is a workaround for MSRV Rust versions below 1.86 where 
trait upcasting is not stable.
+    // From 1.86 onward, `&dyn AnyDictionaryArray` can be directly passed to 
`downcast_dictionary_array!`.
+    let dictionary = &*dictionary.slice(0, dictionary.len());

Review Comment:
   It seems fine to me



##########
arrow-select/src/dictionary.rs:
##########
@@ -110,7 +168,7 @@ type PtrEq = fn(&dyn Array, &dyn Array) -> bool;
 /// some return over the naive approach used by MutableArrayData
 ///
 /// `len` is the total length of the merged output
-pub fn should_merge_dictionary_values<K: ArrowDictionaryKeyType>(
+pub(crate) fn should_merge_dictionary_values<K: ArrowDictionaryKeyType>(

Review Comment:
   Iis this change still needed? I didn't see `should_merge_dictionary_values` 
used anyhwere in this PR



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to