Re: [PR] Optimize Dictionary groupings [datafusion]

via GitHub Sun, 07 Jun 2026 23:23:12 -0700


kumarUjjawal commented on code in PR #21765:
URL: https://github.com/apache/datafusion/pull/21765#discussion_r3371046924



##########
datafusion/physical-plan/src/aggregates/group_values/single_group_by/dictionary.rs:
##########
@@ -0,0 +1,878 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::aggregates::group_values::GroupValues;
+use crate::hash_utils::RandomState;
+use arrow::array::{
+    Array, ArrayRef, AsArray, DictionaryArray, LargeStringArray, 
LargeStringBuilder,
+    ListArray, ListBuilder, PrimitiveArray, PrimitiveBuilder, StringArray, 
StringBuilder,
+    StringViewArray, StringViewBuilder,
+};
+use arrow::datatypes::{ArrowDictionaryKeyType, ArrowNativeType, DataType};
+use datafusion_common::DataFusionError::{Internal, NotImplemented};
+use datafusion_common::Result;
+use datafusion_common::hash_utils::create_hashes;
+use datafusion_expr::EmitTo;
+use hashbrown::HashTable;
+use hashbrown::hash_table::Entry as HashTableEntry;
+use std::borrow::Cow;
+use std::marker::PhantomData;
+use std::sync::Arc;
+
+/// Heuristic for sizing the values buffer of string builders during emit:
+/// dictionary-encoded values are short by design (categorical strings, short
+/// identifiers), so 16 B/item avoids the realloc-doubling chain in the common
+/// case while keeping over-allocation cheap when values are smaller.
+const AVG_BYTES_PER_DICT_VALUE: usize = 16;
+const INITIAL_PRE_ALLOCATION: usize = 8 * 1024; // avoid re-allocation`s for 
small-medium groups

Review Comment:
   The lookup table is always pre-sized to ~8,000 entries 
(`INITIAL_PRE_ALLOCATION` = 8 * 1024) on creation, so most instances, which 
hold only a handful of values, still pay for thousands of empty slots. We 
could start small and let the table grow, or size it from a cheap signal 
(such as the number of distinct values already present) rather than from a 
fixed large constant.



##########
datafusion/physical-plan/src/aggregates/group_values/mod.rs:
##########
@@ -196,6 +202,39 @@ pub fn new_group_values(
             DataType::Boolean => {
                 return Ok(Box::new(GroupValuesBoolean::new()));
             }
+            DataType::Dictionary(key_type, value_type)
+                if supported_single_dictionary_value(value_type) =>
+            {
+                return match key_type.as_ref() {
+                    DataType::Int8 => {
+                        
Ok(Box::new(GroupValuesDictionary::<Int8Type>::new(value_type)))
+                    }
+                    DataType::Int16 => Ok(Box::new(
+                        GroupValuesDictionary::<Int16Type>::new(value_type),
+                    )),
+                    DataType::Int32 => Ok(Box::new(
+                        GroupValuesDictionary::<Int32Type>::new(value_type),
+                    )),
+                    DataType::Int64 => Ok(Box::new(
+                        GroupValuesDictionary::<Int64Type>::new(value_type),
+                    )),
+                    DataType::UInt8 => Ok(Box::new(
+                        GroupValuesDictionary::<UInt8Type>::new(value_type),
+                    )),
+                    DataType::UInt16 => Ok(Box::new(
+                        GroupValuesDictionary::<UInt16Type>::new(value_type),
+                    )),
+                    DataType::UInt32 => Ok(Box::new(
+                        GroupValuesDictionary::<UInt32Type>::new(value_type),
+                    )),
+                    DataType::UInt64 => Ok(Box::new(
+                        GroupValuesDictionary::<UInt64Type>::new(value_type),
+                    )),
+                    _ => 
Err(datafusion_common::DataFusionError::NotImplemented(

Review Comment:
    If a dictionary's value type is supported but its key type somehow isn't, 
the code returns an error rather than handing the work to the generic path. 
Every real dictionary key type is covered, so this branch is unreachable 
today, but a graceful fallback to the generic path would be more robust than 
a hard error if Arrow ever adds a key type.



##########
datafusion/physical-plan/src/aggregates/group_values/single_group_by/dictionary.rs:
##########
@@ -0,0 +1,878 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::aggregates::group_values::GroupValues;
+use crate::hash_utils::RandomState;
+use arrow::array::{
+    Array, ArrayRef, AsArray, DictionaryArray, LargeStringArray, 
LargeStringBuilder,
+    ListArray, ListBuilder, PrimitiveArray, PrimitiveBuilder, StringArray, 
StringBuilder,
+    StringViewArray, StringViewBuilder,
+};
+use arrow::datatypes::{ArrowDictionaryKeyType, ArrowNativeType, DataType};
+use datafusion_common::DataFusionError::{Internal, NotImplemented};
+use datafusion_common::Result;
+use datafusion_common::hash_utils::create_hashes;
+use datafusion_expr::EmitTo;
+use hashbrown::HashTable;
+use hashbrown::hash_table::Entry as HashTableEntry;
+use std::borrow::Cow;
+use std::marker::PhantomData;
+use std::sync::Arc;
+
+/// Heuristic for sizing the values buffer of string builders during emit:
+/// dictionary-encoded values are short by design (categorical strings, short
+/// identifiers), so 16 B/item avoids the realloc-doubling chain in the common
+/// case while keeping over-allocation cheap when values are smaller.
+const AVG_BYTES_PER_DICT_VALUE: usize = 16;
+const INITIAL_PRE_ALLOCATION: usize = 8 * 1024; // avoid re-allocation`s for 
small-medium groups
+
+macro_rules! decode_list {
+    ($raw:expr, $builder:expr) => {{
+        let mut builder = $builder;
+        for raw_bytes in $raw {
+            match raw_bytes {
+                None => builder.append_null(),
+                Some(raw_vector) => {
+                    let mut offset = 0;
+                    while offset < raw_vector.len() {
+                        let len = i64::from_ne_bytes(
+                            raw_vector[offset..offset + 8]
+                                .try_into()
+                                .expect("slice of length 8"),
+                        );
+                        offset += 8;
+                        if len == -1 {
+                            builder.values().append_null();
+                        } else {
+                            let s = unsafe {
+                                std::str::from_utf8_unchecked(
+                                    &raw_vector[offset..offset + len as usize],
+                                )
+                            };
+                            builder.values().append_value(s);
+                            offset += len as usize;
+                        }
+                    }
+                    builder.append(true);
+                }
+            }
+        }
+        Ok(Arc::new(builder.finish()) as ArrayRef)
+    }};
+}
+macro_rules! decode_scalar_string {
+    ($raw:expr, $builder:expr) => {{
+        let mut builder = $builder;
+        for raw_bytes in $raw {
+            match raw_bytes {
+                Some(raw_vector) => {
+                    let s = unsafe { std::str::from_utf8_unchecked(raw_vector) 
};
+                    builder.append_value(s);
+                }
+                None => builder.append_null(),
+            }
+        }
+        Ok(Arc::new(builder.finish()) as ArrayRef)
+    }};
+}
+/// Entry stored inside `unique_dict_value_mapping`. Caching `hash` on the 
entry
+struct DictEntry {
+    hash: u64,
+    group_id: usize,
+    bytes: Vec<u8>,
+}
+
+pub struct GroupValuesDictionary<K: ArrowDictionaryKeyType + Send> {
+    // stores the order new unique elements are seen for self.emit()
+    seen_elements: Vec<Option<Vec<u8>>>,
+    value_dt: DataType,
+    _phantom: PhantomData<K>,
+    // keeps track of which values weve already seen, keyed by raw value hash.
+    unique_dict_value_mapping: HashTable<DictEntry>,
+
+    random_state: RandomState,
+
+    // cache the group id for nulls since they all map to the same group
+    null_group_id: Option<usize>,
+    // key to group vector scratch space, used to avoid re-allocating a new 
vector on each call to intern
+    key_to_group: Vec<Option<usize>>,
+    // 0. cache pointer of arrays, this avoids having to re-compute hashing 
for arrays weve already seen on past iterations
+    // 1. avoid re-allocating buffer inbetween calls, instead of allocating a 
new vector each time re-use inbetween calls
+    values_cache: (Option<ArrayRef>, Vec<u64>),
+}
+
+impl<K: ArrowDictionaryKeyType + Send> GroupValuesDictionary<K> {
+    pub fn new(data_type: &DataType) -> Self {
+        Self {
+            seen_elements: Vec::new(),
+            unique_dict_value_mapping: 
HashTable::with_capacity(INITIAL_PRE_ALLOCATION),
+            value_dt: data_type.clone(),
+            _phantom: PhantomData,
+            random_state: RandomState::with_seed(0),
+            null_group_id: None,
+            key_to_group: Vec::new(),
+            values_cache: (None, Vec::new()),
+        }
+    }
+
+    /// Returns the existing `group_id` for the value with this hash and bytes,
+    /// or inserts a new entry and returns the freshly assigned group_id.
+    fn lookup_or_insert_in_table(&mut self, hash: u64, raw: &[u8]) -> usize {
+        match self
+            .unique_dict_value_mapping
+            .entry(hash, |e| e.bytes == raw, |e| e.hash)
+        {
+            HashTableEntry::Occupied(o) => o.get().group_id,
+            HashTableEntry::Vacant(v) => {
+                let new_group_id = self.seen_elements.len();
+                self.seen_elements.push(Some(raw.to_vec())); // replace this 
with raw buffer to avoid this double copy #TODO see 
https://github.com/apache/datafusion/issues/22078
+                v.insert(DictEntry {
+                    hash,
+                    group_id: new_group_id,
+                    bytes: raw.to_vec(),
+                });
+                new_group_id
+            }
+        }
+    }
+    fn compute_value_hashes(&mut self, values: &ArrayRef) -> Result<()> {
+        self.values_cache.1.clear();
+        self.values_cache.1.resize(values.len(), 0);
+        create_hashes(
+            [Arc::clone(values)],
+            &self.random_state,
+            &mut self.values_cache.1,
+        )?;
+        Ok(())
+        //Ok(hashes)
+    }
+
+    fn get_raw_bytes(values: &ArrayRef, index: usize) -> Cow<'_, [u8]> {
+        match values.data_type() {
+            DataType::Utf8 => Cow::Borrowed(
+                values
+                    .as_any()
+                    .downcast_ref::<StringArray>()
+                    .expect("Expected StringArray")
+                    .value(index)
+                    .as_bytes(),
+            ),
+            DataType::LargeUtf8 => Cow::Borrowed(
+                values
+                    .as_any()
+                    .downcast_ref::<LargeStringArray>()
+                    .expect("Expected LargeStringArray")
+                    .value(index)
+                    .as_bytes(),
+            ),
+            DataType::Utf8View => Cow::Borrowed(
+                values
+                    .as_any()
+                    .downcast_ref::<StringViewArray>()
+                    .expect("Expected StringViewArray")
+                    .value(index)
+                    .as_bytes(),
+            ),
+            DataType::List(_) => {
+                let list_array = values
+                    .as_any()
+                    .downcast_ref::<ListArray>()
+                    .expect("Expected ListArray");
+
+                debug_assert!(!list_array.is_null(index));
+
+                let start = list_array.value_offsets()[index] as usize;
+                let end = list_array.value_offsets()[index + 1] as usize;
+                let child = list_array.values();
+
+                let mut bytes = Vec::new();
+                for i in start..end {
+                    if child.is_null(i) {
+                        // acts as a marker for transform_into_array to write 
a null
+                        bytes.extend_from_slice(&(-1i64).to_ne_bytes());
+                    } else {
+                        let raw = Self::get_raw_bytes(child, i);
+                        bytes.extend_from_slice(&(raw.len() as 
i64).to_ne_bytes());
+                        bytes.extend_from_slice(&raw);
+                    }
+                }
+                Cow::Owned(bytes)
+            }
+            other => unimplemented!("get_raw_bytes not implemented for 
{other:?}"),
+        }
+    }
+
+    #[inline]
+    fn get_null_group_id(&mut self) -> usize {
+        if let Some(group_id) = self.null_group_id {
+            group_id
+        } else {
+            let new_group_id = self.seen_elements.len();
+            self.seen_elements.push(None);
+            self.null_group_id = Some(new_group_id);
+            new_group_id
+        }
+    }
+    fn transform_into_array(&self, raw: &[Option<Vec<u8>>]) -> 
Result<ArrayRef> {
+        let item_capacity = raw.len();
+        let data_capacity = item_capacity * AVG_BYTES_PER_DICT_VALUE;
+        match &self.value_dt {
+            DataType::Utf8 => decode_scalar_string!(
+                raw,
+                StringBuilder::with_capacity(item_capacity, data_capacity)
+            ),
+            DataType::LargeUtf8 => decode_scalar_string!(
+                raw,
+                LargeStringBuilder::with_capacity(item_capacity, data_capacity)
+            ),
+            DataType::Utf8View => decode_scalar_string!(
+                raw,
+                StringViewBuilder::with_capacity(item_capacity)
+            ),
+            DataType::List(field) => match field.data_type() {
+                DataType::Utf8 => decode_list!(
+                    raw,
+                    ListBuilder::with_capacity(
+                        StringBuilder::with_capacity(item_capacity, 
data_capacity),
+                        item_capacity,
+                    )
+                    .with_field(Arc::clone(field))
+                ),
+                DataType::LargeUtf8 => decode_list!(
+                    raw,
+                    ListBuilder::with_capacity(
+                        LargeStringBuilder::with_capacity(item_capacity, 
data_capacity),
+                        item_capacity,
+                    )
+                    .with_field(Arc::clone(field))
+                ),
+                DataType::Utf8View => decode_list!(
+                    raw,
+                    ListBuilder::with_capacity(
+                        StringViewBuilder::with_capacity(item_capacity),
+                        item_capacity,
+                    )
+                    .with_field(Arc::clone(field))
+                ),
+                other => Err(NotImplemented(format!(
+                    "transform_into_array not implemented for List<{other:?}>"
+                ))),
+            },
+            other => Err(NotImplemented(format!(
+                "transform_into_array not implemented for {other:?}"
+            ))),
+        }
+    }
+}
+
+fn valid_bounds<K: ArrowDictionaryKeyType>(n: usize) -> bool {
+    let max: usize = match K::DATA_TYPE {
+        DataType::Int8 => i8::MAX as usize,
+        DataType::Int16 => i16::MAX as usize,
+        DataType::Int32 => i32::MAX as usize,
+        DataType::Int64 => i64::MAX as usize,
+        DataType::UInt8 => u8::MAX as usize,
+        DataType::UInt16 => u16::MAX as usize,
+        DataType::UInt32 => u32::MAX as usize,
+        DataType::UInt64 => usize::MAX,
+        _ => return false,
+    };
+    n <= max
+}
+
+impl<K: ArrowDictionaryKeyType + Send> GroupValues for 
GroupValuesDictionary<K> {
+    fn size(&self) -> usize {
+        let seen_elements_size = self.seen_elements.capacity()
+            * size_of::<Option<Vec<u8>>>()
+            + self
+                .seen_elements
+                .iter()
+                .filter_map(|opt| opt.as_ref())
+                .map(|inner| inner.capacity())
+                .sum::<usize>();
+
+        let unique_mapping_size = self.unique_dict_value_mapping.capacity()
+            * size_of::<DictEntry>()
+            + self
+                .unique_dict_value_mapping
+                .iter()
+                .map(|e| e.bytes.capacity())
+                .sum::<usize>();
+
+        let values_cache_size = self.values_cache.1.capacity() * 
size_of::<u64>()
+            + self
+                .values_cache
+                .0
+                .as_ref()
+                .map(|a| a.to_data().get_array_memory_size())
+                .unwrap_or(0);
+
+        size_of::<Self>() + seen_elements_size + unique_mapping_size + 
values_cache_size
+    }
+    fn len(&self) -> usize {
+        self.seen_elements.len()
+    }
+    fn is_empty(&self) -> bool {
+        self.seen_elements.is_empty()
+    }
+    fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> 
Result<()> {
+        assert_eq!(
+            cols.len(),
+            1,
+            "GroupValuesDictionary only supports a single column"
+        );
+        let array = Arc::clone(&cols[0]);
+        groups.clear(); // zero out buffer
+        let dict_array = array
+            .as_dictionary_opt()
+            .map_or_else(|| Err(Internal("Expected dictionary array".into())), 
Ok)?;
+
+        let values = dict_array.values();
+        let key_array: &PrimitiveArray<K> = dict_array.keys();
+        if key_array.is_empty() {
+            return Ok(());
+        }
+
+        let cache_hit = self
+            .values_cache
+            .0
+            .as_ref()
+            .map(|cached| Arc::ptr_eq(cached, values))
+            .unwrap_or(false);
+
+        if !cache_hit {
+            // values array changed since last batch - recompute hashes and 
update cached pointer
+            self.compute_value_hashes(values)?;
+            self.values_cache.0 = Some(Arc::clone(values));
+        }
+        // avoid re-allocating the key_to_group vector on each call to intern 
by re-using it as scratch space and only updating self.key_to_group at the end 
of the function.
+        let mut key_to_group = std::mem::take(&mut self.key_to_group);
+        key_to_group.clear();
+        key_to_group.resize(values.len(), None);
+
+        // iterate keys array (n iterations)
+        // only d insertions at most, repeated work is cached
+        for i in 0..key_array.len() {
+            let group_id = if key_array.is_null(i) {
+                self.get_null_group_id()
+            } else {
+                let key = key_array.value(i).to_usize().unwrap();
+                if let Some(group_id) = key_to_group[key] {
+                    group_id
+                } else if values.is_null(key) {
+                    let gid = self.get_null_group_id();
+                    key_to_group[key] = Some(gid);
+                    gid
+                } else {
+                    let hash = self.values_cache.1[key];
+                    let raw = Self::get_raw_bytes(values, key);
+                    let gid = self.lookup_or_insert_in_table(hash, 
raw.as_ref());
+                    key_to_group[key] = Some(gid);
+                    gid
+                }
+            };
+            groups.push(group_id);
+        }
+        self.key_to_group = key_to_group;
+        Ok(())
+    }
+    fn emit(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>> {
+        let (elements_to_emit, null_id) = match emit_to {
+            EmitTo::All => {
+                let original_null_id = self.null_group_id;
+                self.null_group_id = None;
+                self.unique_dict_value_mapping.clear();
+                (std::mem::take(&mut self.seen_elements), original_null_id)
+            }
+            EmitTo::First(n) => {
+                let n = n.min(self.seen_elements.len());
+                let first_n = 
self.seen_elements.drain(..n).collect::<Vec<_>>(); // again read from raw 
buffer directly
+                let original_null_id = self.null_group_id.filter(|&id| id < n);
+                // update null_group_id if the null group was in the first n
+                if let Some(null_id) = self.null_group_id {
+                    if null_id < n {
+                        self.null_group_id = None;
+                    } else {
+                        self.null_group_id = Some(null_id - n);
+                    }
+                }
+                // shift all remaining group indices down by n in the table
+                self.unique_dict_value_mapping.retain(|e| {
+                    if e.group_id < n {
+                        false
+                    } else {
+                        e.group_id -= n;
+                        true
+                    }
+                });
+                (first_n, original_null_id)
+            }
+        };
+
+        let n = elements_to_emit.len();
+        if !valid_bounds::<K>(n) {

Review Comment:
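   The guard is off by one: `n` is a count, so the largest index it will 
produce is `n - 1`, but `valid_bounds` checks `n <= max`. Compare the count 
against one more than the largest representable key instead (e.g. 
`n == 0 || n - 1 <= max`, which also avoids overflowing `max + 1`), i.e. 
allow the count whenever the largest index it will produce still fits the 
key type.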



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Optimize Dictionary groupings [datafusion]

Reply via email to