Re: [PR] feat: implement GroupArrayAggAccumulator attempt 3 [datafusion]

via GitHub Sat, 18 Oct 2025 01:47:26 -0700


duongcongtoai commented on code in PR #17915:
URL: https://github.com/apache/datafusion/pull/17915#discussion_r2404392851



##########
datafusion/functions-aggregate-common/src/aggregate/array_agg.rs:
##########
@@ -0,0 +1,224 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Dedicated implementation of `GroupsAccumulator` for `array_agg`
+
+use std::iter::repeat_n;
+use std::sync::Arc;
+
+use arrow::array::{new_empty_array, Array, GenericListArray};
+use arrow::array::{ArrayRef, AsArray, BooleanArray};
+use arrow::buffer::OffsetBuffer;
+use arrow::compute::kernels;
+use arrow::datatypes::Field;
+use datafusion_common::{internal_datafusion_err, Result};
+use datafusion_expr_common::groups_accumulator::{EmitTo, GroupsAccumulator};
+
+#[derive(Default)]
+pub struct AggGroupAccumulator {
+    // [1,2,3] [4,5,6]
+    stacked_batches: Vec<ArrayRef>,
+    // address items of each group within the stacked_batches
+    // this is maintained to perform kernel::interleave
+    stacked_group_indices: Vec<Vec<(usize, usize)>>,
+}
+
+impl AggGroupAccumulator {
+    pub fn new() -> Self {
+        Self {
+            stacked_batches: vec![],
+            stacked_group_indices: vec![],
+        }
+    }
+    fn consume_stacked_batches(
+        &mut self,
+        emit_to: EmitTo,
+    ) -> Result<GenericListArray<i32>> {
+        let stacked_batches = self
+            .stacked_batches
+            .iter()
+            .map(|arr| arr.as_ref())
+            .collect::<Vec<_>>();
+
+        let group_indices = emit_to.take_needed(&mut self.stacked_group_indices);
+        let lengths = group_indices.iter().map(|v| v.len());
+
+        let offsets_buffer = OffsetBuffer::from_lengths(lengths);
+
+        // group indices like [1,1,1,2,2,2]
+        // backend_array like [a,b,c,d,e,f]
+        // offsets should be: [0,3,6]
+        // then result should be [a,b,c], [d,e,f]
+
+        // backend_array is a flatten list of individual values before aggregation
+        let backend_array = kernels::interleave::interleave(
+            &stacked_batches,
+            group_indices
+                .into_iter()
+                .flatten()
+                .collect::<Vec<_>>()
+                .as_slice(),
+        )?;
+        let dt = backend_array.data_type();
+        let field = Arc::new(Field::new_list_field(dt.clone(), true));
+
+        let arr =
+            GenericListArray::<i32>::new(field, offsets_buffer, backend_array, None);
+        Ok(arr)
+    }
+}
+
+impl GroupsAccumulator for AggGroupAccumulator {
+    // given the stacked_batch as:
+    // - batch1 [1,4,5,6,7]
+    // - batch2 [5,1,1,1,1]
+
+    // and group_indices as
+    // indices g1: [(0,0), (1,1), (1,2) ...]
+    // indices g2: []
+    // indices g3: []
+    // indices g4: [(0,1)]
+    // each tuple represents (batch_index, and offset within the batch index)
+    // for example
+    // - (0,0) means the 0th item inside batch1, which is `1`
+    // - (1,1) means the 1th item inside batch2, which is `1`
+    fn update_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indices: &[usize],
+        opt_filter: Option<&BooleanArray>,
+        total_num_groups: usize,
+    ) -> Result<()> {
+        if opt_filter.is_some() {
+            panic!("not implemented");
+        }
+
+        let singular_col = values
+            .first()
+            .ok_or(internal_datafusion_err!("invalid agg input"))?;
+        if self.stacked_group_indices.len() < total_num_groups {
+            self.stacked_group_indices
+                .resize(total_num_groups, Vec::new());
+        }
+
+        self.stacked_batches.push(Arc::clone(singular_col));
+        let batch_index = self.stacked_batches.len() - 1;
+
+        if let Some(filter) = opt_filter {
+            for (array_offset, (group_index, filter_value)) in
+                group_indices.iter().zip(filter.iter()).enumerate()
+            {
+                if let Some(true) = filter_value {
+                    self.stacked_group_indices[*group_index]
+                        .push((batch_index, array_offset));
+                }
+            }
+        } else {
+            for (array_offset, group_index) in group_indices.iter().enumerate() {
+                self.stacked_group_indices[*group_index]
+                    .push((batch_index, array_offset));
+            }
+        }
+
+        Ok(())
+    }
+
+    fn evaluate(&mut self, emit_to: EmitTo) -> Result<ArrayRef> {
+        let arr = self.consume_stacked_batches(emit_to)?;
+        Ok(Arc::new(arr) as ArrayRef)
+    }
+
+    // filtered_null_mask(opt_filter, &values);
+    fn state(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>> {
+        Ok(vec![self.evaluate(emit_to)?])
+    }
+
+    fn merge_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indices: &[usize],
+        opt_filter: Option<&BooleanArray>,
+        total_num_groups: usize,
+    ) -> Result<()> {
+        // TODO: all the reference to this function always result into this opt_filter as none

Review Comment:
   Actually, for `merge_batch` I think `opt_filter` will always be `None`. See,
   for example, the existing implementations of:
   median: https://github.com/apache/datafusion/blob/53728b30aa95be7865a0ad35e70ec574312a69bc/datafusion/functions-aggregate/src/median.rs#L332
   count: https://github.com/apache/datafusion/blob/53728b30aa95be7865a0ad35e70ec574312a69bc/datafusion/functions-aggregate/src/count.rs#L461
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] feat: implement GroupArrayAggAccumulator attempt 3 [datafusion]

Reply via email to