duongcongtoai commented on code in PR #17915: URL: https://github.com/apache/datafusion/pull/17915#discussion_r2404392851
########## datafusion/functions-aggregate-common/src/aggregate/array_agg.rs: ########## @@ -0,0 +1,224 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Dedicated implementation of `GroupsAccumulator` for `array_agg` + +use std::iter::repeat_n; +use std::sync::Arc; + +use arrow::array::{new_empty_array, Array, GenericListArray}; +use arrow::array::{ArrayRef, AsArray, BooleanArray}; +use arrow::buffer::OffsetBuffer; +use arrow::compute::kernels; +use arrow::datatypes::Field; +use datafusion_common::{internal_datafusion_err, Result}; +use datafusion_expr_common::groups_accumulator::{EmitTo, GroupsAccumulator}; + +#[derive(Default)] +pub struct AggGroupAccumulator { + // [1,2,3] [4,5,6] + stacked_batches: Vec<ArrayRef>, + // address items of each group within the stacked_batches + // this is maintained to perform kernel::interleave + stacked_group_indices: Vec<Vec<(usize, usize)>>, +} + +impl AggGroupAccumulator { + pub fn new() -> Self { + Self { + stacked_batches: vec![], + stacked_group_indices: vec![], + } + } + fn consume_stacked_batches( + &mut self, + emit_to: EmitTo, + ) -> Result<GenericListArray<i32>> { + let stacked_batches = self + .stacked_batches + .iter() + .map(|arr| arr.as_ref()) + 
.collect::<Vec<_>>(); + + let group_indices = emit_to.take_needed(&mut self.stacked_group_indices); + let lengths = group_indices.iter().map(|v| v.len()); + + let offsets_buffer = OffsetBuffer::from_lengths(lengths); + + // group indices like [1,1,1,2,2,2] + // backend_array like [a,b,c,d,e,f] + // offsets should be: [0,3,6] + // then result should be [a,b,c], [d,e,f] + + // backend_array is a flatten list of individual values before aggregation + let backend_array = kernels::interleave::interleave( + &stacked_batches, + group_indices + .into_iter() + .flatten() + .collect::<Vec<_>>() + .as_slice(), + )?; + let dt = backend_array.data_type(); + let field = Arc::new(Field::new_list_field(dt.clone(), true)); + + let arr = + GenericListArray::<i32>::new(field, offsets_buffer, backend_array, None); + Ok(arr) + } +} + +impl GroupsAccumulator for AggGroupAccumulator { + // given the stacked_batch as: + // - batch1 [1,4,5,6,7] + // - batch2 [5,1,1,1,1] + + // and group_indices as + // indices g1: [(0,0), (1,1), (1,2) ...] 
+ // indices g2: [] + // indices g3: [] + // indices g4: [(0,1)] + // each tuple represents (batch_index, and offset within the batch index) + // for example + // - (0,0) means the 0th item inside batch1, which is `1` + // - (1,1) means the 1th item inside batch2, which is `1` + fn update_batch( + &mut self, + values: &[ArrayRef], + group_indices: &[usize], + opt_filter: Option<&BooleanArray>, + total_num_groups: usize, + ) -> Result<()> { + if opt_filter.is_some() { + panic!("not implemented"); + } + + let singular_col = values + .first() + .ok_or(internal_datafusion_err!("invalid agg input"))?; + if self.stacked_group_indices.len() < total_num_groups { + self.stacked_group_indices + .resize(total_num_groups, Vec::new()); + } + + self.stacked_batches.push(Arc::clone(singular_col)); + let batch_index = self.stacked_batches.len() - 1; + + if let Some(filter) = opt_filter { + for (array_offset, (group_index, filter_value)) in + group_indices.iter().zip(filter.iter()).enumerate() + { + if let Some(true) = filter_value { + self.stacked_group_indices[*group_index] + .push((batch_index, array_offset)); + } + } + } else { + for (array_offset, group_index) in group_indices.iter().enumerate() { + self.stacked_group_indices[*group_index] + .push((batch_index, array_offset)); + } + } + + Ok(()) + } + + fn evaluate(&mut self, emit_to: EmitTo) -> Result<ArrayRef> { + let arr = self.consume_stacked_batches(emit_to)?; + Ok(Arc::new(arr) as ArrayRef) + } + + // filtered_null_mask(opt_filter, &values); + fn state(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>> { + Ok(vec![self.evaluate(emit_to)?]) + } + + fn merge_batch( + &mut self, + values: &[ArrayRef], + group_indices: &[usize], + opt_filter: Option<&BooleanArray>, + total_num_groups: usize, + ) -> Result<()> { + // TODO: all the reference to this function always result into this opt_filter as none Review Comment: Actually, for `merge_batch` I think `opt_filter` will always be `None`; see, for example, the implementations of 
median: https://github.com/apache/datafusion/blob/53728b30aa95be7865a0ad35e70ec574312a69bc/datafusion/functions-aggregate/src/median.rs#L332 and count: https://github.com/apache/datafusion/blob/53728b30aa95be7865a0ad35e70ec574312a69bc/datafusion/functions-aggregate/src/count.rs#L461 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
