This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 2dad90425b Minor: Improve documentation for
AggregateUDFImpl::accumulator and `AccumulatorArgs` (#9920)
2dad90425b is described below
commit 2dad90425bacb98a3c2a4214faad53850c93104e
Author: Andrew Lamb <[email protected]>
AuthorDate: Fri Apr 5 07:16:32 2024 -0400
Minor: Improve documentation for AggregateUDFImpl::accumulator and
`AccumulatorArgs` (#9920)
* Minor: Improve documentation for AggregateUDFImpl::accumulator and
`AccumulatorArgs`
* Add test and helper functions
* Improve docs and examples
* Fix CI
* Remove checks for ORDER BY and IGNORE NULLS
---
.../tests/user_defined/user_defined_aggregates.rs | 3 ---
datafusion/expr/src/function.rs | 29 ++++++++++++++++------
datafusion/expr/src/udaf.rs | 12 +++++++--
3 files changed, 32 insertions(+), 12 deletions(-)
diff --git a/datafusion/core/tests/user_defined/user_defined_aggregates.rs
b/datafusion/core/tests/user_defined/user_defined_aggregates.rs
index 6085fca876..8f02fb30b0 100644
--- a/datafusion/core/tests/user_defined/user_defined_aggregates.rs
+++ b/datafusion/core/tests/user_defined/user_defined_aggregates.rs
@@ -526,7 +526,6 @@ impl Accumulator for TimeSum {
let arr = arr.as_primitive::<TimestampNanosecondType>();
for v in arr.values().iter() {
- println!("Adding {v}");
self.sum += v;
}
Ok(())
@@ -538,7 +537,6 @@ impl Accumulator for TimeSum {
}
fn evaluate(&mut self) -> Result<ScalarValue> {
- println!("Evaluating to {}", self.sum);
Ok(ScalarValue::TimestampNanosecond(Some(self.sum), None))
}
@@ -558,7 +556,6 @@ impl Accumulator for TimeSum {
let arr = arr.as_primitive::<TimestampNanosecondType>();
for v in arr.values().iter() {
- println!("Retracting {v}");
self.sum -= v;
}
Ok(())
diff --git a/datafusion/expr/src/function.rs b/datafusion/expr/src/function.rs
index 7598c805ad..7a92a50ae1 100644
--- a/datafusion/expr/src/function.rs
+++ b/datafusion/expr/src/function.rs
@@ -38,18 +38,33 @@ pub type ScalarFunctionImplementation =
pub type ReturnTypeFunction =
Arc<dyn Fn(&[DataType]) -> Result<Arc<DataType>> + Send + Sync>;
-/// Arguments passed to create an accumulator
+/// [`AccumulatorArgs`] contains information about how an aggregate
+/// function was called, including the types of its arguments and any optional
+/// ordering expressions.
pub struct AccumulatorArgs<'a> {
- // default arguments
- /// the return type of the function
+ /// The return type of the aggregate function.
pub data_type: &'a DataType,
- /// the schema of the input arguments
+ /// The schema of the input arguments
pub schema: &'a Schema,
- /// whether to ignore nulls
+ /// Whether to ignore nulls.
+ ///
+ /// SQL allows the user to specify `IGNORE NULLS`, for example:
+ ///
+ /// ```sql
+ /// SELECT FIRST_VALUE(column1) IGNORE NULLS FROM t;
+ /// ```
pub ignore_nulls: bool,
- // ordering arguments
- /// the expressions of `order by`, if no ordering is required, this will
be an empty slice
+ /// The expressions in the `ORDER BY` clause passed to this aggregator.
+ ///
+ /// SQL allows the user to specify the ordering of arguments to the
+ /// aggregate using an `ORDER BY`. For example:
+ ///
+ /// ```sql
+ /// SELECT FIRST_VALUE(column1 ORDER BY column2) FROM t;
+ /// ```
+ ///
+ /// If no `ORDER BY` is specified, `sort_exprs`` will be empty.
pub sort_exprs: &'a [Expr],
}
diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs
index 14e5195116..3cf1845aac 100644
--- a/datafusion/expr/src/udaf.rs
+++ b/datafusion/expr/src/udaf.rs
@@ -213,8 +213,8 @@ where
/// See [`advanced_udaf.rs`] for a full example with complete implementation
and
/// [`AggregateUDF`] for other available options.
///
-///
/// [`advanced_udaf.rs`]:
https://github.com/apache/arrow-datafusion/blob/main/datafusion-examples/examples/advanced_udaf.rs
+///
/// # Basic Example
/// ```
/// # use std::any::Any;
@@ -282,7 +282,8 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
/// Return a new [`Accumulator`] that aggregates values for a specific
/// group during query execution.
///
- /// `acc_args`: the arguments to the accumulator. See [`AccumulatorArgs`]
for more details.
+ /// acc_args: [`AccumulatorArgs`] contains information about how the
+ /// aggregate function was called.
fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn
Accumulator>>;
/// Return the fields used to store the intermediate state of this
accumulator.
@@ -325,6 +326,13 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
/// If the aggregate expression has a specialized
/// [`GroupsAccumulator`] implementation. If this returns true,
/// `[Self::create_groups_accumulator]` will be called.
+ ///
+ /// # Notes
+ ///
+ /// Even if this function returns true, DataFusion will still use
+ /// `Self::accumulator` for certain queries, such as when this aggregate is
+ /// used as a window function or when there no GROUP BY columns in the
+ /// query.
fn groups_accumulator_supported(&self) -> bool {
false
}