(arrow-datafusion) branch main updated: Minor: Improve documentation for AggregateUDFImpl::accumulator and `AccumulatorArgs` (#9920)

alamb Fri, 05 Apr 2024 04:17:12 -0700

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git



The following commit(s) were added to refs/heads/main by this push:
     new 2dad90425b Minor: Improve documentation for 
AggregateUDFImpl::accumulator and `AccumulatorArgs`  (#9920)
2dad90425b is described below

commit 2dad90425bacb98a3c2a4214faad53850c93104e
Author: Andrew Lamb <[email protected]>
AuthorDate: Fri Apr 5 07:16:32 2024 -0400

    Minor: Improve documentation for AggregateUDFImpl::accumulator and 
`AccumulatorArgs`  (#9920)
    
    * Minor: Improve documentation for AggregateUDFImpl::accumulator and 
`AccumulatorArgs`
    
    * Add test and helper functions
    
    * Improve docs and examples
    
    * Fix CI
    
    * Remove checks for ORDER BY and IGNORE NULLS
---
 .../tests/user_defined/user_defined_aggregates.rs  |  3 ---
 datafusion/expr/src/function.rs                    | 29 ++++++++++++++++------
 datafusion/expr/src/udaf.rs                        | 12 +++++++--
 3 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/datafusion/core/tests/user_defined/user_defined_aggregates.rs 
b/datafusion/core/tests/user_defined/user_defined_aggregates.rs
index 6085fca876..8f02fb30b0 100644
--- a/datafusion/core/tests/user_defined/user_defined_aggregates.rs
+++ b/datafusion/core/tests/user_defined/user_defined_aggregates.rs
@@ -526,7 +526,6 @@ impl Accumulator for TimeSum {
         let arr = arr.as_primitive::<TimestampNanosecondType>();
 
         for v in arr.values().iter() {
-            println!("Adding {v}");
             self.sum += v;
         }
         Ok(())
@@ -538,7 +537,6 @@ impl Accumulator for TimeSum {
     }
 
     fn evaluate(&mut self) -> Result<ScalarValue> {
-        println!("Evaluating to {}", self.sum);
         Ok(ScalarValue::TimestampNanosecond(Some(self.sum), None))
     }
 
@@ -558,7 +556,6 @@ impl Accumulator for TimeSum {
         let arr = arr.as_primitive::<TimestampNanosecondType>();
 
         for v in arr.values().iter() {
-            println!("Retracting {v}");
             self.sum -= v;
         }
         Ok(())
diff --git a/datafusion/expr/src/function.rs b/datafusion/expr/src/function.rs
index 7598c805ad..7a92a50ae1 100644
--- a/datafusion/expr/src/function.rs
+++ b/datafusion/expr/src/function.rs
@@ -38,18 +38,33 @@ pub type ScalarFunctionImplementation =
 pub type ReturnTypeFunction =
     Arc<dyn Fn(&[DataType]) -> Result<Arc<DataType>> + Send + Sync>;
 
-/// Arguments passed to create an accumulator
+/// [`AccumulatorArgs`] contains information about how an aggregate
+/// function was called, including the types of its arguments and any optional
+/// ordering expressions.
 pub struct AccumulatorArgs<'a> {
-    // default arguments
-    /// the return type of the function
+    /// The return type of the aggregate function.
     pub data_type: &'a DataType,
-    /// the schema of the input arguments
+    /// The schema of the input arguments
     pub schema: &'a Schema,
-    /// whether to ignore nulls
+    /// Whether to ignore nulls.
+    ///
+    /// SQL allows the user to specify `IGNORE NULLS`, for example:
+    ///
+    /// ```sql
+    /// SELECT FIRST_VALUE(column1) IGNORE NULLS FROM t;
+    /// ```
     pub ignore_nulls: bool,
 
-    // ordering arguments
-    /// the expressions of `order by`, if no ordering is required, this will 
be an empty slice
+    /// The expressions in the `ORDER BY` clause passed to this aggregator.
+    ///
+    /// SQL allows the user to specify the ordering of arguments to the
+    /// aggregate using an `ORDER BY`. For example:
+    ///
+    /// ```sql
+    /// SELECT FIRST_VALUE(column1 ORDER BY column2) FROM t;
+    /// ```
+    ///
+    /// If no `ORDER BY` is specified, `sort_exprs` will be empty.
     pub sort_exprs: &'a [Expr],
 }
 
diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs
index 14e5195116..3cf1845aac 100644
--- a/datafusion/expr/src/udaf.rs
+++ b/datafusion/expr/src/udaf.rs
@@ -213,8 +213,8 @@ where
 /// See [`advanced_udaf.rs`] for a full example with complete implementation 
and
 /// [`AggregateUDF`] for other available options.
 ///
-///
 /// [`advanced_udaf.rs`]: 
https://github.com/apache/arrow-datafusion/blob/main/datafusion-examples/examples/advanced_udaf.rs
+///
 /// # Basic Example
 /// ```
 /// # use std::any::Any;
@@ -282,7 +282,8 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
     /// Return a new [`Accumulator`] that aggregates values for a specific
     /// group during query execution.
     ///
-    /// `acc_args`: the arguments to the accumulator. See [`AccumulatorArgs`] 
for more details.
+    /// `acc_args`: [`AccumulatorArgs`] contains information about how the
+    /// aggregate function was called.
     fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn 
Accumulator>>;
 
     /// Return the fields used to store the intermediate state of this 
accumulator.
@@ -325,6 +326,13 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
     /// If the aggregate expression has a specialized
     /// [`GroupsAccumulator`] implementation. If this returns true,
     /// `[Self::create_groups_accumulator]` will be called.
+    ///
+    /// # Notes
+    ///
+    /// Even if this function returns true, DataFusion will still use
+    /// `Self::accumulator` for certain queries, such as when this aggregate is
+    /// used as a window function or when there are no GROUP BY columns in the
+    /// query.
     fn groups_accumulator_supported(&self) -> bool {
         false
     }

(arrow-datafusion) branch main updated: Minor: Improve documentation for AggregateUDFImpl::accumulator and `AccumulatorArgs` (#9920)

Reply via email to