(spark) branch master updated: [SPARK-48842][DOCS] Document non-determinism of max_by and min_by

ruifengz Thu, 11 Jul 2024 21:42:09 -0700

This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 5bbe9c850aaa [SPARK-48842][DOCS] Document non-determinism of max_by and min_by
5bbe9c850aaa is described below

commit 5bbe9c850aaaf31327b81d893ed513033a129e08
Author: Ruifeng Zheng <ruife...@apache.org>
AuthorDate: Fri Jul 12 12:41:07 2024 +0800

    [SPARK-48842][DOCS] Document non-determinism of max_by and min_by
    
    ### What changes were proposed in this pull request?
    Document non-determinism of max_by and min_by
    
    ### Why are the changes needed?
    I have been confused by this non-determinism twice; each time it looked like a
    correctness bug to me.
    So I think we need to document it.
    
    ### Does this PR introduce _any_ user-facing change?
    doc change only
    
    ### How was this patch tested?
    ci
    
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #47266 from zhengruifeng/py_doc_max_by.
    
    Authored-by: Ruifeng Zheng <ruife...@apache.org>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 R/pkg/R/functions.R                                            |  6 ++++++
 .../jvm/src/main/scala/org/apache/spark/sql/functions.scala    |  8 ++++++++
 python/pyspark/sql/functions/builtin.py                        | 10 ++++++++++
 .../sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala     |  8 ++++++++
 sql/core/src/main/scala/org/apache/spark/sql/functions.scala   |  6 ++++++
 5 files changed, 38 insertions(+)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index a7e337d3f9af..b91124f96a6f 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -1558,6 +1558,9 @@ setMethod("max",
 #' @details
 #' \code{max_by}: Returns the value associated with the maximum value of ord.
 #'
+#' Note: The function is non-deterministic so the output order can be different
+#' for those associated with the same values of `x`.
+#'
 #' @rdname column_aggregate_functions
 #' @aliases max_by max_by,Column-method
 #' @note max_by since 3.3.0
@@ -1633,6 +1636,9 @@ setMethod("min",
 #' @details
 #' \code{min_by}: Returns the value associated with the minimum value of ord.
 #'
+#' Note: The function is non-deterministic so the output order can be different
+#' for those associated with the same values of `x`.
+#'
 #' @rdname column_aggregate_functions
 #' @aliases min_by min_by,Column-method
 #' @note min_by since 3.3.0
diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
index 92e7bc9da590..81f25b3d743f 100644
--- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
@@ -884,6 +884,10 @@ object functions {
   /**
    * Aggregate function: returns the value associated with the maximum value 
of ord.
    *
+   * @note
+   *   The function is non-deterministic so the output order can be different for those associated
+   *   with the same values of `e`.
+   *
    * @group agg_funcs
    * @since 3.4.0
    */
@@ -932,6 +936,10 @@ object functions {
   /**
    * Aggregate function: returns the value associated with the minimum value 
of ord.
    *
+   * @note
+   *   The function is non-deterministic so the output order can be different for those associated
+   *   with the same values of `e`.
+   *
    * @group agg_funcs
    * @since 3.4.0
    */
diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 1ca522313f24..446ff2b1be93 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -1271,6 +1271,11 @@ def max_by(col: "ColumnOrName", ord: "ColumnOrName") -> 
Column:
     .. versionchanged:: 3.4.0
         Supports Spark Connect.
 
+    Notes
+    -----
+    The function is non-deterministic so the output order can be different for those
+    associated with the same values of `col`.
+
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
@@ -1352,6 +1357,11 @@ def min_by(col: "ColumnOrName", ord: "ColumnOrName") -> 
Column:
     .. versionchanged:: 3.4.0
         Supports Spark Connect.
 
+    Notes
+    -----
+    The function is non-deterministic so the output order can be different for those
+    associated with the same values of `col`.
+
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala
index 56941c9de451..b33142ed29cc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala
@@ -99,6 +99,10 @@ abstract class MaxMinBy extends DeclarativeAggregate with 
BinaryLike[Expression]
       > SELECT _FUNC_(x, y) FROM VALUES ('a', 10), ('b', 50), ('c', 20) AS 
tab(x, y);
        b
   """,
+  note = """
+    The function is non-deterministic so the output order can be different for
+    those associated with the same values of `x`.
+  """,
   group = "agg_funcs",
   since = "3.0.0")
 case class MaxBy(valueExpr: Expression, orderingExpr: Expression) extends 
MaxMinBy {
@@ -122,6 +126,10 @@ case class MaxBy(valueExpr: Expression, orderingExpr: 
Expression) extends MaxMin
       > SELECT _FUNC_(x, y) FROM VALUES ('a', 10), ('b', 50), ('c', 20) AS 
tab(x, y);
        a
   """,
+  note = """
+    The function is non-deterministic so the output order can be different for
+    those associated with the same values of `x`.
+  """,
   group = "agg_funcs",
   since = "3.0.0")
 case class MinBy(valueExpr: Expression, orderingExpr: Expression) extends 
MaxMinBy {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 882918eb78c7..5b4d27fc65d0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -902,6 +902,9 @@ object functions {
   /**
    * Aggregate function: returns the value associated with the maximum value 
of ord.
    *
+   * @note The function is non-deterministic so the output order can be different for
+   * those associated with the same values of `e`.
+   *
    * @group agg_funcs
    * @since 3.3.0
    */
@@ -952,6 +955,9 @@ object functions {
   /**
    * Aggregate function: returns the value associated with the minimum value 
of ord.
    *
+   * @note The function is non-deterministic so the output order can be different for
+   * those associated with the same values of `e`.
+   *
    * @group agg_funcs
    * @since 3.3.0
    */


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

(spark) branch master updated: [SPARK-48842][DOCS] Document non-determinism of max_by and min_by

Reply via email to