This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 045eb2d6dade [SPARK-45113][PYTHON][DOCS][FOLLOWUP] Make doctests deterministic
045eb2d6dade is described below

commit 045eb2d6dadec905f5c8f249fe19be6001107668
Author: Ruifeng Zheng <ruife...@apache.org>
AuthorDate: Thu Oct 12 09:20:15 2023 +0800

    [SPARK-45113][PYTHON][DOCS][FOLLOWUP] Make doctests deterministic

    ### What changes were proposed in this pull request?
    sort before show

    ### Why are the changes needed?
    the orders of rows are non-deterministic after groupby
    the tests fail in some env

    ### Does this PR introduce _any_ user-facing change?
    no

    ### How was this patch tested?
    ci

    ### Was this patch authored or co-authored using generative AI tooling?
    no

    Closes #43331 from zhengruifeng/py_collect_groupby.

    Authored-by: Ruifeng Zheng <ruife...@apache.org>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 python/pyspark/sql/functions.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 04968440e394..25958bdf15da 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -3751,7 +3751,8 @@ def collect_list(col: "ColumnOrName") -> Column:
 
     >>> from pyspark.sql import functions as sf
     >>> df = spark.createDataFrame([(1, "John"), (2, "John"), (3, "Ana")], ("id", "name"))
-    >>> df.groupBy("name").agg(sf.sort_array(sf.collect_list('id')).alias('sorted_list')).show()
+    >>> df = df.groupBy("name").agg(sf.sort_array(sf.collect_list('id')).alias('sorted_list'))
+    >>> df.orderBy(sf.desc("name")).show()
     +----+-----------+
     |name|sorted_list|
     +----+-----------+
@@ -3842,7 +3843,8 @@ def collect_set(col: "ColumnOrName") -> Column:
 
     >>> from pyspark.sql import functions as sf
     >>> df = spark.createDataFrame([(1, "John"), (2, "John"), (3, "Ana")], ("id", "name"))
-    >>> df.groupBy("name").agg(sf.sort_array(sf.collect_set('id')).alias('sorted_set')).show()
+    >>> df = df.groupBy("name").agg(sf.sort_array(sf.collect_set('id')).alias('sorted_set'))
+    >>> df.orderBy(sf.desc("name")).show()
     +----+----------+
     |name|sorted_set|
     +----+----------+

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org