This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 464a3c19f51 [SPARK-45113][PYTHON][DOCS][FOLLOWUP] Add sorting to the example of `collect_set/collect_list` to ensure stable results 464a3c19f51 is described below commit 464a3c19f51081914a09d4394820da059cb2ee47 Author: yangjie01 <yangji...@baidu.com> AuthorDate: Tue Sep 19 08:48:03 2023 +0900 [SPARK-45113][PYTHON][DOCS][FOLLOWUP] Add sorting to the example of `collect_set/collect_list` to ensure stable results ### What changes were proposed in this pull request? This PR wraps the results of `collect_set` and `collect_list` in `sort_array` in their docstring examples, so that the displayed output is stable. ### Why are the changes needed? When the examples of `collect_set` and `collect_list` are executed with different versions of Scala, the order of the collected elements may differ, which causes the daily tests on Scala 2.13 to fail: - https://github.com/apache/spark/actions/runs/6209111340/job/16856005714 ``` ********************************************************************** File "/__w/spark/spark/python/pyspark/sql/connect/functions.py", line 1030, in pyspark.sql.connect.functions.collect_set Failed example: df.select(sf.collect_set('age')).show() Expected: +----------------+ |collect_set(age)| +----------------+ | [5, 2]| +----------------+ Got: +----------------+ |collect_set(age)| +----------------+ | [2, 5]| +----------------+ <BLANKLINE> ********************************************************************** 1 of 9 in pyspark.sql.connect.functions.collect_set ***Test Failed*** 1 failures. Had test failures in pyspark.sql.connect.functions with python3.9; see logs. Error: running /__w/spark/spark/python/run-tests --modules=pyspark-connect --parallelism=1 ; received return code 255 Error: Process completed with exit code 19. ``` ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass GitHub Actions ### Was this patch authored or co-authored using generative AI tooling? 
No Closes #42968 from LuciferYang/SPARK-45113-FOLLOWUP. Lead-authored-by: yangjie01 <yangji...@baidu.com> Co-authored-by: YangJie <yangji...@baidu.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- python/pyspark/sql/functions.py | 96 ++++++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 54bd330ebc0..5474873df7b 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -3671,39 +3671,39 @@ def collect_list(col: "ColumnOrName") -> Column: Examples -------- - Example 1: Collect values from a single column DataFrame + Example 1: Collect values from a DataFrame and sort the result in ascending order >>> from pyspark.sql import functions as sf - >>> df = spark.createDataFrame([(2,), (5,), (5,)], ('age',)) - >>> df.select(sf.collect_list('age')).show() - +-----------------+ - |collect_list(age)| - +-----------------+ - | [2, 5, 5]| - +-----------------+ - - Example 2: Collect values from a DataFrame with multiple columns - - >>> from pyspark.sql import functions as sf - >>> df = spark.createDataFrame([(1, "John"), (2, "John"), (3, "Ana")], ("id", "name")) - >>> df.groupBy("name").agg(sf.collect_list('id')).show() - +----+----------------+ - |name|collect_list(id)| - +----+----------------+ - |John| [1, 2]| - | Ana| [3]| - +----+----------------+ + >>> df = spark.createDataFrame([(1,), (2,), (2,)], ('value',)) + >>> df.select(sf.sort_array(sf.collect_list('value')).alias('sorted_list')).show() + +-----------+ + |sorted_list| + +-----------+ + | [1, 2, 2]| + +-----------+ - Example 3: Collect values from a DataFrame and sort the result + Example 2: Collect values from a DataFrame and sort the result in descending order >>> from pyspark.sql import functions as sf - >>> df = spark.createDataFrame([(1,), (2,), (2,)], ('value',)) - >>> df.select(sf.array_sort(sf.collect_list('value')).alias('sorted_list')).show() + >>> 
df = spark.createDataFrame([(2,), (5,), (5,)], ('age',)) + >>> df.select(sf.sort_array(sf.collect_list('age'), asc=False).alias('sorted_list')).show() +-----------+ |sorted_list| +-----------+ - | [1, 2, 2]| + | [5, 5, 2]| +-----------+ + + Example 3: Collect values from a DataFrame with multiple columns and sort the result + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(1, "John"), (2, "John"), (3, "Ana")], ("id", "name")) + >>> df.groupBy("name").agg(sf.sort_array(sf.collect_list('id')).alias('sorted_list')).show() + +----+-----------+ + |name|sorted_list| + +----+-----------+ + |John| [1, 2]| + | Ana| [3]| + +----+-----------+ """ return _invoke_function_over_columns("collect_list", col) @@ -3762,39 +3762,39 @@ def collect_set(col: "ColumnOrName") -> Column: Examples -------- - Example 1: Collect values from a single column DataFrame + Example 1: Collect values from a DataFrame and sort the result in ascending order >>> from pyspark.sql import functions as sf - >>> df = spark.createDataFrame([(2,), (5,), (5,)], ('age',)) - >>> df.select(sf.collect_set('age')).show() - +----------------+ - |collect_set(age)| - +----------------+ - | [5, 2]| - +----------------+ - - Example 2: Collect values from a DataFrame with multiple columns - - >>> from pyspark.sql import functions as sf - >>> df = spark.createDataFrame([(1, "John"), (2, "John"), (3, "Ana")], ("id", "name")) - >>> df.groupBy("name").agg(sf.collect_set('id')).show() - +----+---------------+ - |name|collect_set(id)| - +----+---------------+ - |John| [1, 2]| - | Ana| [3]| - +----+---------------+ + >>> df = spark.createDataFrame([(1,), (2,), (2,)], ('value',)) + >>> df.select(sf.sort_array(sf.collect_set('value')).alias('sorted_set')).show() + +----------+ + |sorted_set| + +----------+ + | [1, 2]| + +----------+ - Example 3: Collect values from a DataFrame and sort the result + Example 2: Collect values from a DataFrame and sort the result in descending order >>> from 
pyspark.sql import functions as sf - >>> df = spark.createDataFrame([(1,), (2,), (2,)], ('value',)) - >>> df.select(sf.array_sort(sf.collect_set('value')).alias('sorted_set')).show() + >>> df = spark.createDataFrame([(2,), (5,), (5,)], ('age',)) + >>> df.select(sf.sort_array(sf.collect_set('age'), asc=False).alias('sorted_set')).show() +----------+ |sorted_set| +----------+ - | [1, 2]| + | [5, 2]| +----------+ + + Example 3: Collect values from a DataFrame with multiple columns and sort the result + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(1, "John"), (2, "John"), (3, "Ana")], ("id", "name")) + >>> df.groupBy("name").agg(sf.sort_array(sf.collect_set('id')).alias('sorted_set')).show() + +----+----------+ + |name|sorted_set| + +----+----------+ + |John| [1, 2]| + | Ana| [3]| + +----+----------+ """ return _invoke_function_over_columns("collect_set", col) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org