This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 798fce3b571  [SPARK-45063][PYTHON][DOCS] Refine docstring of `max_by/min_by`
798fce3b571 is described below

commit 798fce3b571907ee52058004cc38c2e8dbc4b016
Author: yangjie01 <yangji...@baidu.com>
AuthorDate: Mon Sep 4 14:48:14 2023 -0700

    [SPARK-45063][PYTHON][DOCS] Refine docstring of `max_by/min_by`

    ### What changes were proposed in this pull request?
    This PR refines the docstring of `max_by/min_by` and adds some new examples.

    ### Why are the changes needed?
    To improve the PySpark documentation.

    ### Does this PR introduce _any_ user-facing change?
    No

    ### How was this patch tested?
    Pass GitHub Actions

    ### Was this patch authored or co-authored using generative AI tooling?
    No

    Closes #42789 from LuciferYang/SPARK-45063.

    Authored-by: yangjie01 <yangji...@baidu.com>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
 python/pyspark/sql/functions.py | 96 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 86 insertions(+), 10 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index d025b13cd10..6e0caf50c16 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1346,7 +1346,9 @@ def min(col: "ColumnOrName") -> Column:
 @try_remote_functions
 def max_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column:
     """
-    Returns the value associated with the maximum value of ord.
+    Returns the value from the `col` parameter that is associated with the maximum value
+    from the `ord` parameter. This function is often used to find the `col` parameter value
+    corresponding to the maximum `ord` parameter value within each group when used with groupBy().

     .. versionadded:: 3.3.0

@@ -1356,28 +1358,64 @@ def max_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column:
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
-        target column to compute on.
+        The column representing the values to be returned. This could be the column instance
+        or the column name as a string.
     ord : :class:`~pyspark.sql.Column` or str
-        column to be maximized
+        The column that needs to be maximized. This could be the column instance
+        or the column name as a string.

     Returns
     -------
     :class:`~pyspark.sql.Column`
-        value associated with the maximum value of ord.
+        A column object representing the value from `col` that is associated with
+        the maximum value from `ord`.

     Examples
     --------
+    Example 1: Using `max_by` with groupBy
+
+    >>> import pyspark.sql.functions as sf
     >>> df = spark.createDataFrame([
     ...     ("Java", 2012, 20000), ("dotNET", 2012, 5000),
     ...     ("dotNET", 2013, 48000), ("Java", 2013, 30000)],
     ...     schema=("course", "year", "earnings"))
-    >>> df.groupby("course").agg(max_by("year", "earnings")).show()
+    >>> df.groupby("course").agg(sf.max_by("year", "earnings")).show()
     +------+----------------------+
     |course|max_by(year, earnings)|
     +------+----------------------+
     |  Java|                  2013|
     |dotNET|                  2013|
     +------+----------------------+
+
+    Example 2: Using `max_by` with different data types
+
+    >>> import pyspark.sql.functions as sf
+    >>> df = spark.createDataFrame([
+    ...     ("Marketing", "Anna", 4), ("IT", "Bob", 2),
+    ...     ("IT", "Charlie", 3), ("Marketing", "David", 1)],
+    ...     schema=("department", "name", "years_in_dept"))
schema=("department", "name", "years_in_dept")) + >>> df.groupby("department").agg(sf.max_by("name", "years_in_dept")).show() + +----------+---------------------------+ + |department|max_by(name, years_in_dept)| + +----------+---------------------------+ + | IT| Charlie| + | Marketing| Anna| + +----------+---------------------------+ + + Example 3: Using `max_by` where `ord` has multiple maximum values + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([ + ... ("Consult", "Eva", 6), ("Finance", "Frank", 5), + ... ("Finance", "George", 5), ("Consult", "Henry", 7)], + ... schema=("department", "name", "years_in_dept")) + >>> df.groupby("department").agg(sf.max_by("name", "years_in_dept")).show() + +----------+---------------------------+ + |department|max_by(name, years_in_dept)| + +----------+---------------------------+ + | Consult| Henry| + | Finance| George| + +----------+---------------------------+ """ return _invoke_function_over_columns("max_by", col, ord) @@ -1385,7 +1423,9 @@ def max_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column: @try_remote_functions def min_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column: """ - Returns the value associated with the minimum value of ord. + Returns the value from the `col` parameter that is associated with the minimum value + from the `ord` parameter. This function is often used to find the `col` parameter value + corresponding to the minimum `ord` parameter value within each group when used with groupBy(). .. versionadded:: 3.3.0 @@ -1395,28 +1435,64 @@ def min_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column: Parameters ---------- col : :class:`~pyspark.sql.Column` or str - target column to compute on. + The column representing the values that will be returned. This could be the column instance + or the column name as string. ord : :class:`~pyspark.sql.Column` or str - column to be minimized + The column that needs to be minimized. This could be the column instance + or the column name as string. Returns ------- :class:`~pyspark.sql.Column` - value associated with the minimum value of ord. + Column object that represents the value from `col` associated with + the minimum value from `ord`. Examples -------- + Example 1: Using `min_by` with groupBy: + + >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([ ... ("Java", 2012, 20000), ("dotNET", 2012, 5000), ... ("dotNET", 2013, 48000), ("Java", 2013, 30000)], ... schema=("course", "year", "earnings")) - >>> df.groupby("course").agg(min_by("year", "earnings")).show() + >>> df.groupby("course").agg(sf.min_by("year", "earnings")).show() +------+----------------------+ |course|min_by(year, earnings)| +------+----------------------+ | Java| 2012| |dotNET| 2012| +------+----------------------+ + + Example 2: Using `min_by` with different data types: + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([ + ... ("Marketing", "Anna", 4), ("IT", "Bob", 2), + ... ("IT", "Charlie", 3), ("Marketing", "David", 1)], + ... schema=("department", "name", "years_in_dept")) + >>> df.groupby("department").agg(sf.min_by("name", "years_in_dept")).show() + +----------+---------------------------+ + |department|min_by(name, years_in_dept)| + +----------+---------------------------+ + | IT| Bob| + | Marketing| David| + +----------+---------------------------+ + + Example 3: Using `min_by` where `ord` has multiple minimum values: + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([ + ... 
("Consult", "Eva", 6), ("Finance", "Frank", 5), + ... ("Finance", "George", 5), ("Consult", "Henry", 7)], + ... schema=("department", "name", "years_in_dept")) + >>> df.groupby("department").agg(sf.min_by("name", "years_in_dept")).show() + +----------+---------------------------+ + |department|min_by(name, years_in_dept)| + +----------+---------------------------+ + | Consult| Eva| + | Finance| George| + +----------+---------------------------+ """ return _invoke_function_over_columns("min_by", col, ord) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org