This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 00fb1851def2 [SPARK-45015][PYTHON][DOCS] Refine DocStrings of `try_{add, subtract, multiply, divide, avg, sum}`
00fb1851def2 is described below

commit 00fb1851def201b1bab8b1acef875e5846159162
Author: Ruifeng Zheng <ruife...@apache.org>
AuthorDate: Thu Aug 31 08:55:02 2023 +0800

    [SPARK-45015][PYTHON][DOCS] Refine DocStrings of `try_{add, subtract, multiply, divide, avg, sum}`

    ### What changes were proposed in this pull request?
    Refine DocStrings of `try_{add, subtract, multiply, divide, avg, sum}`:

    1. unify the import: `import pyspark.sql.functions as sf`
    2. make each example/cell copy/paste-able
    3. rewrite examples to make them clearer

    ### Why are the changes needed?
    To refine the docstrings.

    ### Does this PR introduce _any_ user-facing change?
    Yes.

    ### How was this patch tested?
    CI.

    ### Was this patch authored or co-authored using generative AI tooling?
    No.

    Closes #42735 from zhengruifeng/py_doc_try_math.

    Authored-by: Ruifeng Zheng <ruife...@apache.org>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 python/pyspark/sql/functions.py | 255 +++++++++++++++++++++++++---------------
 1 file changed, 158 insertions(+), 97 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 6cce502d318c..0447bf0e19c8 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -863,36 +863,54 @@ def try_add(left: "ColumnOrName", right: "ColumnOrName") -> Column:
 
     Examples
     --------
-    >>> df = spark.createDataFrame([(1982, 15), (1990, 2)], ["birth", "age"])
-    >>> df.select(try_add(df.birth, df.age).alias('r')).collect()
-    [Row(r=1997), Row(r=1992)]
-
-    >>> from pyspark.sql.types import StructType, StructField, IntegerType, StringType
-    >>> schema = StructType([
-    ...     StructField("i", IntegerType(), True),
-    ...     StructField("d", StringType(), True),
-    ... ])
-    >>> df = spark.createDataFrame([(1, '2015-09-30')], schema)
-    >>> df = df.select(df.i, to_date(df.d).alias('d'))
-    >>> df.select(try_add(df.d, df.i).alias('r')).collect()
-    [Row(r=datetime.date(2015, 10, 1))]
+    Example 1: Integer plus Integer.
 
-    >>> df.select(try_add(df.d, make_interval(df.i)).alias('r')).collect()
-    [Row(r=datetime.date(2016, 9, 30))]
+    >>> import pyspark.sql.functions as sf
+    >>> spark.createDataFrame(
+    ...     [(1982, 15), (1990, 2)], ["birth", "age"]
+    ... ).select(sf.try_add("birth", "age")).show()
+    +-------------------+
+    |try_add(birth, age)|
+    +-------------------+
+    |               1997|
+    |               1992|
+    +-------------------+
 
-    >>> df.select(
-    ...     try_add(df.d, make_interval(lit(0), lit(0), lit(0), df.i)).alias('r')
-    ... ).collect()
-    [Row(r=datetime.date(2015, 10, 1))]
+    Example 2: Date plus Integer.
 
-    >>> df.select(
-    ...     try_add(make_interval(df.i), make_interval(df.i)).alias('r')
-    ... ).show(truncate=False)
-    +-------+
-    |r      |
-    +-------+
-    |2 years|
-    +-------+
+    >>> import pyspark.sql.functions as sf
+    >>> spark.sql(
+    ...     "SELECT * FROM VALUES (DATE('2015-09-30')) AS TAB(date)"
+    ... ).select(sf.try_add("date", sf.lit(1))).show()
+    +----------------+
+    |try_add(date, 1)|
+    +----------------+
+    |      2015-10-01|
+    +----------------+
+
+    Example 3: Date plus Interval.
+
+    >>> import pyspark.sql.functions as sf
+    >>> spark.sql(
+    ...     "SELECT * FROM VALUES (DATE('2015-09-30'), INTERVAL 1 YEAR) AS TAB(date, i)"
+    ... ).select(sf.try_add("date", "i")).show()
+    +----------------+
+    |try_add(date, i)|
+    +----------------+
+    |      2016-09-30|
+    +----------------+
+
+    Example 4: Interval plus Interval.
+
+    >>> import pyspark.sql.functions as sf
+    >>> spark.sql(
+    ...     "SELECT * FROM VALUES (INTERVAL 1 YEAR, INTERVAL 2 YEAR) AS TAB(i, j)"
+    ... ).select(sf.try_add("i", "j")).show()
+    +-----------------+
+    |    try_add(i, j)|
+    +-----------------+
+    |INTERVAL '3' YEAR|
+    +-----------------+
     """
     return _invoke_function_over_columns("try_add", left, right)
 
@@ -910,9 +928,15 @@ def try_avg(col: "ColumnOrName") -> Column:
 
     Examples
     --------
-    >>> df = spark.createDataFrame([(1982, 15), (1990, 2)], ["birth", "age"])
-    >>> df.select(try_avg(df.age).alias('r')).collect()
-    [Row(r=8.5)]
+    >>> import pyspark.sql.functions as sf
+    >>> spark.createDataFrame(
+    ...     [(1982, 15), (1990, 2)], ["birth", "age"]
+    ... ).select(sf.try_avg("age").alias("age_avg")).show()
+    +-------+
+    |age_avg|
+    +-------+
+    |    8.5|
+    +-------+
     """
     return _invoke_function_over_columns("try_avg", col)
 
@@ -934,37 +958,34 @@ def try_divide(left: "ColumnOrName", right: "ColumnOrName") -> Column:
 
     Examples
     --------
-    >>> df = spark.createDataFrame([(6000, 15), (1990, 2)], ["a", "b"])
-    >>> df.select(try_divide(df.a, df.b).alias('r')).collect()
-    [Row(r=400.0), Row(r=995.0)]
+    Example 1: Integer divided by Integer.
 
-    >>> df = spark.createDataFrame([(1, 2)], ["year", "month"])
-    >>> df.select(
-    ...     try_divide(make_interval(df.year), df.month).alias('r')
-    ... ).show(truncate=False)
-    +--------+
-    |r       |
-    +--------+
-    |6 months|
-    +--------+
+    >>> import pyspark.sql.functions as sf
+    >>> spark.createDataFrame(
+    ...     [(6000, 15), (1990, 2), (1234, 0)], ["a", "b"]
+    ... ).select(sf.try_divide("a", "b")).show()
+    +----------------+
+    |try_divide(a, b)|
+    +----------------+
+    |           400.0|
+    |           995.0|
+    |            NULL|
+    +----------------+
 
-    >>> df.select(
-    ...     try_divide(make_interval(df.year, df.month), lit(2)).alias('r')
-    ... ).show(truncate=False)
-    +--------+
-    |r       |
-    +--------+
-    |7 months|
-    +--------+
+    Example 2: Interval divided by Integer.
 
-    >>> df.select(
-    ...     try_divide(make_interval(df.year, df.month), lit(0)).alias('r')
-    ... ).show(truncate=False)
-    +----+
-    |r   |
-    +----+
-    |NULL|
-    +----+
+    >>> import pyspark.sql.functions as sf
+    >>> spark.range(4).select(
+    ...     sf.try_divide(sf.make_interval(sf.lit(1)), "id")
+    ... ).show()
+    +--------------------------------------------------+
+    |try_divide(make_interval(1, 0, 0, 0, 0, 0, 0), id)|
+    +--------------------------------------------------+
+    |                                              NULL|
+    |                                           1 years|
+    |                                          6 months|
+    |                                          4 months|
+    +--------------------------------------------------+
     """
     return _invoke_function_over_columns("try_divide", left, right)
 
@@ -986,17 +1007,35 @@ def try_multiply(left: "ColumnOrName", right: "ColumnOrName") -> Column:
 
     Examples
     --------
-    >>> df = spark.createDataFrame([(6000, 15), (1990, 2)], ["a", "b"])
-    >>> df.select(try_multiply(df.a, df.b).alias('r')).collect()
-    [Row(r=90000), Row(r=3980)]
+    Example 1: Integer multiplied by Integer.
 
-    >>> df = spark.createDataFrame([(2, 3),], ["a", "b"])
-    >>> df.select(try_multiply(make_interval(df.a), df.b).alias('r')).show(truncate=False)
-    +-------+
-    |r      |
-    +-------+
-    |6 years|
-    +-------+
+    >>> import pyspark.sql.functions as sf
+    >>> spark.createDataFrame(
+    ...     [(6000, 15), (1990, 2)], ["a", "b"]
+    ... ).select(sf.try_multiply("a", "b")).show()
+    +------------------+
+    |try_multiply(a, b)|
+    +------------------+
+    |             90000|
+    |              3980|
+    +------------------+
+
+    Example 2: Interval multiplied by Integer.
+
+    >>> import pyspark.sql.functions as sf
+    >>> spark.range(6).select(
+    ...     sf.try_multiply(sf.make_interval(sf.lit(0), sf.lit(3)), "id")
+    ... ).show()
+    +----------------------------------------------------+
+    |try_multiply(make_interval(0, 3, 0, 0, 0, 0, 0), id)|
+    +----------------------------------------------------+
+    |                                           0 seconds|
+    |                                            3 months|
+    |                                            6 months|
+    |                                            9 months|
+    |                                             1 years|
+    |                                    1 years 3 months|
+    +----------------------------------------------------+
     """
     return _invoke_function_over_columns("try_multiply", left, right)
 
@@ -1016,36 +1055,54 @@ def try_subtract(left: "ColumnOrName", right: "ColumnOrName") -> Column:
 
     Examples
     --------
-    >>> df = spark.createDataFrame([(6000, 15), (1990, 2)], ["a", "b"])
-    >>> df.select(try_subtract(df.a, df.b).alias('r')).collect()
-    [Row(r=5985), Row(r=1988)]
+    Example 1: Integer minus Integer.
 
-    >>> from pyspark.sql.types import StructType, StructField, IntegerType, StringType
-    >>> schema = StructType([
-    ...     StructField("i", IntegerType(), True),
-    ...     StructField("d", StringType(), True),
-    ... ])
-    >>> df = spark.createDataFrame([(1, '2015-09-30')], schema)
-    >>> df = df.select(df.i, to_date(df.d).alias('d'))
-    >>> df.select(try_subtract(df.d, df.i).alias('r')).collect()
-    [Row(r=datetime.date(2015, 9, 29))]
+    >>> import pyspark.sql.functions as sf
+    >>> spark.createDataFrame(
+    ...     [(1982, 15), (1990, 2)], ["birth", "age"]
+    ... ).select(sf.try_subtract("birth", "age")).show()
+    +------------------------+
+    |try_subtract(birth, age)|
+    +------------------------+
+    |                    1967|
+    |                    1988|
+    +------------------------+
 
-    >>> df.select(try_subtract(df.d, make_interval(df.i)).alias('r')).collect()
-    [Row(r=datetime.date(2014, 9, 30))]
+    Example 2: Date minus Integer.
 
-    >>> df.select(
-    ...     try_subtract(df.d, make_interval(lit(0), lit(0), lit(0), df.i)).alias('r')
-    ... ).collect()
-    [Row(r=datetime.date(2015, 9, 29))]
+    >>> import pyspark.sql.functions as sf
+    >>> spark.sql(
+    ...     "SELECT * FROM VALUES (DATE('2015-10-01')) AS TAB(date)"
+    ... ).select(sf.try_subtract("date", sf.lit(1))).show()
+    +---------------------+
+    |try_subtract(date, 1)|
+    +---------------------+
+    |           2015-09-30|
+    +---------------------+
 
-    >>> df.select(
-    ...     try_subtract(make_interval(df.i), make_interval(df.i)).alias('r')
-    ... ).show(truncate=False)
-    +---------+
-    |r        |
-    +---------+
-    |0 seconds|
-    +---------+
+    Example 3: Date minus Interval.
+
+    >>> import pyspark.sql.functions as sf
+    >>> spark.sql(
+    ...     "SELECT * FROM VALUES (DATE('2015-09-30'), INTERVAL 1 YEAR) AS TAB(date, i)"
+    ... ).select(sf.try_subtract("date", "i")).show()
+    +---------------------+
+    |try_subtract(date, i)|
+    +---------------------+
+    |           2014-09-30|
+    +---------------------+
+
+    Example 4: Interval minus Interval.
+
+    >>> import pyspark.sql.functions as sf
+    >>> spark.sql(
+    ...     "SELECT * FROM VALUES (INTERVAL 1 YEAR, INTERVAL 2 YEAR) AS TAB(i, j)"
+    ... ).select(sf.try_subtract("i", "j")).show()
+    +------------------+
+    |try_subtract(i, j)|
+    +------------------+
+    |INTERVAL '-1' YEAR|
+    +------------------+
     """
     return _invoke_function_over_columns("try_subtract", left, right)
 
@@ -1063,9 +1120,13 @@ def try_sum(col: "ColumnOrName") -> Column:
 
     Examples
     --------
-    >>> df = spark.range(10)
-    >>> df.select(try_sum(df["id"]).alias('r')).collect()
-    [Row(r=45)]
+    >>> import pyspark.sql.functions as sf
+    >>> spark.range(10).select(sf.try_sum("id").alias("sum")).show()
+    +---+
+    |sum|
+    +---+
+    | 45|
+    +---+
     """
     return _invoke_function_over_columns("try_sum", col)
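
The refined examples above mostly show the happy path; the NULL row in
try_divide's Example 1 hints at the defining behavior of the whole try_*
family: these functions return NULL instead of raising an error when the
operation fails, e.g. on division by zero or on long overflow under ANSI
mode. A minimal doctest-style sketch of that contract (not part of the
commit above, and assuming an active SparkSession bound to `spark`, as in
the docstring examples):

    >>> import pyspark.sql.functions as sf
    >>> # Division by zero: `/` errors out under ANSI mode; try_divide yields NULL.
    >>> spark.createDataFrame([(10, 2), (10, 0)], ["a", "b"]).select(
    ...     sf.try_divide("a", "b")
    ... ).show()
    +----------------+
    |try_divide(a, b)|
    +----------------+
    |             5.0|
    |            NULL|
    +----------------+
    >>> # Long overflow: try_add yields NULL instead of an arithmetic error.
    >>> spark.createDataFrame([(9223372036854775807, 1)], ["a", "b"]).select(
    ...     sf.try_add("a", "b")
    ... ).show()
    +-------------+
    |try_add(a, b)|
    +-------------+
    |         NULL|
    +-------------+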