This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new ba20eaa4c30 [SPARK-45066][SQL][PYTHON][CONNECT] Make function `repeat` accept column-type `n` ba20eaa4c30 is described below commit ba20eaa4c30aecb32ba2deb7bbf502bec929a297 Author: Ruifeng Zheng <ruife...@apache.org> AuthorDate: Mon Sep 4 15:04:25 2023 -0700 [SPARK-45066][SQL][PYTHON][CONNECT] Make function `repeat` accept column-type `n` ### What changes were proposed in this pull request? Make function `repeat` accept column-type `n` ### Why are the changes needed? 1. To follow this guide: https://github.com/apache/spark/blob/5b609598503df603cbddd5e1adf8d2cb28a5f977/sql/core/src/main/scala/org/apache/spark/sql/functions.scala#L60-L62 2. In particular, it can replace [the internal function](https://github.com/apache/spark/blob/17fac569b4e4b569d41f761db07d7bf112801e0c/python/pyspark/pandas/spark/functions.py#L138-L143) in the Pandas API (to keep this PR clean, I will replace it in a separate PR) ### Does this PR introduce _any_ user-facing change? yes ### How was this patch tested? NO ### Was this patch authored or co-authored using generative AI tooling? NO Closes #42794 from zhengruifeng/func_repeat_func. 
Authored-by: Ruifeng Zheng <ruife...@apache.org> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- .../scala/org/apache/spark/sql/functions.scala | 8 +++++ python/pyspark/sql/connect/functions.py | 5 +-- python/pyspark/sql/functions.py | 42 ++++++++++++++++++---- .../scala/org/apache/spark/sql/functions.scala | 10 ++++++ 4 files changed, 57 insertions(+), 8 deletions(-) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala index 9ead800ace7..527848e95e6 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala @@ -4100,6 +4100,14 @@ object functions { */ def repeat(str: Column, n: Int): Column = Column.fn("repeat", str, lit(n)) + /** + * Repeats a string column n times, and returns it as a new string column. + * + * @group string_funcs + * @since 4.0.0 + */ + def repeat(str: Column, n: Column): Column = Column.fn("repeat", str, n) + /** * Trim the spaces from right end for the specified string value. 
* diff --git a/python/pyspark/sql/connect/functions.py b/python/pyspark/sql/connect/functions.py index f290549ae47..19dd021ba08 100644 --- a/python/pyspark/sql/connect/functions.py +++ b/python/pyspark/sql/connect/functions.py @@ -2357,8 +2357,9 @@ def rpad(col: "ColumnOrName", len: int, pad: str) -> Column: rpad.__doc__ = pysparkfuncs.rpad.__doc__ -def repeat(col: "ColumnOrName", n: int) -> Column: - return _invoke_function("repeat", _to_col(col), lit(n)) +def repeat(col: "ColumnOrName", n: Union["ColumnOrName", int]) -> Column: + n = lit(n) if isinstance(n, int) else n + return _invoke_function("repeat", _to_col(col), _to_col(n)) repeat.__doc__ = pysparkfuncs.repeat.__doc__ diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 6e0caf50c16..699c8b9c8cf 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -10020,7 +10020,7 @@ def rpad(col: "ColumnOrName", len: int, pad: str) -> Column: @try_remote_functions -def repeat(col: "ColumnOrName", n: int) -> Column: +def repeat(col: "ColumnOrName", n: Union["ColumnOrName", int]) -> Column: """ Repeats a string column n times, and returns it as a new string column. @@ -10033,9 +10033,12 @@ def repeat(col: "ColumnOrName", n: int) -> Column: ---------- col : :class:`~pyspark.sql.Column` or str target column to work on. - n : int + n : :class:`~pyspark.sql.Column` or str or int number of times to repeat value. + .. versionchanged:: 4.0.0 + `n` now accepts column and column name. + Returns ------- :class:`~pyspark.sql.Column` @@ -10043,11 +10046,38 @@ def repeat(col: "ColumnOrName", n: int) -> Column: Examples -------- - >>> df = spark.createDataFrame([('ab',)], ['s',]) - >>> df.select(repeat(df.s, 3).alias('s')).collect() - [Row(s='ababab')] + >>> import pyspark.sql.functions as sf + >>> spark.createDataFrame( + ... [('ab',)], ['s',] + ... 
).select(sf.repeat("s", 3)).show() + +------------+ + |repeat(s, 3)| + +------------+ + | ababab| + +------------+ + + >>> import pyspark.sql.functions as sf + >>> spark.createDataFrame( + ... [('ab',)], ['s',] + ... ).select(sf.repeat("s", sf.lit(4))).show() + +------------+ + |repeat(s, 4)| + +------------+ + | abababab| + +------------+ + + >>> import pyspark.sql.functions as sf + >>> spark.createDataFrame( + ... [('ab', 5,)], ['s', 't'] + ... ).select(sf.repeat("s", 't')).show() + +------------+ + |repeat(s, t)| + +------------+ + | ababababab| + +------------+ """ - return _invoke_function("repeat", _to_java_column(col), n) + n = lit(n) if isinstance(n, int) else n + return _invoke_function_over_columns("repeat", col, n) @try_remote_functions diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index d4e271db5b2..a04a5e471ec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -4200,6 +4200,16 @@ object functions { StringRepeat(str.expr, lit(n).expr) } + /** + * Repeats a string column n times, and returns it as a new string column. + * + * @group string_funcs + * @since 4.0.0 + */ + def repeat(str: Column, n: Column): Column = withExpr { + StringRepeat(str.expr, n.expr) + } + /** * Trim the spaces from right end for the specified string value. * --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org