This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new a2c247bc1c82 [SPARK-45076][PS] Switch to built-in `repeat` function
a2c247bc1c82 is described below

commit a2c247bc1c8217061d4bbc1e2342ee65d59c13de
Author: Ruifeng Zheng <ruife...@apache.org>
AuthorDate: Tue Sep 5 14:21:30 2023 +0800

    [SPARK-45076][PS] Switch to built-in `repeat` function

    ### What changes were proposed in this pull request?
    Switch to the built-in `repeat` function.

    ### Why are the changes needed?
    https://github.com/apache/spark/pull/42794 made `repeat` support a column-typed `n`, so this pandas-on-Spark-specific function is no longer needed.

    ### Does this PR introduce _any_ user-facing change?
    No.

    ### How was this patch tested?
    CI.

    ### Was this patch authored or co-authored using generative AI tooling?
    No.

    Closes #42812 from zhengruifeng/ps_replace_repeat.

    Authored-by: Ruifeng Zheng <ruife...@apache.org>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 python/pyspark/pandas/data_type_ops/num_ops.py    |  3 +--
 python/pyspark/pandas/data_type_ops/string_ops.py |  7 +++----
 python/pyspark/pandas/spark/functions.py          | 11 -----------
 python/pyspark/pandas/strings.py                  |  3 +--
 4 files changed, 5 insertions(+), 19 deletions(-)

diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py b/python/pyspark/pandas/data_type_ops/num_ops.py
index af5e387c0f2a..7775cfed044c 100644
--- a/python/pyspark/pandas/data_type_ops/num_ops.py
+++ b/python/pyspark/pandas/data_type_ops/num_ops.py
@@ -42,7 +42,6 @@ from pyspark.pandas.data_type_ops.base import (
     _is_valid_for_logical_operator,
     _is_boolean_type,
 )
-from pyspark.pandas.spark import functions as SF
 from pyspark.pandas.typedef.typehints import extension_dtypes, pandas_on_spark_type
 from pyspark.sql import functions as F
 from pyspark.sql import Column as PySparkColumn
@@ -245,7 +244,7 @@ class IntegralOps(NumericOps):
     def mul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
         _sanitize_list_like(right)
         if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType):
-            return column_op(SF.repeat)(right, left)
+            return column_op(F.repeat)(right, left)
 
         if not is_valid_operand_for_numeric_arithmetic(right):
             raise TypeError("Multiplication can not be applied to given types.")
diff --git a/python/pyspark/pandas/data_type_ops/string_ops.py b/python/pyspark/pandas/data_type_ops/string_ops.py
index 1c282f201178..53095c55e81e 100644
--- a/python/pyspark/pandas/data_type_ops/string_ops.py
+++ b/python/pyspark/pandas/data_type_ops/string_ops.py
@@ -33,7 +33,6 @@ from pyspark.pandas.data_type_ops.base import (
     _as_string_type,
     _sanitize_list_like,
 )
-from pyspark.pandas.spark import functions as SF
 from pyspark.pandas.typedef import extension_dtypes, pandas_on_spark_type
 from pyspark.sql.types import BooleanType
 
@@ -67,7 +66,7 @@ class StringOps(DataTypeOps):
             return cast(
                 SeriesOrIndex,
                 left._with_new_scol(
-                    SF.repeat(left.spark.column, right), field=left._internal.data_fields[0]
+                    F.repeat(left.spark.column, right), field=left._internal.data_fields[0]
                 ),
             )
         elif (
@@ -75,7 +74,7 @@
             and isinstance(right.spark.data_type, IntegralType)
             and not isinstance(right.dtype, CategoricalDtype)
         ):
-            return column_op(SF.repeat)(left, right)
+            return column_op(F.repeat)(left, right)
         else:
             raise TypeError("Multiplication can not be applied to given types.")
 
@@ -97,7 +96,7 @@ class StringOps(DataTypeOps):
             return cast(
                 SeriesOrIndex,
                 left._with_new_scol(
-                    SF.repeat(left.spark.column, right), field=left._internal.data_fields[0]
+                    F.repeat(left.spark.column, right), field=left._internal.data_fields[0]
                 ),
             )
         else:
diff --git a/python/pyspark/pandas/spark/functions.py b/python/pyspark/pandas/spark/functions.py
index b0bc6efcd56e..d7f3d81773c7 100644
--- a/python/pyspark/pandas/spark/functions.py
+++ b/python/pyspark/pandas/spark/functions.py
@@ -17,10 +17,7 @@
 """
 Additional Spark functions used in pandas-on-Spark.
 """
-from typing import Union
-
 from pyspark import SparkContext
-import pyspark.sql.functions as F
 from pyspark.sql.column import Column
 
 # For supporting Spark Connect
@@ -135,14 +132,6 @@ def covar(col1: Column, col2: Column, ddof: int) -> Column:
     return Column(sc._jvm.PythonSQLUtils.pandasCovar(col1._jc, col2._jc, ddof))
 
 
-def repeat(col: Column, n: Union[int, Column]) -> Column:
-    """
-    Repeats a string column n times, and returns it as a new string column.
-    """
-    _n = F.lit(n) if isinstance(n, int) else n
-    return F.call_udf("repeat", col, _n)
-
-
 def ewm(col: Column, alpha: float, ignore_na: bool) -> Column:
     if is_remote():
         from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
diff --git a/python/pyspark/pandas/strings.py b/python/pyspark/pandas/strings.py
index cd47de55108b..37486b0cc812 100644
--- a/python/pyspark/pandas/strings.py
+++ b/python/pyspark/pandas/strings.py
@@ -37,7 +37,6 @@ from pyspark.sql import functions as F
 from pyspark.sql.functions import pandas_udf
 
 import pyspark.pandas as ps
-from pyspark.pandas.spark import functions as SF
 
 
 class StringMethods:
@@ -1506,7 +1505,7 @@ class StringMethods:
         """
         if not isinstance(repeats, int):
             raise TypeError("repeats expects an int parameter")
-        return self._data.spark.transform(lambda c: SF.repeat(col=c, n=repeats))
+        return self._data.spark.transform(lambda c: F.repeat(col=c, n=repeats))
 
     def replace(
         self,
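For context, a minimal sketch (not part of this patch) of the behavior the change relies on: after https://github.com/apache/spark/pull/42794, the built-in pyspark.sql.functions.repeat accepts a Column, not just an int, for `n`, which is what lets pandas-on-Spark drop its call_udf-based wrapper. The DataFrame and column names below are made up for illustration, and the column-typed call assumes a Spark build that already includes that PR (master at the time of this commit).

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("ab", 2), ("xy", 3)], ["s", "n"])

# Literal repeat count, supported both before and after this change.
df.select(F.repeat(df.s, 2).alias("twice")).show()

# Column-typed repeat count, now handled by the built-in function; previously
# pandas-on-Spark reached the SQL function via F.call_udf("repeat", ...), as seen
# in the wrapper removed from python/pyspark/pandas/spark/functions.py above.
df.select(F.repeat(df.s, df.n).alias("repeated")).show()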