This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 45d537ddeba [SPARK-40724][PS][FOLLOW-UP] Simplify `corrwith` with method `inline` 45d537ddeba is described below commit 45d537ddeba2835449972d08a5a65d8276ec2978 Author: Ruifeng Zheng <ruife...@apache.org> AuthorDate: Thu Oct 13 08:47:29 2022 +0800 [SPARK-40724][PS][FOLLOW-UP] Simplify `corrwith` with method `inline` ### What changes were proposed in this pull request? Use `inline` instead of `explode` in `corrwith`. ### Why are the changes needed? The temporary tuple column is no longer needed, because `inline` expands the array of structs directly into columns. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Existing unit tests. Closes #38221 from zhengruifeng/ps_df_corrwith_inline. Authored-by: Ruifeng Zheng <ruife...@apache.org> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- python/pyspark/pandas/frame.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index 1aa94623ac3..835c13d6fdd 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -1747,7 +1747,6 @@ class DataFrame(Frame, Generic[T]): sdf = combined._internal.spark_frame index_col_name = verify_temp_column_name(sdf, "__corrwith_index_temp_column__") - tuple_col_name = verify_temp_column_name(sdf, "__corrwith_tuple_temp_column__") this_numeric_column_labels: List[Label] = [] for column_label in this._internal.column_labels: @@ -1797,15 +1796,7 @@ class DataFrame(Frame, Generic[T]): ) if len(pair_scols) > 0: - sdf = sdf.select(F.explode(F.array(*pair_scols)).alias(tuple_col_name)).select( - F.col(f"{tuple_col_name}.{index_col_name}").alias(index_col_name), - F.col(f"{tuple_col_name}.{CORRELATION_VALUE_1_COLUMN}").alias( - CORRELATION_VALUE_1_COLUMN - ), - F.col(f"{tuple_col_name}.{CORRELATION_VALUE_2_COLUMN}").alias( - CORRELATION_VALUE_2_COLUMN - ), - ) + sdf = sdf.select(F.inline(F.array(*pair_scols))) sdf = compute(sdf=sdf, groupKeys=[index_col_name], 
method=method).select( index_col_name, CORRELATION_CORR_OUTPUT_COLUMN --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org