This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 45d537ddeba [SPARK-40724][PS][FOLLOW-UP] Simplify `corrwith` with method `inline`
45d537ddeba is described below

commit 45d537ddeba2835449972d08a5a65d8276ec2978
Author: Ruifeng Zheng <ruife...@apache.org>
AuthorDate: Thu Oct 13 08:47:29 2022 +0800

    [SPARK-40724][PS][FOLLOW-UP] Simplify `corrwith` with method `inline`
    
    ### What changes were proposed in this pull request?
    Use `inline` instead of `explode` in `corrwith`
    
    ### Why are the changes needed?
    The intermediate tuple column is no longer needed.
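
    For context, here is a minimal standalone sketch (not part of this patch; column names `idx`, `v1`, `v2` are made up) of why `inline` avoids the temporary struct column that the `explode` approach needs, assuming a PySpark version that exposes `F.inline` as used in the diff below:

    ```python
    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()

    # Two structs with the same schema, analogous to the per-pair structs built in `corrwith`.
    df = spark.range(1).select(
        F.struct(F.lit("a").alias("idx"), F.lit(1.0).alias("v1"), F.lit(2.0).alias("v2")).alias("s1"),
        F.struct(F.lit("b").alias("idx"), F.lit(3.0).alias("v1"), F.lit(4.0).alias("v2")).alias("s2"),
    )

    # explode keeps each struct wrapped in one temporary column that must be unpacked field by field.
    exploded = (
        df.select(F.explode(F.array("s1", "s2")).alias("tmp"))
        .select("tmp.idx", "tmp.v1", "tmp.v2")
    )

    # inline expands the array of structs directly into top-level columns, with no temporary column.
    inlined = df.select(F.inline(F.array("s1", "s2")))

    # Both yield the same two rows: ("a", 1.0, 2.0) and ("b", 3.0, 4.0).
    ```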
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    Existing unit tests.
    
    Closes #38221 from zhengruifeng/ps_df_corrwith_inline.
    
    Authored-by: Ruifeng Zheng <ruife...@apache.org>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 python/pyspark/pandas/frame.py | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 1aa94623ac3..835c13d6fdd 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -1747,7 +1747,6 @@ class DataFrame(Frame, Generic[T]):
 
         sdf = combined._internal.spark_frame
         index_col_name = verify_temp_column_name(sdf, "__corrwith_index_temp_column__")
-        tuple_col_name = verify_temp_column_name(sdf, "__corrwith_tuple_temp_column__")
 
         this_numeric_column_labels: List[Label] = []
         for column_label in this._internal.column_labels:
@@ -1797,15 +1796,7 @@ class DataFrame(Frame, Generic[T]):
                 )
 
         if len(pair_scols) > 0:
-            sdf = sdf.select(F.explode(F.array(*pair_scols)).alias(tuple_col_name)).select(
-                F.col(f"{tuple_col_name}.{index_col_name}").alias(index_col_name),
-                F.col(f"{tuple_col_name}.{CORRELATION_VALUE_1_COLUMN}").alias(
-                    CORRELATION_VALUE_1_COLUMN
-                ),
-                F.col(f"{tuple_col_name}.{CORRELATION_VALUE_2_COLUMN}").alias(
-                    CORRELATION_VALUE_2_COLUMN
-                ),
-            )
+            sdf = sdf.select(F.inline(F.array(*pair_scols)))
 
             sdf = compute(sdf=sdf, groupKeys=[index_col_name], method=method).select(
                 index_col_name, CORRELATION_CORR_OUTPUT_COLUMN
             )

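For reference, a minimal, hypothetical usage sketch of the API whose internals the diff touches, `pyspark.pandas.DataFrame.corrwith`; the sample data and expected results are illustrative assumptions, not taken from the patch or its tests:

```python
import pyspark.pandas as ps

psdf1 = ps.DataFrame({"a": [1, 2, 3, 4], "b": [4, 3, 2, 1]})
psdf2 = ps.DataFrame({"a": [2, 4, 6, 8], "b": [1, 2, 3, 4]})

# Pairwise correlation between matching columns of the two frames; internally the
# per-column pair structs are now flattened with F.inline instead of F.explode.
result = psdf1.corrwith(psdf2, method="pearson")
print(result)
# Expected values (row order may vary): a -> 1.0, b -> -1.0
```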

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
