[spark] branch master updated: [SPARK-39284][PS] Implement Groupby.mad

2022-06-04 Thread ruifengz
This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new d793c5c6858 [SPARK-39284][PS] Implement Groupby.mad
d793c5c6858 is described below

commit d793c5c6858cb3d89fd981495a85f4c60ae63035
Author: Ruifeng Zheng 
AuthorDate: Sun Jun 5 09:49:24 2022 +0800

[SPARK-39284][PS] Implement Groupby.mad

### What changes were proposed in this pull request?
Implement Groupby.mad

### Why are the changes needed?
To increase pandas API coverage.

### Does this PR introduce _any_ user-facing change?
Yes.

```
In [6]: pdf = pd.DataFrame({"A": [1, 2, 2, 1, 1], "B": [3, 2, 3, 9, 0], "C": [3, 4, 13, -14, 9]})

In [7]: psdf = ps.from_pandas(pdf)

In [8]: pdf.groupby("A")[["B", "C"]].mad()
Out[8]:
      B     C
A
1  3.33  8.89
2  0.50  4.50

In [9]: psdf.groupby("A")[["B", "C"]].mad()
Out[9]:
      B     C
A
1  3.33  8.89
2  0.50  4.50

In [10]: pdf.B.groupby(pdf.A).mad()
Out[10]:
A
1    3.33
2    0.50
Name: B, dtype: float64

In [11]: psdf.B.groupby(psdf.A).mad()
Out[11]:
A
1    3.33
2    0.50
Name: B, dtype: float64

```

### How was this patch tested?
Added unit tests.

Closes #36660 from zhengruifeng/ps_groupby_mad.

Lead-authored-by: Ruifeng Zheng 
Co-authored-by: Ruifeng Zheng 
Signed-off-by: Ruifeng Zheng 
---
 python/pyspark/pandas/groupby.py| 84 +++--
 python/pyspark/pandas/missing/groupby.py|  2 -
 python/pyspark/pandas/tests/test_groupby.py |  3 ++
 3 files changed, 83 insertions(+), 6 deletions(-)

diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index ce8a322c20b..4377ad6a5c9 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -753,6 +753,80 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
             bool_to_numeric=True,
         )
 
+    # TODO: 'axis', 'skipna', 'level' parameter should be implemented.
+    def mad(self) -> FrameLike:
+        """
+        Compute mean absolute deviation of groups, excluding missing values.
+
+        .. versionadded:: 3.4.0
+
+        Examples
+        --------
+        >>> df = ps.DataFrame({"A": [1, 2, 1, 1], "B": [True, False, False, True],
+        ...                    "C": [3, 4, 3, 4], "D": ["a", "b", "b", "a"]})
+
+        >>> df.groupby("A").mad()
+              B     C
+        A
+        1  0.44  0.44
+        2  0.00  0.00
+
+        >>> df.B.groupby(df.A).mad()
+        A
+        1    0.44
+        2    0.00
+        Name: B, dtype: float64
+
+        See Also
+        --------
+        pyspark.pandas.Series.groupby
+        pyspark.pandas.DataFrame.groupby
+        """
+        groupkey_names = [SPARK_INDEX_NAME_FORMAT(i) for i in range(len(self._groupkeys))]
+        internal, agg_columns, sdf = self._prepare_reduce(
+            groupkey_names=groupkey_names,
+            accepted_spark_types=(NumericType, BooleanType),
+            bool_to_numeric=False,
+        )
+        psdf: DataFrame = DataFrame(internal)
+
+        if len(psdf._internal.column_labels) > 0:
+            window = Window.partitionBy(groupkey_names).rowsBetween(
+                Window.unboundedPreceding, Window.unboundedFollowing
+            )
+            new_agg_scols = {}
+            new_stat_scols = []
+            for agg_column in agg_columns:
+                # it is not able to directly use 'self._reduce_for_stat_function', due to
+                # 'it is not allowed to use a window function inside an aggregate function'.
+                # so we need to create temporary columns to compute the 'abs(x - avg(x))' here.
+                agg_column_name = agg_column._internal.data_spark_column_names[0]
+                new_agg_column_name = verify_temp_column_name(
+                    psdf._internal.spark_frame, "__tmp_agg_col_{}__".format(agg_column_name)
+                )
+                casted_agg_scol = F.col(agg_column_name).cast("double")
+                new_agg_scols[new_agg_column_name] = F.abs(
+                    casted_agg_scol - F.avg(casted_agg_scol).over(window)
+                )
+                new_stat_scols.append(F.avg(F.col(new_agg_column_name)).alias(agg_column_name))
+
+            sdf = (
+                psdf._internal.spark_frame.withColumns(new_agg_scols)
+                .groupby(groupkey_names)
+                .agg(*new_stat_scols)
+            )
+        else:
+            sdf = sdf.select(*groupkey_names).distinct()
+
+        internal = internal.copy(
+            spark_frame=sdf,
+            index_spark_columns=[scol_for(sdf, 
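
The diff builds, per group, a temporary `abs(x - avg(x))` column over an unbounded window and then takes a plain average, because a window function cannot appear inside an aggregate function. A minimal, self-contained sketch of that trick against the Spark SQL Scala API (illustrative object, data, and column names; not part of the patch):

```
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{abs, avg, col}

object GroupByMadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    import spark.implicits._

    // Columns A and B from the transcript in the commit message above.
    val df = Seq((1, 3.0), (2, 2.0), (2, 3.0), (1, 9.0), (1, 0.0)).toDF("A", "B")

    // Unbounded window per group key, like the patch's Window.partitionBy(...).
    val w = Window.partitionBy("A")
      .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

    // Temporary |x - avg(x)| column (the patch's "__tmp_agg_col_*__" columns),
    // then an ordinary avg() per group.
    df.withColumn("absDev", abs(col("B") - avg(col("B")).over(w)))
      .groupBy("A")
      .agg(avg(col("absDev")).alias("madB"))
      .show()
    // A=1: mean 4.0, |deviations| 1, 5, 4 -> mad 10/3 = 3.33...; A=2: mad 0.5

    spark.stop()
  }
}
```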

svn commit: r54856 - in /dev/spark/v3.3.0-rc5-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/R/articles/ _site/api/R/deps/ _site/api/R/deps/bootstrap-5.1.0/ _site/api/R/deps/jquery-3.6.0/ _site/api

2022-06-04 Thread maxgekk
Author: maxgekk
Date: Sat Jun  4 09:52:12 2022
New Revision: 54856

Log:
Apache Spark v3.3.0-rc5 docs


[This commit notification would consist of 2665 parts,
which exceeds the limit of 50, so it was shortened to a summary.]

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



svn commit: r54854 - /dev/spark/v3.3.0-rc5-bin/

2022-06-04 Thread maxgekk
Author: maxgekk
Date: Sat Jun  4 09:15:39 2022
New Revision: 54854

Log:
Apache Spark v3.3.0-rc5

Added:
dev/spark/v3.3.0-rc5-bin/
dev/spark/v3.3.0-rc5-bin/SparkR_3.3.0.tar.gz   (with props)
dev/spark/v3.3.0-rc5-bin/SparkR_3.3.0.tar.gz.asc
dev/spark/v3.3.0-rc5-bin/SparkR_3.3.0.tar.gz.sha512
dev/spark/v3.3.0-rc5-bin/pyspark-3.3.0.tar.gz   (with props)
dev/spark/v3.3.0-rc5-bin/pyspark-3.3.0.tar.gz.asc
dev/spark/v3.3.0-rc5-bin/pyspark-3.3.0.tar.gz.sha512
dev/spark/v3.3.0-rc5-bin/spark-3.3.0-bin-hadoop2.tgz   (with props)
dev/spark/v3.3.0-rc5-bin/spark-3.3.0-bin-hadoop2.tgz.asc
dev/spark/v3.3.0-rc5-bin/spark-3.3.0-bin-hadoop2.tgz.sha512
dev/spark/v3.3.0-rc5-bin/spark-3.3.0-bin-hadoop3-scala2.13.tgz   (with props)
dev/spark/v3.3.0-rc5-bin/spark-3.3.0-bin-hadoop3-scala2.13.tgz.asc
dev/spark/v3.3.0-rc5-bin/spark-3.3.0-bin-hadoop3-scala2.13.tgz.sha512
dev/spark/v3.3.0-rc5-bin/spark-3.3.0-bin-hadoop3.tgz   (with props)
dev/spark/v3.3.0-rc5-bin/spark-3.3.0-bin-hadoop3.tgz.asc
dev/spark/v3.3.0-rc5-bin/spark-3.3.0-bin-hadoop3.tgz.sha512
dev/spark/v3.3.0-rc5-bin/spark-3.3.0-bin-without-hadoop.tgz   (with props)
dev/spark/v3.3.0-rc5-bin/spark-3.3.0-bin-without-hadoop.tgz.asc
dev/spark/v3.3.0-rc5-bin/spark-3.3.0-bin-without-hadoop.tgz.sha512
dev/spark/v3.3.0-rc5-bin/spark-3.3.0.tgz   (with props)
dev/spark/v3.3.0-rc5-bin/spark-3.3.0.tgz.asc
dev/spark/v3.3.0-rc5-bin/spark-3.3.0.tgz.sha512

Added: dev/spark/v3.3.0-rc5-bin/SparkR_3.3.0.tar.gz
==
Binary file - no diff available.

Propchange: dev/spark/v3.3.0-rc5-bin/SparkR_3.3.0.tar.gz
--
svn:mime-type = application/octet-stream

Added: dev/spark/v3.3.0-rc5-bin/SparkR_3.3.0.tar.gz.asc
==
--- dev/spark/v3.3.0-rc5-bin/SparkR_3.3.0.tar.gz.asc (added)
+++ dev/spark/v3.3.0-rc5-bin/SparkR_3.3.0.tar.gz.asc Sat Jun  4 09:15:39 2022
@@ -0,0 +1,17 @@
+-BEGIN PGP SIGNATURE-
+
+iQJHBAABCgAxFiEEgPuOvo66aFBJiXA0kbXcgV2/ENMFAmKbIlYTHG1heGdla2tA
+YXBhY2hlLm9yZwAKCRCRtdyBXb8Q02YyD/wPYvJ8VvweSAAYMFrbIU66mZdfqcS3
+IXY1KjmwzirOnWPN9ovrcwLjHKFkWtsOLVDum4x4ffQoZUBKn+4xz/FGQx5j70Uw
+pJ+GXXWQSBrhIs/CApCDz98Fx4KX+u6d/0qr5fpftOPJIQn/D9nwPOlF3NBaIhOv
+jYqvZKEXqVgkZ/TSVnFHP4BRUYW7norV3F6s229KVvdvHE1wlUt3TnMk6ouSEmgM
+a4AZXxryhD0BtkB+9+8WmVWjYQjX8NlPp4wB7fi2p4RfSLKDiDWP+ompSqwh65GD
+k1hc32cgEQIAeVrh5O1ssT65PPOjNacVKzjRojujo1esKFyzaBdl6Ew04aCtsliF
+mEO9XI1Jh+NijDnhxcRA8gck+hI/AqDf9aXUcSWyQm/BclygtHs/UXPYWPhgi1jm
+4pGPBtsYOESCU0PewomFbwU36nMX/roLPJRGxk3m1ItxxU9FfPEYpRCaBa4KQxzB
+g4gtAlYs3CokukoskZMl6nF22CnCaB/1PzaFExp7Tys9UgX6Pv5Vf0gwc94wRvKK
+9RWtbAsvL6Cqd5pwu8cmauS3++BIGq6r09bzLvL5hG2fQwZ4jrIZqyFBSoxghSTb
+uulNdjthA+c82How6/ACzTrTrtTh8LammrIuX95E545fMdDGBO2DYfjrK1g2BprC
+fKF4UyooOJUriA==
+=YOtt
+-END PGP SIGNATURE-

Added: dev/spark/v3.3.0-rc5-bin/SparkR_3.3.0.tar.gz.sha512
==
--- dev/spark/v3.3.0-rc5-bin/SparkR_3.3.0.tar.gz.sha512 (added)
+++ dev/spark/v3.3.0-rc5-bin/SparkR_3.3.0.tar.gz.sha512 Sat Jun  4 09:15:39 2022
@@ -0,0 +1 @@
+1b9fb801c955e1038122804678defdd2eaba0f26dd501e09a2f61e13c77b9292bf14ca9c25a8561ce23ff4ee50ebad6d047a34696394f520f059f8e32dc91a9a  SparkR_3.3.0.tar.gz

Added: dev/spark/v3.3.0-rc5-bin/pyspark-3.3.0.tar.gz
==
Binary file - no diff available.

Propchange: dev/spark/v3.3.0-rc5-bin/pyspark-3.3.0.tar.gz
--
svn:mime-type = application/octet-stream

Added: dev/spark/v3.3.0-rc5-bin/pyspark-3.3.0.tar.gz.asc
==
--- dev/spark/v3.3.0-rc5-bin/pyspark-3.3.0.tar.gz.asc (added)
+++ dev/spark/v3.3.0-rc5-bin/pyspark-3.3.0.tar.gz.asc Sat Jun  4 09:15:39 2022
@@ -0,0 +1,17 @@
+-BEGIN PGP SIGNATURE-
+
+iQJHBAABCgAxFiEEgPuOvo66aFBJiXA0kbXcgV2/ENMFAmKbIlkTHG1heGdla2tA
+YXBhY2hlLm9yZwAKCRCRtdyBXb8Q01pkEACwe03A1jrjWnAN6evlwk0xxMugbZI+
+2xNUuHOAPNc6Z1rsYuZnh8WCHKVo/Ik0JEdpDAPQDGqC1Pwn4l4LFf9c6BiTTCRS
+14VsiJrERpzzBNT8lqVIT09Z2esLFjTiw7S/tXFwkSNPT6o+IZb3KxuTm6XREc1Q
+QmsbC/EfOmqxSlTdBf3Dq7T2RSSNyFHOLwdgtPUWNxSXhKGzQd6WYceUx2aCGkrv
+u/TGoPhQL+F15EmhrK5Pfrycvo4UbJrsWzBswUeQFbJ3klyQlPvOfdm/VZhWzG/a
+XGggZmTFiPEdFRJ9FRnArK9lng/8uUME/2Am9WTU28dkFRiaND/CARJ9NvYKyYIR
+TBOudzm+advHgOjiHS1FWLXG9sHdGvgjwFe/g3byzPqiCl2LmPencXCgH0lmRd/x
+H7HFp4nRQtWIVByedwSeFGJS4zZh42fWg4h7K6iP8dP4ZoepcuPGZw6qIi0P+tFh
+ATTimLDx28LhsiaRE7QP2xvYXI0yCIjeDLPGgbM9rpUapqwUMTcuDUtnFSKzV7QW
+Ly+jJpyBL6lSAy7N7e4mpCm8yEep/sdPCL/H7XF9cHCEV5Afnh/vqG63jXKxZYgz
+vRTW5oDMCn/mpxt8NxQXtiu7iXNJvAIPPJZWclCSoTBfkueQhyRCypYXF5//O6l8
+YZ4yF8LA+z0gDA==
+=2iWV
+-END PGP SIGNATURE-

Added: 

[spark] 01/01: Preparing development version 3.3.1-SNAPSHOT

2022-06-04 Thread maxgekk
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git

commit bf3c472ff87ab7ec17f55e4730d6c6c9a7f299ad
Author: Maxim Gekk 
AuthorDate: Sat Jun 4 06:43:12 2022 +

Preparing development version 3.3.1-SNAPSHOT
---
 R/pkg/DESCRIPTION  | 2 +-
 assembly/pom.xml   | 2 +-
 common/kvstore/pom.xml | 2 +-
 common/network-common/pom.xml  | 2 +-
 common/network-shuffle/pom.xml | 2 +-
 common/network-yarn/pom.xml| 2 +-
 common/sketch/pom.xml  | 2 +-
 common/tags/pom.xml| 2 +-
 common/unsafe/pom.xml  | 2 +-
 core/pom.xml   | 2 +-
 docs/_config.yml   | 6 +++---
 examples/pom.xml   | 2 +-
 external/avro/pom.xml  | 2 +-
 external/docker-integration-tests/pom.xml  | 2 +-
 external/kafka-0-10-assembly/pom.xml   | 2 +-
 external/kafka-0-10-sql/pom.xml| 2 +-
 external/kafka-0-10-token-provider/pom.xml | 2 +-
 external/kafka-0-10/pom.xml| 2 +-
 external/kinesis-asl-assembly/pom.xml  | 2 +-
 external/kinesis-asl/pom.xml   | 2 +-
 external/spark-ganglia-lgpl/pom.xml| 2 +-
 graphx/pom.xml | 2 +-
 hadoop-cloud/pom.xml   | 2 +-
 launcher/pom.xml   | 2 +-
 mllib-local/pom.xml| 2 +-
 mllib/pom.xml  | 2 +-
 pom.xml| 2 +-
 repl/pom.xml   | 2 +-
 resource-managers/kubernetes/core/pom.xml  | 2 +-
 resource-managers/kubernetes/integration-tests/pom.xml | 2 +-
 resource-managers/mesos/pom.xml| 2 +-
 resource-managers/yarn/pom.xml | 2 +-
 sql/catalyst/pom.xml   | 2 +-
 sql/core/pom.xml   | 2 +-
 sql/hive-thriftserver/pom.xml  | 2 +-
 sql/hive/pom.xml   | 2 +-
 streaming/pom.xml  | 2 +-
 tools/pom.xml  | 2 +-
 38 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 9479bb3bf87..0e449e841cf 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: SparkR
 Type: Package
-Version: 3.3.0
+Version: 3.3.1
 Title: R Front End for 'Apache Spark'
 Description: Provides an R Front end for 'Apache Spark' <https://spark.apache.org>.
 Authors@R:
diff --git a/assembly/pom.xml b/assembly/pom.xml
index 2e9c4d9960b..d12f2ad73fa 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.12</artifactId>
-    <version>3.3.0</version>
+    <version>3.3.1-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml
index 2a9acfa335e..842d63f5d38 100644
--- a/common/kvstore/pom.xml
+++ b/common/kvstore/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.12</artifactId>
-    <version>3.3.0</version>
+    <version>3.3.1-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml
index 7b17e625d75..f7d187bf952 100644
--- a/common/network-common/pom.xml
+++ b/common/network-common/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.12</artifactId>
-    <version>3.3.0</version>
+    <version>3.3.1-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml
index c5c920e7747..53f38df8851 100644
--- a/common/network-shuffle/pom.xml
+++ b/common/network-shuffle/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.12</artifactId>
-    <version>3.3.0</version>
+    <version>3.3.1-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml
index 697b5a3928e..845f6659407 100644
--- a/common/network-yarn/pom.xml
+++ b/common/network-yarn/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.12</artifactId>
-    <version>3.3.0</version>
+    <version>3.3.1-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml
index ad2db11370a..8e159089193 100644
--- a/common/sketch/pom.xml
+++ b/common/sketch/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.12</artifactId>
-    <version>3.3.0</version>
+    <version>3.3.1-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/common/tags/pom.xml b/common/tags/pom.xml
index 1a7bdee70f3..1987c133285 100644
--- a/common/tags/pom.xml
+++ 

[spark] branch branch-3.3 updated (b7e95bad882 -> bf3c472ff87)

2022-06-04 Thread maxgekk
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a change to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


from b7e95bad882 [SPARK-39259][SQL][FOLLOWUP] Fix source and binary 
incompatibilities in transformDownWithSubqueries
 add 7cf29705272 Preparing Spark release v3.3.0-rc5
 new bf3c472ff87 Preparing development version 3.3.1-SNAPSHOT

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] tag v3.3.0-rc5 created (now 7cf29705272)

2022-06-04 Thread maxgekk
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a change to tag v3.3.0-rc5
in repository https://gitbox.apache.org/repos/asf/spark.git


  at 7cf29705272 (commit)
This tag includes the following new commits:

 new 7cf29705272 Preparing Spark release v3.3.0-rc5

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] 01/01: Preparing Spark release v3.3.0-rc5

2022-06-04 Thread maxgekk
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to tag v3.3.0-rc5
in repository https://gitbox.apache.org/repos/asf/spark.git

commit 7cf29705272ab8e8c70e8885a3664ad8ae3cd5e9
Author: Maxim Gekk 
AuthorDate: Sat Jun 4 06:43:05 2022 +

Preparing Spark release v3.3.0-rc5
---
 R/pkg/DESCRIPTION  | 2 +-
 assembly/pom.xml   | 2 +-
 common/kvstore/pom.xml | 2 +-
 common/network-common/pom.xml  | 2 +-
 common/network-shuffle/pom.xml | 2 +-
 common/network-yarn/pom.xml| 2 +-
 common/sketch/pom.xml  | 2 +-
 common/tags/pom.xml| 2 +-
 common/unsafe/pom.xml  | 2 +-
 core/pom.xml   | 2 +-
 docs/_config.yml   | 6 +++---
 examples/pom.xml   | 2 +-
 external/avro/pom.xml  | 2 +-
 external/docker-integration-tests/pom.xml  | 2 +-
 external/kafka-0-10-assembly/pom.xml   | 2 +-
 external/kafka-0-10-sql/pom.xml| 2 +-
 external/kafka-0-10-token-provider/pom.xml | 2 +-
 external/kafka-0-10/pom.xml| 2 +-
 external/kinesis-asl-assembly/pom.xml  | 2 +-
 external/kinesis-asl/pom.xml   | 2 +-
 external/spark-ganglia-lgpl/pom.xml| 2 +-
 graphx/pom.xml | 2 +-
 hadoop-cloud/pom.xml   | 2 +-
 launcher/pom.xml   | 2 +-
 mllib-local/pom.xml| 2 +-
 mllib/pom.xml  | 2 +-
 pom.xml| 2 +-
 repl/pom.xml   | 2 +-
 resource-managers/kubernetes/core/pom.xml  | 2 +-
 resource-managers/kubernetes/integration-tests/pom.xml | 2 +-
 resource-managers/mesos/pom.xml| 2 +-
 resource-managers/yarn/pom.xml | 2 +-
 sql/catalyst/pom.xml   | 2 +-
 sql/core/pom.xml   | 2 +-
 sql/hive-thriftserver/pom.xml  | 2 +-
 sql/hive/pom.xml   | 2 +-
 streaming/pom.xml  | 2 +-
 tools/pom.xml  | 2 +-
 38 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 0e449e841cf..9479bb3bf87 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: SparkR
 Type: Package
-Version: 3.3.1
+Version: 3.3.0
 Title: R Front End for 'Apache Spark'
 Description: Provides an R Front end for 'Apache Spark' <https://spark.apache.org>.
 Authors@R:
diff --git a/assembly/pom.xml b/assembly/pom.xml
index d12f2ad73fa..2e9c4d9960b 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.12</artifactId>
-    <version>3.3.1-SNAPSHOT</version>
+    <version>3.3.0</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml
index 842d63f5d38..2a9acfa335e 100644
--- a/common/kvstore/pom.xml
+++ b/common/kvstore/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.12</artifactId>
-    <version>3.3.1-SNAPSHOT</version>
+    <version>3.3.0</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml
index f7d187bf952..7b17e625d75 100644
--- a/common/network-common/pom.xml
+++ b/common/network-common/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.12</artifactId>
-    <version>3.3.1-SNAPSHOT</version>
+    <version>3.3.0</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml
index 53f38df8851..c5c920e7747 100644
--- a/common/network-shuffle/pom.xml
+++ b/common/network-shuffle/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.12</artifactId>
-    <version>3.3.1-SNAPSHOT</version>
+    <version>3.3.0</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml
index 845f6659407..697b5a3928e 100644
--- a/common/network-yarn/pom.xml
+++ b/common/network-yarn/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.12</artifactId>
-    <version>3.3.1-SNAPSHOT</version>
+    <version>3.3.0</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml
index 8e159089193..ad2db11370a 100644
--- a/common/sketch/pom.xml
+++ b/common/sketch/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.12</artifactId>
-    <version>3.3.1-SNAPSHOT</version>
+    <version>3.3.0</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/common/tags/pom.xml b/common/tags/pom.xml
index 1987c133285..1a7bdee70f3 100644
--- a/common/tags/pom.xml
+++ 

[spark] branch branch-3.3 updated: [SPARK-39259][SQL][FOLLOWUP] Fix source and binary incompatibilities in transformDownWithSubqueries

2022-06-04 Thread maxgekk
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
 new b7e95bad882 [SPARK-39259][SQL][FOLLOWUP] Fix source and binary 
incompatibilities in transformDownWithSubqueries
b7e95bad882 is described below

commit b7e95bad882482168b7dd301fcfa3daf80477a7a
Author: Josh Rosen 
AuthorDate: Sat Jun 4 09:12:42 2022 +0300

[SPARK-39259][SQL][FOLLOWUP] Fix source and binary incompatibilities in 
transformDownWithSubqueries

### What changes were proposed in this pull request?

This is a followup to #36654. That PR modified the existing 
`QueryPlan.transformDownWithSubqueries` to add additional arguments for tree 
pattern pruning.

In this PR, I roll back the change to that method's signature and instead 
add a new `transformDownWithSubqueriesAndPruning` method.

### Why are the changes needed?

The original change breaks binary and source compatibility in Catalyst. 
Technically speaking, Catalyst APIs are considered internal to Spark and are 
subject to change between minor releases (see 
[source](https://github.com/apache/spark/blob/bb51add5c79558df863d37965603387d40cc4387/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/package.scala#L20-L24)),
 but I think it's nice to try to avoid API breakage when possible.

While trying to compile some custom Catalyst code, I ran into issues when 
trying to call the `transformDownWithSubqueries` method without supplying a 
tree pattern filter condition. If I do `transformDownWithSubqueries() { f }` 
then I get a compilation error. I think this is due to the first parameter 
group containing all default parameters.

My PR's solution of adding a new `transformDownWithSubqueriesAndPruning` 
method solves this problem. It's also more consistent with the naming 
convention used for other pruning-enabled tree transformation methods.
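
A toy model of that design point in plain Scala (hypothetical names, none of Spark's actual classes): a single-purpose, no-default entry method delegating to an explicitly named pruning-enabled variant keeps the old curried call shape compiling.

```
// Toy model, not Spark code: Tree stands in for PlanType.
object TransformShapes {
  type Tree = String

  // Old-shaped entry point: one parameter list, no defaults, so existing
  // calls like transformDown { f } continue to compile unchanged.
  def transformDown(f: Tree => Tree): Tree =
    transformDownAndPruning(_ => true)(f)

  // Pruning-enabled variant under its own explicit name, mirroring the
  // transformDownWithSubqueriesAndPruning naming convention.
  def transformDownAndPruning(cond: Tree => Boolean)(f: Tree => Tree): Tree =
    if (cond("root")) f("root") else "root"

  def main(args: Array[String]): Unit = {
    println(transformDown(_.toUpperCase))                         // ROOT
    println(transformDownAndPruning(_.nonEmpty) { t => t + "!" }) // root!
  }
}
```

Here `Tree`, `transformDown`, and `transformDownAndPruning` are hypothetical stand-ins for `PlanType`, `transformDownWithSubqueries`, and `transformDownWithSubqueriesAndPruning`.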

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Existing tests.

Closes #36765 from JoshRosen/SPARK-39259-binary-compatibility-followup.

Authored-by: Josh Rosen 
Signed-off-by: Max Gekk 
(cherry picked from commit eda6c4b9987f0515cb0aae4686c8a0ae0a3987d4)
Signed-off-by: Max Gekk 
---
 .../sql/catalyst/optimizer/finishAnalysis.scala|  2 +-
 .../spark/sql/catalyst/plans/QueryPlan.scala   | 22 --
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
index 242c799dd22..a33069051d9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
@@ -84,7 +84,7 @@ object ComputeCurrentTime extends Rule[LogicalPlan] {
   treePatternbits.containsPattern(CURRENT_LIKE)
 }
 
-plan.transformDownWithSubqueries(transformCondition) {
+plan.transformDownWithSubqueriesAndPruning(transformCondition) {
   case subQuery =>
 subQuery.transformAllExpressionsWithPruning(transformCondition) {
   case cd: CurrentDate =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
index d0283f4d367..cc62c81b101 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
@@ -454,7 +454,7 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]]
    * to rewrite the whole plan, include its subqueries, in one go.
    */
   def transformWithSubqueries(f: PartialFunction[PlanType, PlanType]): PlanType =
-    transformDownWithSubqueries(AlwaysProcess.fn, UnknownRuleId)(f)
+    transformDownWithSubqueries(f)
 
   /**
    * Returns a copy of this node where the given partial function has been recursively applied
@@ -479,10 +479,20 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]]
    * first to this node, then this node's subqueries and finally this node's children.
    * When the partial function does not apply to a given node, it is left unchanged.
    */
-  def transformDownWithSubqueries(
-    cond: TreePatternBits => Boolean = AlwaysProcess.fn, ruleId: RuleId = UnknownRuleId)
-    (f: PartialFunction[PlanType, PlanType])
-: PlanType = {
+  def transformDownWithSubqueries(f: PartialFunction[PlanType, PlanType]): PlanType = {
+    transformDownWithSubqueriesAndPruning(AlwaysProcess.fn, UnknownRuleId)(f)
+  }
+
+  /**
+   * This method is the top-down (pre-order) counterpart of transformUpWithSubqueries.

[spark] branch master updated: [SPARK-39259][SQL][FOLLOWUP] Fix source and binary incompatibilities in transformDownWithSubqueries

2022-06-04 Thread maxgekk
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new eda6c4b9987 [SPARK-39259][SQL][FOLLOWUP] Fix source and binary 
incompatibilities in transformDownWithSubqueries
eda6c4b9987 is described below

commit eda6c4b9987f0515cb0aae4686c8a0ae0a3987d4
Author: Josh Rosen 
AuthorDate: Sat Jun 4 09:12:42 2022 +0300

[SPARK-39259][SQL][FOLLOWUP] Fix source and binary incompatibilities in 
transformDownWithSubqueries

### What changes were proposed in this pull request?

This is a followup to #36654. That PR modified the existing 
`QueryPlan.transformDownWithSubqueries` to add additional arguments for tree 
pattern pruning.

In this PR, I roll back the change to that method's signature and instead 
add a new `transformDownWithSubqueriesAndPruning` method.

### Why are the changes needed?

The original change breaks binary and source compatibility in Catalyst. 
Technically speaking, Catalyst APIs are considered internal to Spark and are 
subject to change between minor releases (see 
[source](https://github.com/apache/spark/blob/bb51add5c79558df863d37965603387d40cc4387/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/package.scala#L20-L24)),
 but I think it's nice to try to avoid API breakage when possible.

While trying to compile some custom Catalyst code, I ran into issues when 
trying to call the `transformDownWithSubqueries` method without supplying a 
tree pattern filter condition. If I do `transformDownWithSubqueries() { f }` 
then I get a compilation error. I think this is due to the first parameter 
group containing all default parameters.

My PR's solution of adding a new `transformDownWithSubqueriesAndPruning` 
method solves this problem. It's also more consistent with the naming 
convention used for other pruning-enabled tree transformation methods.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Existing tests.

Closes #36765 from JoshRosen/SPARK-39259-binary-compatibility-followup.

Authored-by: Josh Rosen 
Signed-off-by: Max Gekk 
---
 .../sql/catalyst/optimizer/finishAnalysis.scala|  2 +-
 .../spark/sql/catalyst/plans/QueryPlan.scala   | 22 --
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
index 242c799dd22..a33069051d9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala
@@ -84,7 +84,7 @@ object ComputeCurrentTime extends Rule[LogicalPlan] {
   treePatternbits.containsPattern(CURRENT_LIKE)
 }
 
-plan.transformDownWithSubqueries(transformCondition) {
+plan.transformDownWithSubqueriesAndPruning(transformCondition) {
   case subQuery =>
 subQuery.transformAllExpressionsWithPruning(transformCondition) {
   case cd: CurrentDate =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
index d0283f4d367..cc62c81b101 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
@@ -454,7 +454,7 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]]
    * to rewrite the whole plan, include its subqueries, in one go.
    */
   def transformWithSubqueries(f: PartialFunction[PlanType, PlanType]): PlanType =
-    transformDownWithSubqueries(AlwaysProcess.fn, UnknownRuleId)(f)
+    transformDownWithSubqueries(f)
 
   /**
    * Returns a copy of this node where the given partial function has been recursively applied
@@ -479,10 +479,20 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]]
    * first to this node, then this node's subqueries and finally this node's children.
    * When the partial function does not apply to a given node, it is left unchanged.
    */
-  def transformDownWithSubqueries(
-    cond: TreePatternBits => Boolean = AlwaysProcess.fn, ruleId: RuleId = UnknownRuleId)
-    (f: PartialFunction[PlanType, PlanType])
-: PlanType = {
+  def transformDownWithSubqueries(f: PartialFunction[PlanType, PlanType]): PlanType = {
+    transformDownWithSubqueriesAndPruning(AlwaysProcess.fn, UnknownRuleId)(f)
+  }
+
+  /**
+   * This method is the top-down (pre-order) counterpart of transformUpWithSubqueries.
+   * Returns a copy of this node where the given partial function has been