[spark] branch branch-3.2 updated: [SPARK-36705][FOLLOW-UP] Fix unnecessary logWarning when PUSH_BASED_SHUFFLE_ENABLED is set to false
This is an automated email from the ASF dual-hosted git repository. mridulm80 pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 4a486f4 [SPARK-36705][FOLLOW-UP] Fix unnecessary logWarning when PUSH_BASED_SHUFFLE_ENABLED is set to false 4a486f4 is described below commit 4a486f40cf81f9a602e62e2e2bb050a6ac175f57 Author: Minchu Yang AuthorDate: Mon Sep 13 23:23:33 2021 -0500 [SPARK-36705][FOLLOW-UP] Fix unnecessary logWarning when PUSH_BASED_SHUFFLE_ENABLED is set to false ### What changes were proposed in this pull request? Only throw logWarning when `PUSH_BASED_SHUFFLE_ENABLED` is set to true and `canDoPushBasedShuffle` is false ### Why are the changes needed? Currently, this logWarning will still be printed out even when `PUSH_BASED_SHUFFLE_ENABLED` is set to false, which is unnecessary. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Passed existing UT. Closes #33984 from rmcyang/SPARK-36705-follow-up. Authored-by: Minchu Yang Signed-off-by: Mridul Muralidharan gmail.com> (cherry picked from commit 2d7dc7c7ce6d524a232f37927ca179f162ad9971) Signed-off-by: Mridul Muralidharan --- .../main/scala/org/apache/spark/util/Utils.scala | 38 -- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index a112214..1a276f3 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2598,23 +2598,27 @@ private[spark] object Utils extends Logging { * - serializer(such as KryoSerializer) supports relocation of serialized objects */ def isPushBasedShuffleEnabled(conf: SparkConf): Boolean = { -val serializer = Utils.classForName(conf.get(SERIALIZER)).getConstructor(classOf[SparkConf]) - .newInstance(conf).asInstanceOf[Serializer] -val canDoPushBasedShuffle = - conf.get(PUSH_BASED_SHUFFLE_ENABLED) && -(conf.get(IS_TESTING).getOrElse(false) || - (conf.get(SHUFFLE_SERVICE_ENABLED) && -conf.get(SparkLauncher.SPARK_MASTER, null) == "yarn" && -// TODO: [SPARK-36744] needs to support IO encryption for push-based shuffle -!conf.get(IO_ENCRYPTION_ENABLED) && -serializer.supportsRelocationOfSerializedObjects)) - -if (!canDoPushBasedShuffle) { - logWarning("Push-based shuffle can only be enabled when the application is submitted" + -"to run in YARN mode, with external shuffle service enabled, IO encryption disabled, and" + -"relocation of serialized objects supported.") -} -canDoPushBasedShuffle +val pushBasedShuffleEnabled = conf.get(PUSH_BASED_SHUFFLE_ENABLED) +if (pushBasedShuffleEnabled) { + val serializer = Utils.classForName(conf.get(SERIALIZER)).getConstructor(classOf[SparkConf]) +.newInstance(conf).asInstanceOf[Serializer] + val canDoPushBasedShuffle = conf.get(IS_TESTING).getOrElse(false) || +(conf.get(SHUFFLE_SERVICE_ENABLED) && + conf.get(SparkLauncher.SPARK_MASTER, null) == "yarn" && + // TODO: [SPARK-36744] needs to support IO encryption for push-based shuffle + !conf.get(IO_ENCRYPTION_ENABLED) && + serializer.supportsRelocationOfSerializedObjects) + + if (!canDoPushBasedShuffle) { +logWarning("Push-based shuffle can only be enabled when the application is submitted " + + "to run in YARN mode, with external shuffle service enabled, IO encryption disabled, " + + "and relocation of serialized objects supported.") + } + + canDoPushBasedShuffle +} else { + false 
+}
 }

 /**

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-36705][FOLLOW-UP] Fix unnecessary logWarning when PUSH_BASED_SHUFFLE_ENABLED is set to false
This is an automated email from the ASF dual-hosted git repository. mridulm80 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 2d7dc7c [SPARK-36705][FOLLOW-UP] Fix unnecessary logWarning when PUSH_BASED_SHUFFLE_ENABLED is set to false 2d7dc7c is described below commit 2d7dc7c7ce6d524a232f37927ca179f162ad9971 Author: Minchu Yang AuthorDate: Mon Sep 13 23:23:33 2021 -0500 [SPARK-36705][FOLLOW-UP] Fix unnecessary logWarning when PUSH_BASED_SHUFFLE_ENABLED is set to false ### What changes were proposed in this pull request? Only throw logWarning when `PUSH_BASED_SHUFFLE_ENABLED` is set to true and `canDoPushBasedShuffle` is false ### Why are the changes needed? Currently, this logWarning will still be printed out even when `PUSH_BASED_SHUFFLE_ENABLED` is set to false, which is unnecessary. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Passed existing UT. Closes #33984 from rmcyang/SPARK-36705-follow-up. Authored-by: Minchu Yang Signed-off-by: Mridul Muralidharan gmail.com> --- .../main/scala/org/apache/spark/util/Utils.scala | 38 -- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index bbff56c..f894b83 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2604,23 +2604,27 @@ private[spark] object Utils extends Logging { * - serializer(such as KryoSerializer) supports relocation of serialized objects */ def isPushBasedShuffleEnabled(conf: SparkConf): Boolean = { -val serializer = Utils.classForName(conf.get(SERIALIZER)).getConstructor(classOf[SparkConf]) - .newInstance(conf).asInstanceOf[Serializer] -val canDoPushBasedShuffle = - conf.get(PUSH_BASED_SHUFFLE_ENABLED) && -(conf.get(IS_TESTING).getOrElse(false) || - (conf.get(SHUFFLE_SERVICE_ENABLED) && -conf.get(SparkLauncher.SPARK_MASTER, null) == "yarn" && -// TODO: [SPARK-36744] needs to support IO encryption for push-based shuffle -!conf.get(IO_ENCRYPTION_ENABLED) && -serializer.supportsRelocationOfSerializedObjects)) - -if (!canDoPushBasedShuffle) { - logWarning("Push-based shuffle can only be enabled when the application is submitted" + -"to run in YARN mode, with external shuffle service enabled, IO encryption disabled, and" + -"relocation of serialized objects supported.") -} -canDoPushBasedShuffle +val pushBasedShuffleEnabled = conf.get(PUSH_BASED_SHUFFLE_ENABLED) +if (pushBasedShuffleEnabled) { + val serializer = Utils.classForName(conf.get(SERIALIZER)).getConstructor(classOf[SparkConf]) +.newInstance(conf).asInstanceOf[Serializer] + val canDoPushBasedShuffle = conf.get(IS_TESTING).getOrElse(false) || +(conf.get(SHUFFLE_SERVICE_ENABLED) && + conf.get(SparkLauncher.SPARK_MASTER, null) == "yarn" && + // TODO: [SPARK-36744] needs to support IO encryption for push-based shuffle + !conf.get(IO_ENCRYPTION_ENABLED) && + serializer.supportsRelocationOfSerializedObjects) + + if (!canDoPushBasedShuffle) { +logWarning("Push-based shuffle can only be enabled when the application is submitted " + + "to run in YARN mode, with external shuffle service enabled, IO encryption disabled, " + + "and relocation of serialized objects supported.") + } + + canDoPushBasedShuffle +} else { + false +} } /** - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: 
commits-h...@spark.apache.org
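Since the patch is hard to follow in the flattened diff above, here is a simplified, hedged sketch of the control flow it introduces. It uses plain SparkConf keys instead of Spark's internal config entries, omits the `IS_TESTING` branch, and substitutes a crude stand-in for `supportsRelocationOfSerializedObjects`; it is an illustration of the restructuring, not the actual Utils code.

```scala
import org.apache.spark.SparkConf

// Sketch only: the serializer is inspected and the warning emitted
// only when the push-based-shuffle flag itself is enabled.
def isPushBasedShuffleEnabledSketch(conf: SparkConf): Boolean = {
  val pushBasedShuffleEnabled = conf.getBoolean("spark.shuffle.push.enabled", defaultValue = false)
  if (pushBasedShuffleEnabled) {
    // Crude stand-in for Serializer.supportsRelocationOfSerializedObjects.
    val serializerSupportsRelocation =
      conf.get("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
        .endsWith("KryoSerializer")
    val canDoPushBasedShuffle =
      conf.getBoolean("spark.shuffle.service.enabled", defaultValue = false) &&
        conf.get("spark.master", null) == "yarn" &&
        !conf.getBoolean("spark.io.encryption.enabled", defaultValue = false) &&
        serializerSupportsRelocation
    if (!canDoPushBasedShuffle) {
      // logWarning(...) in the real code; printed here to keep the sketch dependency-free.
      System.err.println("WARN: push-based shuffle is enabled but its prerequisites are not met")
    }
    canDoPushBasedShuffle
  } else {
    false
  }
}
```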
[spark] branch master updated (52c5ff2 -> 1ed671c)
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

 from 52c5ff2  [SPARK-36715][SQL] InferFiltersFromGenerate should not infer filter for udf
  add 1ed671c  [SPARK-36748][PYTHON] Introduce the 'compute.isin_limit' option

No new revisions were added by this update.

Summary of changes:
 python/docs/source/user_guide/pandas_on_spark/options.rst |  4
 python/pyspark/pandas/config.py                            | 14 ++
 2 files changed, 18 insertions(+)

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.1 updated: [SPARK-36715][SQL] InferFiltersFromGenerate should not infer filter for udf
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.1 by this push: new 7a27ea7 [SPARK-36715][SQL] InferFiltersFromGenerate should not infer filter for udf 7a27ea7 is described below commit 7a27ea7382e3460b43dcc36dab6f31b2a0a87565 Author: Fu Chen AuthorDate: Tue Sep 14 09:26:11 2021 +0900 [SPARK-36715][SQL] InferFiltersFromGenerate should not infer filter for udf ### What changes were proposed in this pull request? Fix InferFiltersFromGenerate bug, InferFiltersFromGenerate should not infer filter for generate when the children contain an expression which is instance of `org.apache.spark.sql.catalyst.expressions.UserDefinedExpression`. Before this pr, the following case will throw an exception. ```scala spark.udf.register("vec", (i: Int) => (0 until i).toArray) sql("select explode(vec(8)) as c1").show ``` ``` Once strategy's idempotence is broken for batch Infer Filters GlobalLimit 21 GlobalLimit 21 +- LocalLimit 21 +- LocalLimit 21 +- Project [cast(c1#3 as string) AS c1#12] +- Project [cast(c1#3 as string) AS c1#12] +- Generate explode(vec(8)), false, [c1#3] +- Generate explode(vec(8)), false, [c1#3] +- Filter ((size(vec(8), true) > 0) AND isnotnull(vec(8))) +- Filter ((size(vec(8), true) > 0) AND isnotnull(vec(8))) !+- OneRowRelation +- Filter ((size(vec(8), true) > 0) AND isnotnull(vec(8))) ! +- OneRowRelation java.lang.RuntimeException: Once strategy's idempotence is broken for batch Infer Filters GlobalLimit 21 GlobalLimit 21 +- LocalLimit 21 +- LocalLimit 21 +- Project [cast(c1#3 as string) AS c1#12] +- Project [cast(c1#3 as string) AS c1#12] +- Generate explode(vec(8)), false, [c1#3] +- Generate explode(vec(8)), false, [c1#3] +- Filter ((size(vec(8), true) > 0) AND isnotnull(vec(8))) +- Filter ((size(vec(8), true) > 0) AND isnotnull(vec(8))) !+- OneRowRelation +- Filter ((size(vec(8), true) > 0) AND isnotnull(vec(8))) ! 
+- OneRowRelation at org.apache.spark.sql.errors.QueryExecutionErrors$.onceStrategyIdempotenceIsBrokenForBatchError(QueryExecutionErrors.scala:1200) at org.apache.spark.sql.catalyst.rules.RuleExecutor.checkBatchIdempotence(RuleExecutor.scala:168) at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:254) at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:200) at scala.collection.immutable.List.foreach(List.scala:431) at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:200) at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:179) at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:88) at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:179) at org.apache.spark.sql.execution.QueryExecution.$anonfun$optimizedPlan$1(QueryExecution.scala:138) at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111) at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:196) at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775) at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:196) at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:134) at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:130) at org.apache.spark.sql.execution.QueryExecution.assertOptimized(QueryExecution.scala:148) at org.apache.spark.sql.execution.QueryExecution.$anonfun$executedPlan$1(QueryExecution.scala:166) at org.apache.spark.sql.execution.QueryExecution.withCteMap(QueryExecution.scala:73) at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:163) at org.apache.spark.
[spark] branch branch-3.2 updated: [SPARK-36715][SQL] InferFiltersFromGenerate should not infer filter for udf
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 303590b [SPARK-36715][SQL] InferFiltersFromGenerate should not infer filter for udf 303590b is described below commit 303590b3e928d1658f05200b46025d53eceb0167 Author: Fu Chen AuthorDate: Tue Sep 14 09:26:11 2021 +0900 [SPARK-36715][SQL] InferFiltersFromGenerate should not infer filter for udf ### What changes were proposed in this pull request? Fix InferFiltersFromGenerate bug, InferFiltersFromGenerate should not infer filter for generate when the children contain an expression which is instance of `org.apache.spark.sql.catalyst.expressions.UserDefinedExpression`. Before this pr, the following case will throw an exception. ```scala spark.udf.register("vec", (i: Int) => (0 until i).toArray) sql("select explode(vec(8)) as c1").show ``` ``` Once strategy's idempotence is broken for batch Infer Filters GlobalLimit 21 GlobalLimit 21 +- LocalLimit 21 +- LocalLimit 21 +- Project [cast(c1#3 as string) AS c1#12] +- Project [cast(c1#3 as string) AS c1#12] +- Generate explode(vec(8)), false, [c1#3] +- Generate explode(vec(8)), false, [c1#3] +- Filter ((size(vec(8), true) > 0) AND isnotnull(vec(8))) +- Filter ((size(vec(8), true) > 0) AND isnotnull(vec(8))) !+- OneRowRelation +- Filter ((size(vec(8), true) > 0) AND isnotnull(vec(8))) ! +- OneRowRelation java.lang.RuntimeException: Once strategy's idempotence is broken for batch Infer Filters GlobalLimit 21 GlobalLimit 21 +- LocalLimit 21 +- LocalLimit 21 +- Project [cast(c1#3 as string) AS c1#12] +- Project [cast(c1#3 as string) AS c1#12] +- Generate explode(vec(8)), false, [c1#3] +- Generate explode(vec(8)), false, [c1#3] +- Filter ((size(vec(8), true) > 0) AND isnotnull(vec(8))) +- Filter ((size(vec(8), true) > 0) AND isnotnull(vec(8))) !+- OneRowRelation +- Filter ((size(vec(8), true) > 0) AND isnotnull(vec(8))) ! 
+- OneRowRelation at org.apache.spark.sql.errors.QueryExecutionErrors$.onceStrategyIdempotenceIsBrokenForBatchError(QueryExecutionErrors.scala:1200) at org.apache.spark.sql.catalyst.rules.RuleExecutor.checkBatchIdempotence(RuleExecutor.scala:168) at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:254) at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:200) at scala.collection.immutable.List.foreach(List.scala:431) at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:200) at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:179) at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:88) at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:179) at org.apache.spark.sql.execution.QueryExecution.$anonfun$optimizedPlan$1(QueryExecution.scala:138) at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111) at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:196) at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775) at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:196) at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:134) at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:130) at org.apache.spark.sql.execution.QueryExecution.assertOptimized(QueryExecution.scala:148) at org.apache.spark.sql.execution.QueryExecution.$anonfun$executedPlan$1(QueryExecution.scala:166) at org.apache.spark.sql.execution.QueryExecution.withCteMap(QueryExecution.scala:73) at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:163) at org.apache.spark.
[spark] branch master updated (a440025 -> 52c5ff2)
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

 from a440025  [SPARK-36739][DOCS][PYTHON] Add apache license headers to makefiles
  add 52c5ff2  [SPARK-36715][SQL] InferFiltersFromGenerate should not infer filter for udf

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/catalyst/optimizer/Optimizer.scala  |  3 ++-
 .../optimizer/InferFiltersFromGenerateSuite.scala | 24 +-
 2 files changed, 25 insertions(+), 2 deletions(-)

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
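The fix itself comes down to a guard in the optimizer rule: do not infer the `size(...) > 0` / `isnotnull(...)` filters when the generator's expression tree contains a `UserDefinedExpression`, since (as the plan diff in the commit messages above shows) the inferred filter then embeds the UDF call and keeps being re-added, breaking the Infer Filters batch's idempotence. A minimal sketch of that predicate, assuming spark-catalyst is on the classpath; the real rule in Optimizer.scala may express it differently:

```scala
import org.apache.spark.sql.catalyst.expressions.{Expression, UserDefinedExpression}

// Sketch of the guard: only infer filters for generators whose expression
// tree contains no user-defined expression (e.g. a registered Scala UDF).
def canInferFiltersFor(generator: Expression): Boolean =
  generator.find(_.isInstanceOf[UserDefinedExpression]).isEmpty
```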
[spark] branch branch-3.2 updated: [SPARK-36739][DOCS][PYTHON] Add apache license headers to makefiles
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new b3488a5 [SPARK-36739][DOCS][PYTHON] Add apache license headers to makefiles b3488a5 is described below commit b3488a50d756bfba5b677246b2cd2ca1bc1beb06 Author: Leona Yoda AuthorDate: Tue Sep 14 09:16:05 2021 +0900 [SPARK-36739][DOCS][PYTHON] Add apache license headers to makefiles ### What changes were proposed in this pull request? Add apache license headers to makefiles of PySpark documents. ### Why are the changes needed? Makefiles of PySpark documentations do not have apache license headers, while the other files have. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? `make html` Closes #33979 from yoda-mon/add-license-header-makefiles. Authored-by: Leona Yoda Signed-off-by: Hyukjin Kwon (cherry picked from commit a440025f08e374df4799c8749970bac56b38b7f5) Signed-off-by: Hyukjin Kwon --- python/docs/Makefile | 15 +++ python/docs/make.bat | 19 ++- python/docs/make2.bat | 23 --- 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/python/docs/Makefile b/python/docs/Makefile index 86bce0f..a0275b9 100644 --- a/python/docs/Makefile +++ b/python/docs/Makefile @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Minimal makefile for Sphinx documentation # You can set these variables from the command line. diff --git a/python/docs/make.bat b/python/docs/make.bat index cc29acd..ad72e8f 100644 --- a/python/docs/make.bat +++ b/python/docs/make.bat @@ -1,4 +1,21 @@ -@ECHO OFF +@echo off + +rem +rem Licensed to the Apache Software Foundation (ASF) under one or more +rem contributor license agreements. See the NOTICE file distributed with +rem this work for additional information regarding copyright ownership. +rem The ASF licenses this file to You under the Apache License, Version 2.0 +rem (the "License"); you may not use this file except in compliance with +rem the License. You may obtain a copy of the License at +rem +remhttp://www.apache.org/licenses/LICENSE-2.0 +rem +rem Unless required by applicable law or agreed to in writing, software +rem distributed under the License is distributed on an "AS IS" BASIS, +rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +rem See the License for the specific language governing permissions and +rem limitations under the License. +rem rem This is the entry point for running Sphinx documentation. To avoid polluting the rem environment, it just launches a new cmd to do the real work. 
diff --git a/python/docs/make2.bat b/python/docs/make2.bat index 26364c1..29321bb 100644 --- a/python/docs/make2.bat +++ b/python/docs/make2.bat @@ -1,6 +1,23 @@ -@ECHO OFF - -REM Command file for Sphinx documentation +@echo off + +rem +rem Licensed to the Apache Software Foundation (ASF) under one or more +rem contributor license agreements. See the NOTICE file distributed with +rem this work for additional information regarding copyright ownership. +rem The ASF licenses this file to You under the Apache License, Version 2.0 +rem (the "License"); you may not use this file except in compliance with +rem the License. You may obtain a copy of the License at +rem +remhttp://www.apache.org/licenses/LICENSE-2.0 +rem +rem Unless required by applicable law or agreed to in writing, software +rem distributed under the License is distributed on an "AS IS" BASIS, +rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +rem See the License for the specific language governing permissions and +rem limitations under the License. +rem + +rem Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache
[spark] branch master updated (f8657d1 -> a440025)
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

 from f8657d1  [SPARK-36653][PYTHON] Implement Series.__xor__ and Series.__rxor__
  add a440025  [SPARK-36739][DOCS][PYTHON] Add apache license headers to makefiles

No new revisions were added by this update.

Summary of changes:
 python/docs/Makefile  | 15 +++
 python/docs/make.bat  | 19 ++-
 python/docs/make2.bat | 23 ---
 3 files changed, 53 insertions(+), 4 deletions(-)

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-36653][PYTHON] Implement Series.__xor__ and Series.__rxor__
This is an automated email from the ASF dual-hosted git repository. ueshin pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new f8657d1 [SPARK-36653][PYTHON] Implement Series.__xor__ and Series.__rxor__ f8657d1 is described below commit f8657d192413160c304ad342184661314f5472f6 Author: dgd-contributor AuthorDate: Mon Sep 13 15:09:22 2021 -0700 [SPARK-36653][PYTHON] Implement Series.__xor__ and Series.__rxor__ ### What changes were proposed in this pull request? Implement Series.\_\_xor__ and Series.\_\_rxor__ ### Why are the changes needed? Follow pandas ### Does this PR introduce _any_ user-facing change? Yes, user can use ``` python psdf = ps.DataFrame([[11, 11], [1, 2]]) psdf[0] ^ psdf[1] ``` ### How was this patch tested? unit tests Closes #33911 from dgd-contributor/SPARK-36653_Implement_Series._xor_. Authored-by: dgd-contributor Signed-off-by: Takuya UESHIN --- python/pyspark/pandas/base.py | 6 +++ python/pyspark/pandas/data_type_ops/base.py| 27 ++ python/pyspark/pandas/data_type_ops/boolean_ops.py | 38 ++ python/pyspark/pandas/data_type_ops/num_ops.py | 30 +++ python/pyspark/pandas/indexes/base.py | 3 ++ .../pandas/tests/data_type_ops/test_boolean_ops.py | 47 ++ .../pandas/tests/data_type_ops/test_num_ops.py | 58 ++ .../pandas/tests/data_type_ops/testing_utils.py| 8 +++ 8 files changed, 217 insertions(+) diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py index 58f6c19..533460c 100644 --- a/python/pyspark/pandas/base.py +++ b/python/pyspark/pandas/base.py @@ -428,6 +428,12 @@ class IndexOpsMixin(object, metaclass=ABCMeta): def __ror__(self, other: Any) -> SeriesOrIndex: return self._dtype_op.ror(self, other) +def __xor__(self, other: Any) -> SeriesOrIndex: +return self._dtype_op.xor(self, other) + +def __rxor__(self, other: Any) -> SeriesOrIndex: +return self._dtype_op.rxor(self, other) + def __len__(self) -> int: return len(self._psdf) diff --git a/python/pyspark/pandas/data_type_ops/base.py b/python/pyspark/pandas/data_type_ops/base.py index e6261c3..7900432 100644 --- a/python/pyspark/pandas/data_type_ops/base.py +++ b/python/pyspark/pandas/data_type_ops/base.py @@ -195,6 +195,26 @@ def _sanitize_list_like(operand: Any) -> None: raise TypeError("The operation can not be applied to %s." % type(operand).__name__) +def _is_valid_for_logical_operator(right: Any) -> bool: +from pyspark.pandas.base import IndexOpsMixin + +return isinstance(right, (int, bool)) or ( +isinstance(right, IndexOpsMixin) +and ( +isinstance(right.spark.data_type, BooleanType) +or isinstance(right.spark.data_type, IntegralType) +) +) + + +def _is_boolean_type(right: Any) -> bool: +from pyspark.pandas.base import IndexOpsMixin + +return isinstance(right, bool) or ( +isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, BooleanType) +) + + class DataTypeOps(object, metaclass=ABCMeta): """The base class for binary operations of pandas-on-Spark objects (of different data types).""" @@ -319,6 +339,9 @@ class DataTypeOps(object, metaclass=ABCMeta): def __and__(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: raise TypeError("Bitwise and can not be applied to %s." % self.pretty_name) +def xor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: +raise TypeError("Bitwise xor can not be applied to %s." % self.pretty_name) + def __or__(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: raise TypeError("Bitwise or can not be applied to %s." 
% self.pretty_name) @@ -326,6 +349,10 @@ class DataTypeOps(object, metaclass=ABCMeta): _sanitize_list_like(right) return left.__and__(right) +def rxor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: +_sanitize_list_like(right) +return left ^ right + def ror(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: _sanitize_list_like(right) return left.__or__(right) diff --git a/python/pyspark/pandas/data_type_ops/boolean_ops.py b/python/pyspark/pandas/data_type_ops/boolean_ops.py index cb77945..5ca5aa2 100644 --- a/python/pyspark/pandas/data_type_ops/boolean_ops.py +++ b/python/pyspark/pandas/data_type_ops/boolean_ops.py @@ -31,6 +31,8 @@ from pyspark.pandas.data_type_ops.base import ( _as_categorical_type, _as_other_type, _sanitize_list_like, +_is_valid_for_logical_operator, +_is_boolean_type, ) from pyspark.pandas.spark import functions as SF from pyspark.pandas.typedef.typehints import as_spark_type,
[spark] branch branch-3.2 updated: [SPARK-36705][SHUFFLE] Disable push based shuffle when IO encryption is enabled or serializer is not relocatable
This is an automated email from the ASF dual-hosted git repository. mridulm80 pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 79e148e [SPARK-36705][SHUFFLE] Disable push based shuffle when IO encryption is enabled or serializer is not relocatable 79e148e is described below commit 79e148ee934404b2f3a748847dc57f13b05dbc87 Author: Minchu Yang AuthorDate: Mon Sep 13 16:14:35 2021 -0500 [SPARK-36705][SHUFFLE] Disable push based shuffle when IO encryption is enabled or serializer is not relocatable ### What changes were proposed in this pull request? Disable push-based shuffle when IO encryption is enabled or serializer does not support relocation of serialized objects. ### Why are the changes needed? Push based shuffle is not compatible with IO encryption or non-relocatable serialization. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Added some tests to check whether push-based shuffle can be disabled successfully when IO encryption is enabled or a serializer that does not support relocation of serialized object is used. Closes #33976 from rmcyang/SPARK-36705. Authored-by: Minchu Yang Signed-off-by: Mridul Muralidharan gmail.com> (cherry picked from commit 999473b1a5bad4ae2ae345df8abf018100c9d918) Signed-off-by: Mridul Muralidharan --- .../main/scala/org/apache/spark/util/Utils.scala | 30 +- .../scala/org/apache/spark/util/UtilsSuite.scala | 11 ++-- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index b130789..a112214 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -70,7 +70,7 @@ import org.apache.spark.internal.config.UI._ import org.apache.spark.internal.config.Worker._ import org.apache.spark.launcher.SparkLauncher import org.apache.spark.network.util.JavaUtils -import org.apache.spark.serializer.{DeserializationStream, SerializationStream, SerializerInstance} +import org.apache.spark.serializer.{DeserializationStream, SerializationStream, Serializer, SerializerInstance} import org.apache.spark.status.api.v1.{StackTrace, ThreadStackTrace} import org.apache.spark.util.io.ChunkedByteBufferOutputStream @@ -2591,14 +2591,30 @@ private[spark] object Utils extends Logging { } /** - * Push based shuffle can only be enabled when the application is submitted - * to run in YARN mode, with external shuffle service enabled + * Push based shuffle can only be enabled when below conditions are met: + * - the application is submitted to run in YARN mode + * - external shuffle service enabled + * - IO encryption disabled + * - serializer(such as KryoSerializer) supports relocation of serialized objects */ def isPushBasedShuffleEnabled(conf: SparkConf): Boolean = { -conf.get(PUSH_BASED_SHUFFLE_ENABLED) && - (conf.get(IS_TESTING).getOrElse(false) || -(conf.get(SHUFFLE_SERVICE_ENABLED) && - conf.get(SparkLauncher.SPARK_MASTER, null) == "yarn")) +val serializer = Utils.classForName(conf.get(SERIALIZER)).getConstructor(classOf[SparkConf]) + .newInstance(conf).asInstanceOf[Serializer] +val canDoPushBasedShuffle = + conf.get(PUSH_BASED_SHUFFLE_ENABLED) && +(conf.get(IS_TESTING).getOrElse(false) || + (conf.get(SHUFFLE_SERVICE_ENABLED) && +conf.get(SparkLauncher.SPARK_MASTER, null) == "yarn" && +// TODO: [SPARK-36744] needs to support IO 
encryption for push-based shuffle +!conf.get(IO_ENCRYPTION_ENABLED) && +serializer.supportsRelocationOfSerializedObjects)) + +if (!canDoPushBasedShuffle) { + logWarning("Push-based shuffle can only be enabled when the application is submitted" + +"to run in YARN mode, with external shuffle service enabled, IO encryption disabled, and" + +"relocation of serialized objects supported.") +} +canDoPushBasedShuffle } /** diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 095dbef..de8f4ce 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -1447,10 +1447,17 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { assert(Utils.isPushBasedShuffleEnabled(conf) === false) conf.set(SHUFFLE_SERVICE_ENABLED, true) conf.set(SparkLauncher.SPARK_MASTER, "yarn") -conf.set("spark.yarn.maxAttempts", "1") +conf.set("spark.yarn.maxAppAttempts", "1") +conf.set(SERIALIZER,
[spark] branch master updated: [SPARK-36705][SHUFFLE] Disable push based shuffle when IO encryption is enabled or serializer is not relocatable
This is an automated email from the ASF dual-hosted git repository. mridulm80 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 999473b [SPARK-36705][SHUFFLE] Disable push based shuffle when IO encryption is enabled or serializer is not relocatable 999473b is described below commit 999473b1a5bad4ae2ae345df8abf018100c9d918 Author: Minchu Yang AuthorDate: Mon Sep 13 16:14:35 2021 -0500 [SPARK-36705][SHUFFLE] Disable push based shuffle when IO encryption is enabled or serializer is not relocatable ### What changes were proposed in this pull request? Disable push-based shuffle when IO encryption is enabled or serializer does not support relocation of serialized objects. ### Why are the changes needed? Push based shuffle is not compatible with IO encryption or non-relocatable serialization. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Added some tests to check whether push-based shuffle can be disabled successfully when IO encryption is enabled or a serializer that does not support relocation of serialized object is used. Closes #33976 from rmcyang/SPARK-36705. Authored-by: Minchu Yang Signed-off-by: Mridul Muralidharan gmail.com> --- .../main/scala/org/apache/spark/util/Utils.scala | 30 +- .../scala/org/apache/spark/util/UtilsSuite.scala | 11 ++-- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 5bbb4790..bbff56c 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -70,7 +70,7 @@ import org.apache.spark.internal.config.UI._ import org.apache.spark.internal.config.Worker._ import org.apache.spark.launcher.SparkLauncher import org.apache.spark.network.util.JavaUtils -import org.apache.spark.serializer.{DeserializationStream, SerializationStream, SerializerInstance} +import org.apache.spark.serializer.{DeserializationStream, SerializationStream, Serializer, SerializerInstance} import org.apache.spark.status.api.v1.{StackTrace, ThreadStackTrace} import org.apache.spark.util.io.ChunkedByteBufferOutputStream @@ -2597,14 +2597,30 @@ private[spark] object Utils extends Logging { } /** - * Push based shuffle can only be enabled when the application is submitted - * to run in YARN mode, with external shuffle service enabled + * Push based shuffle can only be enabled when below conditions are met: + * - the application is submitted to run in YARN mode + * - external shuffle service enabled + * - IO encryption disabled + * - serializer(such as KryoSerializer) supports relocation of serialized objects */ def isPushBasedShuffleEnabled(conf: SparkConf): Boolean = { -conf.get(PUSH_BASED_SHUFFLE_ENABLED) && - (conf.get(IS_TESTING).getOrElse(false) || -(conf.get(SHUFFLE_SERVICE_ENABLED) && - conf.get(SparkLauncher.SPARK_MASTER, null) == "yarn")) +val serializer = Utils.classForName(conf.get(SERIALIZER)).getConstructor(classOf[SparkConf]) + .newInstance(conf).asInstanceOf[Serializer] +val canDoPushBasedShuffle = + conf.get(PUSH_BASED_SHUFFLE_ENABLED) && +(conf.get(IS_TESTING).getOrElse(false) || + (conf.get(SHUFFLE_SERVICE_ENABLED) && +conf.get(SparkLauncher.SPARK_MASTER, null) == "yarn" && +// TODO: [SPARK-36744] needs to support IO encryption for push-based shuffle +!conf.get(IO_ENCRYPTION_ENABLED) && 
+serializer.supportsRelocationOfSerializedObjects)) + +if (!canDoPushBasedShuffle) { + logWarning("Push-based shuffle can only be enabled when the application is submitted" + +"to run in YARN mode, with external shuffle service enabled, IO encryption disabled, and" + +"relocation of serialized objects supported.") +} +canDoPushBasedShuffle } /** diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index c1b7b5f..a4df5cd 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -1509,10 +1509,17 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { assert(Utils.isPushBasedShuffleEnabled(conf) === false) conf.set(SHUFFLE_SERVICE_ENABLED, true) conf.set(SparkLauncher.SPARK_MASTER, "yarn") -conf.set("spark.yarn.maxAttempts", "1") +conf.set("spark.yarn.maxAppAttempts", "1") +conf.set(SERIALIZER, "org.apache.spark.serializer.KryoSerializer") assert(Utils.isPushBasedShuffleEnabled(conf) === true) -conf.set(
[spark] branch branch-3.2 updated: [SPARK-36712][BUILD] Make scala-parallel-collections in 2.13 POM a direct dependency (not in maven profile)
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 2e75837 [SPARK-36712][BUILD] Make scala-parallel-collections in 2.13 POM a direct dependency (not in maven profile) 2e75837 is described below commit 2e7583799ebd2cecabdd7bd0271ad129f852c569 Author: Lukas Rytz AuthorDate: Mon Sep 13 11:06:50 2021 -0500 [SPARK-36712][BUILD] Make scala-parallel-collections in 2.13 POM a direct dependency (not in maven profile) As [reported on `devspark.apache.org`](https://lists.apache.org/thread.html/r84cff66217de438f1389899e6d6891b573780159cd45463acf3657aa%40%3Cdev.spark.apache.org%3E), the published POMs when building with Scala 2.13 have the `scala-parallel-collections` dependency only in the `scala-2.13` profile of the pom. ### What changes were proposed in this pull request? This PR suggests to work around this by un-commenting the `scala-parallel-collections` dependency when switching to 2.13 using the the `change-scala-version.sh` script. I included an upgrade to scala-parallel-collections version 1.0.3, the changes compared to 0.2.0 are minor. - removed OSGi metadata - renamed some internal inner classes - added `Automatic-Module-Name` ### Why are the changes needed? According to the posts, this solves issues for developers that write unit tests for their applications. Stephen Coy suggested to use the https://www.mojohaus.org/flatten-maven-plugin. While this sounds like a more principled solution, it is possibly too risky to do at this specific point in time? ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Locally Closes #33948 from lrytz/parCollDep. Authored-by: Lukas Rytz Signed-off-by: Sean Owen (cherry picked from commit 1a62e6a2c119df707f15101b03ecff0c3dee62f5) Signed-off-by: Sean Owen --- core/pom.xml| 15 ++- dev/change-scala-version.sh | 12 external/avro/pom.xml | 17 ++--- external/kafka-0-10-sql/pom.xml | 17 ++--- external/kafka-0-10/pom.xml | 18 ++ mllib/pom.xml | 18 ++ pom.xml | 16 +++- sql/catalyst/pom.xml| 18 ++ sql/core/pom.xml| 15 ++- sql/hive-thriftserver/pom.xml | 17 ++--- sql/hive/pom.xml| 15 ++- streaming/pom.xml | 17 ++--- 12 files changed, 75 insertions(+), 120 deletions(-) diff --git a/core/pom.xml b/core/pom.xml index d2b4616..2229a95 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -35,6 +35,12 @@ + org.apache.avro avro @@ -639,15 +645,6 @@ - - scala-2.13 - - - org.scala-lang.modules - scala-parallel-collections_${scala.binary.version} - - - diff --git a/dev/change-scala-version.sh b/dev/change-scala-version.sh index 48b7f64..e17a224 100755 --- a/dev/change-scala-version.sh +++ b/dev/change-scala-version.sh @@ -54,11 +54,15 @@ sed_i() { sed -e "$1" "$2" > "$2.tmp" && mv "$2.tmp" "$2" } -export -f sed_i - BASEDIR=$(dirname $0)/.. -find "$BASEDIR" -name 'pom.xml' -not -path '*target*' -print \ - -exec bash -c "sed_i 's/\(artifactId.*\)_'$FROM_VERSION'/\1_'$TO_VERSION'/g' {}" \; +for f in $(find "$BASEDIR" -name 'pom.xml' -not -path '*target*'); do + echo $f + sed_i 's/\(artifactId.*\)_'$FROM_VERSION'/\1_'$TO_VERSION'/g' $f + sed_i 's/^\([[:space:]]*\)\(\)/\1\2/' $f + sed_i 's/^\([[:space:]]*\)/\1\)/\1-->\2/' $f +done # dependency:get is workaround for SPARK-34762 to download the JAR file of commons-cli. # Without this, build with Scala 2.13 using SBT will fail because the help plugin used below downloads only the POM file. 
diff --git a/external/avro/pom.xml b/external/avro/pom.xml index 92b8b73..6dbfc68 100644 --- a/external/avro/pom.xml +++ b/external/avro/pom.xml @@ -70,22 +70,17 @@ org.apache.spark spark-tags_${scala.binary.version} + org.tukaani xz - - - scala-2.13 - - - org.scala-lang.modules - scala-parallel-collections_${scala.binary.version} - - - - target/scala-${scala.binary.version}/classes target/scala-${scala.binary.version}/test-classes diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index b194592..a3988db 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -74,6 +74,12 @@ test-jar test + org
[spark] branch master updated: [SPARK-36712][BUILD] Make scala-parallel-collections in 2.13 POM a direct dependency (not in maven profile)
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 1a62e6a [SPARK-36712][BUILD] Make scala-parallel-collections in 2.13 POM a direct dependency (not in maven profile) 1a62e6a is described below commit 1a62e6a2c119df707f15101b03ecff0c3dee62f5 Author: Lukas Rytz AuthorDate: Mon Sep 13 11:06:50 2021 -0500 [SPARK-36712][BUILD] Make scala-parallel-collections in 2.13 POM a direct dependency (not in maven profile) As [reported on `devspark.apache.org`](https://lists.apache.org/thread.html/r84cff66217de438f1389899e6d6891b573780159cd45463acf3657aa%40%3Cdev.spark.apache.org%3E), the published POMs when building with Scala 2.13 have the `scala-parallel-collections` dependency only in the `scala-2.13` profile of the pom. ### What changes were proposed in this pull request? This PR suggests to work around this by un-commenting the `scala-parallel-collections` dependency when switching to 2.13 using the the `change-scala-version.sh` script. I included an upgrade to scala-parallel-collections version 1.0.3, the changes compared to 0.2.0 are minor. - removed OSGi metadata - renamed some internal inner classes - added `Automatic-Module-Name` ### Why are the changes needed? According to the posts, this solves issues for developers that write unit tests for their applications. Stephen Coy suggested to use the https://www.mojohaus.org/flatten-maven-plugin. While this sounds like a more principled solution, it is possibly too risky to do at this specific point in time? ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Locally Closes #33948 from lrytz/parCollDep. Authored-by: Lukas Rytz Signed-off-by: Sean Owen --- core/pom.xml| 15 ++- dev/change-scala-version.sh | 12 external/avro/pom.xml | 17 ++--- external/kafka-0-10-sql/pom.xml | 17 ++--- external/kafka-0-10/pom.xml | 18 ++ mllib/pom.xml | 18 ++ pom.xml | 16 +++- sql/catalyst/pom.xml| 18 ++ sql/core/pom.xml| 15 ++- sql/hive-thriftserver/pom.xml | 17 ++--- sql/hive/pom.xml| 15 ++- streaming/pom.xml | 17 ++--- 12 files changed, 75 insertions(+), 120 deletions(-) diff --git a/core/pom.xml b/core/pom.xml index be44964..dbde22f 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -35,6 +35,12 @@ + org.apache.avro avro @@ -639,15 +645,6 @@ - - scala-2.13 - - - org.scala-lang.modules - scala-parallel-collections_${scala.binary.version} - - - diff --git a/dev/change-scala-version.sh b/dev/change-scala-version.sh index 48b7f64..e17a224 100755 --- a/dev/change-scala-version.sh +++ b/dev/change-scala-version.sh @@ -54,11 +54,15 @@ sed_i() { sed -e "$1" "$2" > "$2.tmp" && mv "$2.tmp" "$2" } -export -f sed_i - BASEDIR=$(dirname $0)/.. -find "$BASEDIR" -name 'pom.xml' -not -path '*target*' -print \ - -exec bash -c "sed_i 's/\(artifactId.*\)_'$FROM_VERSION'/\1_'$TO_VERSION'/g' {}" \; +for f in $(find "$BASEDIR" -name 'pom.xml' -not -path '*target*'); do + echo $f + sed_i 's/\(artifactId.*\)_'$FROM_VERSION'/\1_'$TO_VERSION'/g' $f + sed_i 's/^\([[:space:]]*\)\(\)/\1\2/' $f + sed_i 's/^\([[:space:]]*\)/\1\)/\1-->\2/' $f +done # dependency:get is workaround for SPARK-34762 to download the JAR file of commons-cli. # Without this, build with Scala 2.13 using SBT will fail because the help plugin used below downloads only the POM file. 
diff --git a/external/avro/pom.xml b/external/avro/pom.xml index d9d3583..7e414be 100644 --- a/external/avro/pom.xml +++ b/external/avro/pom.xml @@ -70,22 +70,17 @@ org.apache.spark spark-tags_${scala.binary.version} + org.tukaani xz - - - scala-2.13 - - - org.scala-lang.modules - scala-parallel-collections_${scala.binary.version} - - - - target/scala-${scala.binary.version}/classes target/scala-${scala.binary.version}/test-classes diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 8d505cf..7bedcee 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -74,6 +74,12 @@ test-jar test + org.apache.kafka kafka-clients @@ -169,17 +175,6 @@ - - - scala-2.13 - -
[spark] branch master updated: [SPARK-36736][SQL] Support ILIKE (ALL | ANY | SOME) - case insensitive LIKE
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new bd62ad9 [SPARK-36736][SQL] Support ILIKE (ALL | ANY | SOME) - case insensitive LIKE bd62ad9 is described below commit bd62ad99823e8d5aacbfbf09d9f605dc9ee9f1e9 Author: Max Gekk AuthorDate: Mon Sep 13 22:51:49 2021 +0800 [SPARK-36736][SQL] Support ILIKE (ALL | ANY | SOME) - case insensitive LIKE ### What changes were proposed in this pull request? In the PR, I propose to support a case-insensitive variant of the `LIKE (ALL | ANY | SOME)` expression - `ILIKE`. In this way, Spark's users can match strings to single pattern in the case-insensitive manner. For example: ```sql spark-sql> create table ilike_example(subject varchar(20)); spark-sql> insert into ilike_example values > ('jane doe'), > ('Jane Doe'), > ('JANE DOE'), > ('John Doe'), > ('John Smith'); spark-sql> select * > from ilike_example > where subject ilike any ('jane%', '%SMITH') > order by subject; JANE DOE Jane Doe John Smith jane doe ``` The syntax of `ILIKE` is similar to `LIKE`: ``` str NOT? ILIKE (ANY | SOME | ALL) (pattern+) ``` ### Why are the changes needed? 1. To improve user experience with Spark SQL. No need to use `lower(col_name)` in where clauses. 2. To make migration from other popular DMBSs to Spark SQL easier. DBMSs below support `ilike` in SQL: - [Snowflake](https://docs.snowflake.com/en/sql-reference/functions/ilike.html#ilike) - [PostgreSQL](https://www.postgresql.org/docs/12/functions-matching.html) - [CockroachDB](https://www.cockroachlabs.com/docs/stable/functions-and-operators.html) ### Does this PR introduce _any_ user-facing change? No, it doesn't. The PR **extends** existing APIs. ### How was this patch tested? 1. By running of expression examples via: ``` $ build/sbt "sql/test:testOnly org.apache.spark.sql.expressions.ExpressionInfoSuite" ``` 2. Added new test to test parsing of `ILIKE`: ``` $ build/sbt "test:testOnly *.ExpressionParserSuite" ``` 3. Via existing test suites: ``` $ build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite -- -z ilike-any.sql" $ build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite -- -z ilike-all.sql" ``` Closes #33966 from MaxGekk/ilike-any. Authored-by: Max Gekk Signed-off-by: Wenchen Fan --- .../apache/spark/sql/catalyst/parser/SqlBase.g4| 2 +- .../spark/sql/catalyst/parser/AstBuilder.scala | 37 -- .../catalyst/parser/ExpressionParserSuite.scala| 15 +++ .../test/resources/sql-tests/inputs/ilike-all.sql | 41 ++ .../test/resources/sql-tests/inputs/ilike-any.sql | 41 ++ .../test/resources/sql-tests/inputs/like-all.sql | 2 +- .../resources/sql-tests/results/ilike-all.sql.out | 140 .../resources/sql-tests/results/ilike-any.sql.out | 146 + .../resources/sql-tests/results/like-all.sql.out | 4 +- 9 files changed, 414 insertions(+), 14 deletions(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 3cceda3..bd9f923 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -797,7 +797,7 @@ predicate | NOT? kind=IN '(' expression (',' expression)* ')' | NOT? kind=IN '(' query ')' | NOT? kind=RLIKE pattern=valueExpression -| NOT? 
kind=LIKE quantifier=(ANY | SOME | ALL) ('('')' | '(' expression (',' expression)* ')') +| NOT? kind=(LIKE | ILIKE) quantifier=(ANY | SOME | ALL) ('('')' | '(' expression (',' expression)* ')') | NOT? kind=(LIKE | ILIKE) pattern=valueExpression (ESCAPE escapeChar=STRING)? | IS NOT? kind=NULL | IS NOT? kind=(TRUE | FALSE | UNKNOWN) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 53e0de4..1b12994 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1557,7 +1557,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg * Add a predicate to the given expression. Supported expressions are: * - (NOT) BETWEEN * - (NOT) IN - * - (NOT) LIKE (ANY | SOME | ALL) + * - (NOT) (LIKE | ILIKE) (ANY | SOME | ALL)
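To complement the `ILIKE ANY` example in the commit message, the same table can be queried with `ILIKE ALL`, which keeps a row only if every pattern matches case-insensitively. A small illustrative query, assuming an active SparkSession `spark` and the `ilike_example` table created above:

```scala
spark.sql(
  """SELECT *
    |FROM ilike_example
    |WHERE subject ILIKE ALL ('jane%', '%DOE')
    |ORDER BY subject""".stripMargin
).show()
// Expected rows: JANE DOE, Jane Doe, jane doe (John Doe and John Smith do not match 'jane%').
```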
[spark] branch master updated: [SPARK-36724][SQL] Support timestamp_ntz as a type of time column for SessionWindow
This is an automated email from the ASF dual-hosted git repository. gengliang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new e858cd5 [SPARK-36724][SQL] Support timestamp_ntz as a type of time column for SessionWindow e858cd5 is described below commit e858cd568a74123f7fd8fe4c3d2917a7e5bbb685 Author: Kousuke Saruta AuthorDate: Mon Sep 13 21:47:43 2021 +0800 [SPARK-36724][SQL] Support timestamp_ntz as a type of time column for SessionWindow ### What changes were proposed in this pull request? This PR proposes to support `timestamp_ntz` as a type of time column for `SessionWIndow` like `TimeWindow` does. ### Why are the changes needed? For better usability. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New test. Closes #33965 from sarutak/session-window-ntz. Authored-by: Kousuke Saruta Signed-off-by: Gengliang Wang --- .../spark/sql/catalyst/analysis/Analyzer.scala | 9 +++--- .../sql/catalyst/expressions/SessionWindow.scala | 6 ++-- .../spark/sql/DataFrameSessionWindowingSuite.scala | 33 -- 3 files changed, 39 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 340b859..0f90159 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -3999,7 +3999,8 @@ object SessionWindowing extends Rule[LogicalPlan] { val sessionAttr = AttributeReference( SESSION_COL_NAME, session.dataType, metadata = newMetadata)() -val sessionStart = PreciseTimestampConversion(session.timeColumn, TimestampType, LongType) +val sessionStart = + PreciseTimestampConversion(session.timeColumn, session.timeColumn.dataType, LongType) val gapDuration = session.gapDuration match { case expr if Cast.canCast(expr.dataType, CalendarIntervalType) => Cast(expr, CalendarIntervalType) @@ -4007,13 +4008,13 @@ object SessionWindowing extends Rule[LogicalPlan] { throw QueryCompilationErrors.sessionWindowGapDurationDataTypeError(other.dataType) } val sessionEnd = PreciseTimestampConversion(session.timeColumn + gapDuration, - TimestampType, LongType) + session.timeColumn.dataType, LongType) val literalSessionStruct = CreateNamedStruct( Literal(SESSION_START) :: -PreciseTimestampConversion(sessionStart, LongType, TimestampType) :: +PreciseTimestampConversion(sessionStart, LongType, session.timeColumn.dataType) :: Literal(SESSION_END) :: -PreciseTimestampConversion(sessionEnd, LongType, TimestampType) :: +PreciseTimestampConversion(sessionEnd, LongType, session.timeColumn.dataType) :: Nil) val sessionStruct = Alias(literalSessionStruct, SESSION_COL_NAME)( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SessionWindow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SessionWindow.scala index 796ea27..77e8dfd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SessionWindow.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SessionWindow.scala @@ -69,10 +69,10 @@ case class SessionWindow(timeColumn: Expression, gapDuration: Expression) extend with NonSQLExpression { override def children: Seq[Expression] = Seq(timeColumn, gapDuration) - override def inputTypes: 
Seq[AbstractDataType] = Seq(TimestampType, AnyDataType) + override def inputTypes: Seq[AbstractDataType] = Seq(AnyTimestampType, AnyDataType) override def dataType: DataType = new StructType() -.add(StructField("start", TimestampType)) -.add(StructField("end", TimestampType)) +.add(StructField("start", timeColumn.dataType)) +.add(StructField("end", timeColumn.dataType)) // This expression is replaced in the analyzer. override lazy val resolved = false diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSessionWindowingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSessionWindowingSuite.scala index 7a0cd42..b3d2127 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSessionWindowingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSessionWindowingSuite.scala @@ -17,12 +17,15 @@ package org.apache.spark.sql +import java.time.LocalDateTime + import org.scalatest.BeforeAndAfterEach -import org.apache.spark.sql.catalyst.plans.l
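A short usage sketch of what this change enables: a `TIMESTAMP_NTZ` time column (created here from `java.time.LocalDateTime` values, as in the new test) passed to `session_window`. The data and gap duration are illustrative, and the snippet assumes a Spark build in which `LocalDateTime` maps to `TimestampNTZType`:

```scala
import java.time.LocalDateTime

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, count, session_window}

val spark = SparkSession.builder().master("local[*]").appName("ntz-session-window").getOrCreate()
import spark.implicits._

// LocalDateTime values yield a TIMESTAMP_NTZ column, which SessionWindow now accepts.
val events = Seq(
  (LocalDateTime.parse("2021-09-13T10:00:00"), "user1"),
  (LocalDateTime.parse("2021-09-13T10:02:00"), "user1"),
  (LocalDateTime.parse("2021-09-13T11:00:00"), "user1")
).toDF("time", "user")

events
  .groupBy(session_window(col("time"), "5 minutes"), col("user"))
  .agg(count("*").alias("events"))
  .show(truncate = false)
```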
[spark] branch branch-3.0 updated (6b804c7 -> 540937f)
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git.

 from 6b804c7  [SPARK-36636][CORE][TEST] LocalSparkCluster change to use tmp workdir in test to avoid directory name collision
  add 540937f  [SPARK-36738][SQL][DOC] Fixed the wrong documentation on Cot API

No new revisions were added by this update.

Summary of changes:
 .../org/apache/spark/sql/catalyst/expressions/mathExpressions.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.1 updated: [SPARK-36738][SQL][DOC] Fixed the wrong documentation on Cot API
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.1 by this push: new 2f0efbd [SPARK-36738][SQL][DOC] Fixed the wrong documentation on Cot API 2f0efbd is described below commit 2f0efbd30cafde9586f4989202f0a026bb914c2c Author: Yuto Akutsu AuthorDate: Mon Sep 13 21:51:29 2021 +0900 [SPARK-36738][SQL][DOC] Fixed the wrong documentation on Cot API Fixed wrong documentation on Cot API [Doc](https://spark.apache.org/docs/latest/api/sql/index.html#cot) says `1/java.lang.Math.cot` but it should be `1/java.lang.Math.tan`. No. Manual check. Closes #33978 from yutoacts/SPARK-36738. Authored-by: Yuto Akutsu Signed-off-by: Hyukjin Kwon (cherry picked from commit 3747cfdb402955cc19c9a383713b569fc010db70) Signed-off-by: Hyukjin Kwon --- .../org/apache/spark/sql/catalyst/expressions/mathExpressions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala index 931365f..7bfea0d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala @@ -660,7 +660,7 @@ case class Tan(child: Expression) extends UnaryMathExpression(math.tan, "TAN") @ExpressionDescription( usage = """ -_FUNC_(expr) - Returns the cotangent of `expr`, as if computed by `1/java.lang.Math._FUNC_`. +_FUNC_(expr) - Returns the cotangent of `expr`, as if computed by `1/java.lang.Math.tan`. """, arguments = """ Arguments: - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36738][SQL][DOC] Fixed the wrong documentation on Cot API
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new b043ee4 [SPARK-36738][SQL][DOC] Fixed the wrong documentation on Cot API b043ee4 is described below commit b043ee4de7ab59584e85191cacb2912bc866e6f9 Author: Yuto Akutsu AuthorDate: Mon Sep 13 21:51:29 2021 +0900 [SPARK-36738][SQL][DOC] Fixed the wrong documentation on Cot API ### What changes were proposed in this pull request? Fixed wrong documentation on Cot API ### Why are the changes needed? [Doc](https://spark.apache.org/docs/latest/api/sql/index.html#cot) says `1/java.lang.Math.cot` but it should be `1/java.lang.Math.tan`. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manual check. Closes #33978 from yutoacts/SPARK-36738. Authored-by: Yuto Akutsu Signed-off-by: Hyukjin Kwon (cherry picked from commit 3747cfdb402955cc19c9a383713b569fc010db70) Signed-off-by: Hyukjin Kwon --- .../org/apache/spark/sql/catalyst/expressions/mathExpressions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala index 2466940..d4b86d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala @@ -731,7 +731,7 @@ case class Tan(child: Expression) extends UnaryMathExpression(math.tan, "TAN") { @ExpressionDescription( usage = """ -_FUNC_(expr) - Returns the cotangent of `expr`, as if computed by `1/java.lang.Math._FUNC_`. +_FUNC_(expr) - Returns the cotangent of `expr`, as if computed by `1/java.lang.Math.tan`. """, arguments = """ Arguments: - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-36738][SQL][DOC] Fixed the wrong documentation on Cot API
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 3747cfd [SPARK-36738][SQL][DOC] Fixed the wrong documentation on Cot API 3747cfd is described below commit 3747cfdb402955cc19c9a383713b569fc010db70 Author: Yuto Akutsu AuthorDate: Mon Sep 13 21:51:29 2021 +0900 [SPARK-36738][SQL][DOC] Fixed the wrong documentation on Cot API ### What changes were proposed in this pull request? Fixed wrong documentation on Cot API ### Why are the changes needed? [Doc](https://spark.apache.org/docs/latest/api/sql/index.html#cot) says `1/java.lang.Math.cot` but it should be `1/java.lang.Math.tan`. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manual check. Closes #33978 from yutoacts/SPARK-36738. Authored-by: Yuto Akutsu Signed-off-by: Hyukjin Kwon --- .../org/apache/spark/sql/catalyst/expressions/mathExpressions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala index 2466940..d4b86d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala @@ -731,7 +731,7 @@ case class Tan(child: Expression) extends UnaryMathExpression(math.tan, "TAN") { @ExpressionDescription( usage = """ -_FUNC_(expr) - Returns the cotangent of `expr`, as if computed by `1/java.lang.Math._FUNC_`. +_FUNC_(expr) - Returns the cotangent of `expr`, as if computed by `1/java.lang.Math.tan`. """, arguments = """ Arguments: - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
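The change above is documentation-only: `Cot` is already evaluated as `1/tan`, and the usage string now says so instead of pointing at a nonexistent `java.lang.Math.cot`. A small, self-contained sketch that checks the documented identity against a local session (the session setup, app name, and tolerance are illustrative, not from the commit):
```
import org.apache.spark.sql.SparkSession

object CotDocSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("cot-doc-sketch")
      .getOrCreate()

    // java.lang.Math has no cot(); Spark computes the cotangent as 1 / tan(x),
    // which is exactly what the corrected usage string now documents.
    val sparkCot = spark.sql("SELECT COT(1.0)").head().getDouble(0)
    val reference = 1.0 / math.tan(1.0)

    // Tolerance is illustrative; both sides follow the same 1/tan computation.
    assert(math.abs(sparkCot - reference) < 1e-12)

    spark.stop()
  }
}
```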
[spark] branch master updated: [SPARK-33832][SQL] Support optimize skewed join even if introduce extra shuffle
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 4a6b2b9 [SPARK-33832][SQL] Support optimize skewed join even if introduce extra shuffle 4a6b2b9 is described below commit 4a6b2b9fc8b68d59857c5ee71e817b9b06db5ba8 Author: ulysses-you AuthorDate: Mon Sep 13 17:21:27 2021 +0800 [SPARK-33832][SQL] Support optimize skewed join even if introduce extra shuffle ### What changes were proposed in this pull request? - move the rule `OptimizeSkewedJoin` from the stage optimization phase to the stage preparation phase. - run the rule `EnsureRequirements` one more time after the `OptimizeSkewedJoin` rule in the stage preparation phase. - add `SkewJoinAwareCost` to support estimating skewed join cost - add a new config to decide whether to force-optimize skewed joins - in `OptimizeSkewedJoin`, we generate two physical plans, one with skew join optimization and one without. Then we use the cost evaluator w.r.t. the force-skew-join flag and pick the plan with the lower cost. ### Why are the changes needed? In general, a skewed join has more impact on performance than one more shuffle. It makes sense to force-optimize a skewed join even if that introduces an extra shuffle. A common case:
```
HashAggregate
  SortMergeJoin
    Sort
      Exchange
    Sort
      Exchange
```
and after this PR, the plan looks like:
```
HashAggregate
  Exchange
    SortMergeJoin (isSkew=true)
      Sort
        Exchange
      Sort
        Exchange
```
Note that the newly introduced shuffle can also be optimized by AQE. ### Does this PR introduce _any_ user-facing change? Yes, a new config. ### How was this patch tested? * Add new test * pass existing test `SPARK-30524: Do not optimize skew join if introduce additional shuffle` * pass existing test `SPARK-33551: Do not use custom shuffle reader for repartition` Closes #32816 from ulysses-you/support-extra-shuffle. Authored-by: ulysses-you Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/internal/SQLConf.scala| 7 ++ .../execution/adaptive/AdaptiveSparkPlanExec.scala | 31 +++ .../execution/adaptive/OptimizeSkewedJoin.scala| 25 -- .../sql/execution/adaptive/simpleCosting.scala | 48 +-- .../execution/exchange/EnsureRequirements.scala| 96 ++ .../adaptive/AdaptiveQueryExecSuite.scala | 68 +++ 6 files changed, 217 insertions(+), 58 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 8ba2b9f..9f71ecb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -666,6 +666,13 @@ object SQLConf { .booleanConf .createWithDefault(true) + val ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN = +buildConf("spark.sql.adaptive.forceOptimizeSkewedJoin") + .doc("When true, force enable OptimizeSkewedJoin even if it introduces extra shuffle.") + .version("3.3.0") + .booleanConf + .createWithDefault(false) + val ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS = buildConf("spark.sql.adaptive.customCostEvaluatorClass") .doc("The custom cost evaluator class to be used for adaptive execution.
If not being set," + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index bf810f3..13c9528 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -97,27 +97,36 @@ case class AdaptiveSparkPlanExec( AQEUtils.getRequiredDistribution(inputPlan) } + @transient private val costEvaluator = +conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match { + case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf) + case _ => SimpleCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN)) +} + // A list of physical plan rules to be applied before creation of query stages. The physical // plan should reach a final status of query stages (i.e., no more addition or removal of // Exchange nodes) after running these rules. - @transient private val queryStagePreparationRules: Seq[Rule[SparkPlan]] = Seq( -RemoveRedundantProjects, + @transient private val queryStagePreparationRules: Seq[Rule[SparkPlan]] = { // For case
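The diff in this commit introduces `spark.sql.adaptive.forceOptimizeSkewedJoin` (version 3.3.0, default `false`). A hedged, self-contained sketch of switching the flag on; only the config keys are taken from Spark itself, the toy data and names are illustrative, and whether skew splitting actually kicks in still depends on the existing `spark.sql.adaptive.skewJoin.*` thresholds:
```
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{count, lit, when}

object ForceSkewJoinSketch {
  def main(args: Array[String]): Unit = {
    // Only the last config key is new in this commit; the other two AQE flags
    // already exist. Master and app name are illustrative.
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("force-skew-join-sketch")
      .config("spark.sql.adaptive.enabled", "true")
      .config("spark.sql.adaptive.skewJoin.enabled", "true")
      .config("spark.sql.adaptive.forceOptimizeSkewedJoin", "true")
      .getOrCreate()
    import spark.implicits._

    // Toy stand-in for a skewed fact table: almost every row lands on key 0.
    // A dataset this small will not necessarily cross the skew-detection
    // thresholds; it only shows where the flag fits in.
    val facts = spark.range(0, 1000000)
      .select(when($"id" % 100 === 0, $"id").otherwise(lit(0L)).as("key"), $"id".as("value"))
    val dim = spark.range(0, 1000).select($"id".as("key"), lit("d").as("payload"))

    // The aggregate above the join requires a hash distribution on "key". With
    // the force flag on, OptimizeSkewedJoin may still split skewed partitions and
    // let AQE insert the extra Exchange below the aggregate, as the commit describes.
    facts.join(dim, "key")
      .groupBy("key")
      .agg(count("*").as("rows"))
      .collect()

    spark.stop()
  }
}
```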