[spark] branch master updated (2e3adad -> 71c73d5)
This is an automated email from the ASF dual-hosted git repository.

yamamuro pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

from 2e3adad  [SPARK-31061][SQL] Provide ability to alter the provider of a table
 add 71c73d5  [SPARK-30279][SQL] Support 32 or more grouping attributes for GROUPING_ID

No new revisions were added by this update.

Summary of changes:
 docs/sql-migration-guide.md | 3 ++
 .../spark/sql/catalyst/analysis/Analyzer.scala | 10 ++--
 .../spark/sql/catalyst/expressions/grouping.scala | 28 +++
 .../plans/logical/basicLogicalOperators.scala | 13 --
 .../org/apache/spark/sql/internal/SQLConf.scala | 9
 .../analysis/ResolveGroupingAnalyticsSuite.scala | 54 --
 .../sql-tests/results/group-analytics.sql.out | 8 ++--
 .../sql-tests/results/grouping_set.sql.out | 4 +-
 .../results/postgreSQL/groupingsets.sql.out | 2 +-
 .../results/udf/udf-group-analytics.sql.out | 8 ++--
 .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 31 +
 11 files changed, 117 insertions(+), 53 deletions(-)

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
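As a small, hedged illustration of the SPARK-30279 change above: it lifts the previous limit of fewer than 32 grouping attributes when computing GROUPING_ID. The sketch below (table and column names made up, shown with only three columns rather than 32) is the kind of query whose grouping_id() value is affected once the GROUP BY list grows past that limit.

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().master("local[*]").appName("grouping-id-sketch").getOrCreate()

    // Toy table; the SPARK-30279 change matters once the GROUP BY list reaches 32 columns.
    spark.range(12)
      .selectExpr("id % 2 AS a", "id % 3 AS b", "id AS v")
      .createOrReplaceTempView("t")

    spark.sql(
      """SELECT a, b, grouping_id() AS gid, sum(v) AS total
        |FROM t
        |GROUP BY a, b GROUPING SETS ((a, b), (a), ())
        |ORDER BY gid, a, b""".stripMargin)
      .show()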
[spark] branch branch-3.0 updated: [SPARK-31061][SQL] Provide ability to alter the provider of a table
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 35a8e1e [SPARK-31061][SQL] Provide ability to alter the provider of a table 35a8e1e is described below commit 35a8e1ed7bb2f9493742d7401e32eab4621fd2d0 Author: Burak Yavuz AuthorDate: Fri Mar 6 15:40:44 2020 +0800 [SPARK-31061][SQL] Provide ability to alter the provider of a table This PR adds functionality to HiveExternalCatalog to be able to change the provider of a table. This is useful for catalogs in Spark 3.0 to be able to use alterTable to change the provider of a table as part of an atomic REPLACE TABLE function. No Unit tests Closes #27822 from brkyvz/externalCat. Authored-by: Burak Yavuz Signed-off-by: Wenchen Fan (cherry picked from commit 2e3adadc6a53fe044b286a6a59529a94e7eeda6c) Signed-off-by: Wenchen Fan --- .../spark/sql/hive/HiveExternalCatalog.scala | 10 +- .../spark/sql/hive/HiveExternalCatalogSuite.scala | 40 ++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index ca292f6..be6d824 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -634,7 +634,15 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat k.startsWith(DATASOURCE_PREFIX) || k.startsWith(STATISTICS_PREFIX) || k.startsWith(CREATED_SPARK_VERSION) } - val newTableProps = propsFromOldTable ++ tableDefinition.properties + partitionProviderProp + val newFormatIfExists = tableDefinition.provider.flatMap { p => +if (DDLUtils.isDatasourceTable(tableDefinition)) { + Some(DATASOURCE_PROVIDER -> p) +} else { + None +} + } + val newTableProps = +propsFromOldTable ++ tableDefinition.properties + partitionProviderProp ++ newFormatIfExists // // Add old table's owner if we need to restore val owner = Option(tableDefinition.owner).filter(_.nonEmpty).getOrElse(oldTableDef.owner) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala index 0a88898..473a93b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.hive +import java.net.URI + import org.apache.hadoop.conf.Configuration import org.apache.spark.SparkConf @@ -178,4 +180,42 @@ class HiveExternalCatalogSuite extends ExternalCatalogSuite { assertThrows[QueryExecutionException](client.runSqlHive( "INSERT overwrite directory \"fs://localhost/tmp\" select 1 as a")) } + + test("SPARK-31061: alterTable should be able to change table provider") { +val catalog = newBasicCatalog() +val parquetTable = CatalogTable( + identifier = TableIdentifier("parq_tbl", Some("db1")), + tableType = CatalogTableType.MANAGED, + storage = storageFormat.copy(locationUri = Some(new URI("file:/some/path"))), + schema = new StructType().add("col1", "int").add("col2", "string"), + provider = Some("parquet")) +catalog.createTable(parquetTable, ignoreIfExists = false) + +val rawTable = externalCatalog.getTable("db1", "parq_tbl") +assert(rawTable.provider === 
Some("parquet")) + +val fooTable = parquetTable.copy(provider = Some("foo")) +catalog.alterTable(fooTable) +val alteredTable = externalCatalog.getTable("db1", "parq_tbl") +assert(alteredTable.provider === Some("foo")) + } + + test("SPARK-31061: alterTable should be able to change table provider from hive") { +val catalog = newBasicCatalog() +val hiveTable = CatalogTable( + identifier = TableIdentifier("parq_tbl", Some("db1")), + tableType = CatalogTableType.MANAGED, + storage = storageFormat, + schema = new StructType().add("col1", "int").add("col2", "string"), + provider = Some("hive")) +catalog.createTable(hiveTable, ignoreIfExists = false) + +val rawTable = externalCatalog.getTable("db1", "parq_tbl") +assert(rawTable.provider === Some("hive")) + +val fooTable = rawTable.copy(provider = Some("foo")) +catalog.alterTable(fooTable) +val alteredTable = externalCatalog.getTable("db1", "parq_tbl") +assert(alteredTable.provider === Some("foo")) + } } - To unsubs
[spark] branch master updated (6468d6f -> 2e3adad)
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

from 6468d6f  [SPARK-30776][ML] Support FValueSelector for continuous features and continuous labels
 add 2e3adad  [SPARK-31061][SQL] Provide ability to alter the provider of a table

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/hive/HiveExternalCatalog.scala | 10 +-
 .../spark/sql/hive/HiveExternalCatalogSuite.scala | 40 ++
 2 files changed, 49 insertions(+), 1 deletion(-)

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31045][SQL][FOLLOWUP][3.0] Fix build due to divergence between master and 3.0
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 9b48f33 [SPARK-31045][SQL][FOLLOWUP][3.0] Fix build due to divergence between master and 3.0 9b48f33 is described below commit 9b48f3358d3efb523715a5f258e5ed83e28692f6 Author: Jungtaek Lim (HeartSaVioR) AuthorDate: Thu Mar 5 21:31:08 2020 -0800 [SPARK-31045][SQL][FOLLOWUP][3.0] Fix build due to divergence between master and 3.0 ### What changes were proposed in this pull request? This patch fixes the build failure in `branch-3.0` due to cherry-picking SPARK-31045 to branch-3.0, as `.version()` is not available in `branch-3.0` yet. ### Why are the changes needed? The build is failing in `branch-3.0`. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Jenkins build will verify. Closes #27826 from HeartSaVioR/SPARK-31045-branch-3.0-FOLLOWUP. Authored-by: Jungtaek Lim (HeartSaVioR) Signed-off-by: Dongjoon Hyun --- sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index cd465bc..fdaf0ec 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -382,7 +382,6 @@ object SQLConf { .internal() .doc("Configures the log level for adaptive execution logging of plan changes. The value " + "can be 'trace', 'debug', 'info', 'warn', or 'error'. The default log level is 'debug'.") -.version("3.0.0") .stringConf .transform(_.toUpperCase(Locale.ROOT)) .checkValues(Set("TRACE", "DEBUG", "INFO", "WARN", "ERROR")) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (1426ad8 -> 6468d6f)
This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

from 1426ad8  [SPARK-23817][FOLLOWUP][TEST] Add OrcV2QuerySuite
 add 6468d6f  [SPARK-30776][ML] Support FValueSelector for continuous features and continuous labels

No new revisions were added by this update.

Summary of changes:
 .../apache/spark/ml/feature/ChiSqSelector.scala | 2 +-
 .../{ChiSqSelector.scala => FValueSelector.scala} | 290 -
 .../org/apache/spark/ml/stat/FValueTest.scala | 36 ++-
 .../apache/spark/ml/stat/SelectionTestResult.scala | 101 +++
 .../spark/ml/feature/FValueSelectorSuite.scala | 212 +++
 5 files changed, 512 insertions(+), 129 deletions(-)
 copy mllib/src/main/scala/org/apache/spark/ml/feature/{ChiSqSelector.scala => FValueSelector.scala} (56%)
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/stat/SelectionTestResult.scala
 create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/FValueSelectorSuite.scala

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
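The diffstat shows FValueSelector.scala was created as a copy of ChiSqSelector.scala (56% similar), so its Estimator API presumably mirrors ChiSqSelector's. The sketch below rests entirely on that assumption: the setter names, data, and column names are illustrative and are not confirmed by this commit email.

    // Sketch only: assumes FValueSelector mirrors ChiSqSelector's API.
    import org.apache.spark.ml.feature.FValueSelector
    import org.apache.spark.ml.linalg.Vectors
    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().master("local[*]").appName("fvalue-selector-sketch").getOrCreate()

    val df = spark.createDataFrame(Seq(
      (Vectors.dense(1.0, 0.1, 3.2), 2.3),
      (Vectors.dense(0.9, 0.3, 3.1), 2.1),
      (Vectors.dense(5.0, 0.2, 0.4), 9.8)
    )).toDF("features", "label")

    // Continuous features, continuous label: pick the single strongest feature by F-value.
    val selector = new FValueSelector()
      .setNumTopFeatures(1)
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setOutputCol("selected")

    selector.fit(df).transform(df).show(truncate = false)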
[spark] branch branch-3.0 updated (5375b40 -> 7c09c9f)
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git.

from 5375b40  [SPARK-31010][SQL][FOLLOW-UP] Deprecate untyped scala UDF
 add 7c09c9f  [SPARK-23817][FOLLOWUP][TEST] Add OrcV2QuerySuite

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/execution/datasources/orc/OrcQuerySuite.scala | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (587266f -> 1426ad8)
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

from 587266f  [SPARK-31010][SQL][FOLLOW-UP] Deprecate untyped scala UDF
 add 1426ad8  [SPARK-23817][FOLLOWUP][TEST] Add OrcV2QuerySuite

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/execution/datasources/orc/OrcQuerySuite.scala | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31010][SQL][FOLLOW-UP] Deprecate untyped scala UDF
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 5375b40 [SPARK-31010][SQL][FOLLOW-UP] Deprecate untyped scala UDF 5375b40 is described below commit 5375b40f37cefe6f18f9f943412b60c65fe2f844 Author: yi.wu AuthorDate: Fri Mar 6 13:00:04 2020 +0800 [SPARK-31010][SQL][FOLLOW-UP] Deprecate untyped scala UDF ### What changes were proposed in this pull request? Use scala annotation deprecate to deprecate untyped scala UDF. ### Why are the changes needed? After #27488, it's weird to see the untyped scala UDF will fail by default without deprecation. ### Does this PR introduce any user-facing change? Yes, user will see the warning: ``` :26: warning: method udf in object functions is deprecated (since 3.0.0): Untyped Scala UDF API is deprecated, please use typed Scala UDF API such as 'def udf[RT: TypeTag](f: Function0[RT]): UserDefinedFunction' instead. val myudf = udf(() => Math.random(), DoubleType) ^ ``` ### How was this patch tested? Tested manually. Closes #27794 from Ngone51/deprecate_untyped_scala_udf. Authored-by: yi.wu Signed-off-by: Wenchen Fan (cherry picked from commit 587266f887f85bfa2a73a77485ab4db522c6aca1) Signed-off-by: Wenchen Fan --- sql/core/src/main/scala/org/apache/spark/sql/functions.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index c6e8cf7..7a58957 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -4732,6 +4732,8 @@ object functions { * @group udf_funcs * @since 2.0.0 */ + @deprecated("Untyped Scala UDF API is deprecated, please use typed Scala UDF API such as " + +"'def udf[RT: TypeTag](f: Function0[RT]): UserDefinedFunction' instead.", "3.0.0") def udf(f: AnyRef, dataType: DataType): UserDefinedFunction = { if (!SQLConf.get.getConf(SQLConf.LEGACY_ALLOW_UNTYPED_SCALA_UDF)) { val errorMsg = "You're using untyped Scala UDF, which does not have the input type " + - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
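A minimal sketch of the migration the new warning asks for, assuming an existing SparkSession; the alias name is made up. Per the diff above, the untyped overload is rejected by default in 3.0 unless SQLConf.LEGACY_ALLOW_UNTYPED_SCALA_UDF is enabled, so it is shown here only as a comment.

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.udf

    val spark = SparkSession.builder().master("local[*]").appName("typed-udf-sketch").getOrCreate()

    // Deprecated (and rejected by default in 3.0 unless LEGACY_ALLOW_UNTYPED_SCALA_UDF is set):
    //   val untyped = udf(() => math.random, DoubleType)

    // Recommended: the typed overload, where the return type is inferred from the closure.
    val typed = udf(() => math.random)
    spark.range(3).select(typed().as("r")).show()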
[spark] branch master updated (59f1e76 -> 587266f)
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

from 59f1e76  [SPARK-31020][SPARK-31023][SPARK-31025][SPARK-31044][SQL] Support foldable args by `from_csv/json` and `schema_of_csv/json`
 add 587266f  [SPARK-31010][SQL][FOLLOW-UP] Deprecate untyped scala UDF

No new revisions were added by this update.

Summary of changes:
 sql/core/src/main/scala/org/apache/spark/sql/functions.scala | 2 ++
 1 file changed, 2 insertions(+)

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (afb84e9 -> 59f1e76)
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

from afb84e9  [SPARK-30886][SQL] Deprecate two-parameter TRIM/LTRIM/RTRIM functions
 add 59f1e76  [SPARK-31020][SPARK-31023][SPARK-31025][SPARK-31044][SQL] Support foldable args by `from_csv/json` and `schema_of_csv/json`

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/catalyst/expressions/ExprUtils.scala | 33 +-
 .../sql/catalyst/expressions/csvExpressions.scala | 12 +---
 .../sql/catalyst/expressions/jsonExpressions.scala | 12 +---
 .../sql-tests/results/csv-functions.sql.out | 6 ++--
 .../sql-tests/results/json-functions.sql.out | 6 ++--
 .../org/apache/spark/sql/CsvFunctionsSuite.scala | 26 +
 .../org/apache/spark/sql/JsonFunctionsSuite.scala | 23 ++-
 7 files changed, 83 insertions(+), 35 deletions(-)

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
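A hedged sketch of what "foldable args" buys, based only on the commit title above: the schema argument of from_json (and the input of schema_of_json) may now be any constant-foldable expression, not just a bare string literal. The JSON samples and column names below are made up.

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.{col, concat, from_json, lit, schema_of_json}

    val spark = SparkSession.builder().master("local[*]").appName("foldable-args-sketch").getOrCreate()

    // A constant-foldable expression (concat of literals), not a bare literal,
    // feeding schema_of_json; the inferred schema is then reused by from_json.
    val schema = schema_of_json(concat(lit("""{"id": 1,"""), lit(""" "tags": ["a"]}""")))

    spark.range(1)
      .select(lit("""{"id": 7, "tags": ["x"]}""").as("value"))
      .select(from_json(col("value"), schema).as("parsed"))
      .show(truncate = false)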
[spark] branch branch-3.0 updated: [SPARK-30886][SQL] Deprecate two-parameter TRIM/LTRIM/RTRIM functions
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 1535b2b [SPARK-30886][SQL] Deprecate two-parameter TRIM/LTRIM/RTRIM functions 1535b2b is described below commit 1535b2bb51782a89b271b1ebe53273ab610b390b Author: Dongjoon Hyun AuthorDate: Thu Mar 5 20:09:39 2020 -0800 [SPARK-30886][SQL] Deprecate two-parameter TRIM/LTRIM/RTRIM functions ### What changes were proposed in this pull request? This PR aims to show a deprecation warning on two-parameter TRIM/LTRIM/RTRIM function usages based on the community decision. - https://lists.apache.org/thread.html/r48b6c2596ab06206b7b7fd4bbafd4099dccd4e2cf9801aaa9034c418%40%3Cdev.spark.apache.org%3E ### Why are the changes needed? For backward compatibility, SPARK-28093 is reverted. However, from Apache Spark 3.0.0, we should give a safe guideline to use SQL syntax instead of the esoteric function signatures. ### Does this PR introduce any user-facing change? Yes. This shows a directional warning. ### How was this patch tested? Pass the Jenkins with a newly added test case. Closes #27643 from dongjoon-hyun/SPARK-30886. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun (cherry picked from commit afb84e9d378003c57cd01d21cdb1a977ba25454b) Signed-off-by: Dongjoon Hyun --- .../spark/sql/catalyst/analysis/Analyzer.scala | 20 ++--- .../sql/catalyst/analysis/AnalysisSuite.scala | 52 ++ 2 files changed, 66 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 3cb754d..eadcd0f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.analysis import java.util import java.util.Locale +import java.util.concurrent.atomic.AtomicBoolean import scala.collection.mutable import scala.collection.mutable.ArrayBuffer @@ -1795,6 +1796,7 @@ class Analyzer( * Replaces [[UnresolvedFunction]]s with concrete [[Expression]]s. */ object ResolveFunctions extends Rule[LogicalPlan] { +val trimWarningEnabled = new AtomicBoolean(true) def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { case q: LogicalPlan => q transformExpressions { @@ -1839,13 +1841,19 @@ class Analyzer( } AggregateExpression(agg, Complete, isDistinct, filter) // This function is not an aggregate function, just return the resolved one. -case other => - if (isDistinct || filter.isDefined) { -failAnalysis("DISTINCT or FILTER specified, " + - s"but ${other.prettyName} is not an aggregate function") - } else { -other +case other if (isDistinct || filter.isDefined) => + failAnalysis("DISTINCT or FILTER specified, " + +s"but ${other.prettyName} is not an aggregate function") +case e: String2TrimExpression if arguments.size == 2 => + if (trimWarningEnabled.get) { +log.warn("Two-parameter TRIM/LTRIM/RTRIM function signatures are deprecated." + + " Use SQL syntax `TRIM((BOTH | LEADING | TRAILING)? 
trimStr FROM str)`" + + " instead.") +trimWarningEnabled.set(false) } + e +case other => + other } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index d385133..8451b9b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -21,6 +21,7 @@ import java.util.{Locale, TimeZone} import scala.reflect.ClassTag +import org.apache.log4j.Level import org.scalatest.Matchers import org.apache.spark.api.python.PythonEvalType @@ -768,4 +769,55 @@ class AnalysisSuite extends AnalysisTest with Matchers { assert(message.startsWith(s"Max iterations ($maxIterations) reached for batch Resolution, " + s"please set '${SQLConf.ANALYZER_MAX_ITERATIONS.key}' to a larger value.")) } + + test("SPARK-30886 Deprecate two-parameter TRIM/LTRIM/RTRIM") { +Seq("trim
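As a usage sketch of the change above (string literals made up, assuming an existing SparkSession): the deprecated two-parameter call alongside the SQL-standard syntax that the new warning recommends.

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().master("local[*]").appName("trim-syntax-sketch").getOrCreate()

    // Deprecated two-parameter form (its argument order is exactly the confusion
    // SPARK-28093 dealt with); from 3.0 this logs the deprecation warning once.
    spark.sql("SELECT trim('xy', 'yxSparkxx')").show()

    // Recommended, unambiguous SQL-standard syntax from the warning message.
    spark.sql("SELECT trim(BOTH 'xy' FROM 'yxSparkxx')").show()
    spark.sql("SELECT trim(LEADING 'xy' FROM 'yxSparkxx')").show()
    spark.sql("SELECT trim(TRAILING 'xy' FROM 'yxSparkxx')").show()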
[spark] branch master updated (d705d36 -> afb84e9)
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

from d705d36  [SPARK-31045][SQL] Add config for AQE logging level
 add afb84e9  [SPARK-30886][SQL] Deprecate two-parameter TRIM/LTRIM/RTRIM functions

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/catalyst/analysis/Analyzer.scala | 20 ++---
 .../sql/catalyst/analysis/AnalysisSuite.scala | 52 ++
 2 files changed, 66 insertions(+), 6 deletions(-)

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31045][SQL] Add config for AQE logging level
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new db1c3fe [SPARK-31045][SQL] Add config for AQE logging level db1c3fe is described below commit db1c3feacdca04c1b72191269f4f3910ad05bcb4 Author: maryannxue AuthorDate: Fri Mar 6 11:41:45 2020 +0800 [SPARK-31045][SQL] Add config for AQE logging level ### What changes were proposed in this pull request? This PR adds an internal config for changing the logging level of adaptive execution query plan evolvement. ### Why are the changes needed? To make AQE debugging easier. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Added UT. Closes #27798 from maryannxue/aqe-log-level. Authored-by: maryannxue Signed-off-by: Wenchen Fan (cherry picked from commit d705d36c0c94d3a4684de6ca0f444557c3cec25e) Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/internal/SQLConf.scala| 12 ++ .../execution/adaptive/AdaptiveSparkPlanExec.scala | 12 +- .../adaptive/AdaptiveQueryExecSuite.scala | 47 ++ 3 files changed, 70 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index b2b3d12..cd465bc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -378,6 +378,16 @@ object SQLConf { .booleanConf .createWithDefault(false) + val ADAPTIVE_EXECUTION_LOG_LEVEL = buildConf("spark.sql.adaptive.logLevel") +.internal() +.doc("Configures the log level for adaptive execution logging of plan changes. The value " + + "can be 'trace', 'debug', 'info', 'warn', or 'error'. The default log level is 'debug'.") +.version("3.0.0") +.stringConf +.transform(_.toUpperCase(Locale.ROOT)) +.checkValues(Set("TRACE", "DEBUG", "INFO", "WARN", "ERROR")) +.createWithDefault("debug") + val ADVISORY_PARTITION_SIZE_IN_BYTES = buildConf("spark.sql.adaptive.advisoryPartitionSizeInBytes") .doc("The advisory size in bytes of the shuffle partition during adaptive optimization " + @@ -2428,6 +2438,8 @@ class SQLConf extends Serializable with Logging { def adaptiveExecutionEnabled: Boolean = getConf(ADAPTIVE_EXECUTION_ENABLED) + def adaptiveExecutionLogLevel: String = getConf(ADAPTIVE_EXECUTION_LOG_LEVEL) + def fetchShuffleBlocksInBatch: Boolean = getConf(FETCH_SHUFFLE_BLOCKS_IN_BATCH) def nonEmptyPartitionRatioForBroadcastJoin: Double = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index b74401e..fc88a7f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -66,6 +66,15 @@ case class AdaptiveSparkPlanExec( @transient private val lock = new Object() + @transient private val logOnLevel: ( => String) => Unit = conf.adaptiveExecutionLogLevel match { +case "TRACE" => logTrace(_) +case "DEBUG" => logDebug(_) +case "INFO" => logInfo(_) +case "WARN" => logWarning(_) +case "ERROR" => logError(_) +case _ => logDebug(_) + } + // The logical plan optimizer for re-optimizing the current logical plan. 
@transient private val optimizer = new RuleExecutor[LogicalPlan] { // TODO add more optimization rules @@ -204,6 +213,7 @@ case class AdaptiveSparkPlanExec( val newCost = costEvaluator.evaluateCost(newPhysicalPlan) if (newCost < origCost || (newCost == origCost && currentPhysicalPlan != newPhysicalPlan)) { + logOnLevel(s"Plan changed from $currentPhysicalPlan to $newPhysicalPlan") cleanUpTempTags(newPhysicalPlan) currentPhysicalPlan = newPhysicalPlan currentLogicalPlan = newLogicalPlan @@ -217,7 +227,7 @@ case class AdaptiveSparkPlanExec( currentPhysicalPlan = applyPhysicalRules(result.newPlan, queryStageOptimizerRules) isFinalPlan = true executionId.foreach(onUpdatePlan(_, Seq(currentPhysicalPlan))) - logDebug(s"Final plan: $currentPhysicalPlan") + logOnLevel(s"Final plan: $currentPhysicalPlan") } currentPhysicalPlan } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index a7fa63d..25b1f89 100644 --- a/sql/core/src/t
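A short sketch of using the new internal config from above: raising spark.sql.adaptive.logLevel from its "debug" default makes the "Plan changed from ... to ..." and "Final plan: ..." messages visible under typical log4j settings. The join below is only a made-up way to give AQE something to re-plan.

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().master("local[*]").appName("aqe-log-level-sketch").getOrCreate()

    spark.conf.set("spark.sql.adaptive.enabled", "true")
    spark.conf.set("spark.sql.adaptive.logLevel", "WARN")   // default: "debug"

    // Any query that AQE re-optimizes will now report its plan changes at WARN.
    val left = spark.range(1000).selectExpr("id AS k", "id AS v1")
    val right = spark.range(10).selectExpr("id AS k", "id AS v2")
    left.join(right, "k").count()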
[spark] branch master updated (d9254b2 -> d705d36)
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

from d9254b2  [SPARK-30841][SQL][DOC][FOLLOW-UP] Add version information to the configuration of SQL
 add d705d36  [SPARK-31045][SQL] Add config for AQE logging level

No new revisions were added by this update.

Summary of changes:
 .../org/apache/spark/sql/internal/SQLConf.scala | 12 ++
 .../execution/adaptive/AdaptiveSparkPlanExec.scala | 12 +-
 .../adaptive/AdaptiveQueryExecSuite.scala | 47 ++
 3 files changed, 70 insertions(+), 1 deletion(-)

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (e36227e -> d9254b2)
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

from e36227e  [SPARK-30914][CORE][DOC] Add version information to the configuration of UI
 add d9254b2  [SPARK-30841][SQL][DOC][FOLLOW-UP] Add version information to the configuration of SQL

No new revisions were added by this update.

Summary of changes:
 .../org/apache/spark/sql/internal/SQLConf.scala | 112 ++---
 1 file changed, 56 insertions(+), 56 deletions(-)

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (cf7c397 -> e36227e)
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

from cf7c397  [MINOR][SQL] Remove an ignored test from JsonSuite
 add e36227e  [SPARK-30914][CORE][DOC] Add version information to the configuration of UI

No new revisions were added by this update.

Summary of changes:
 .../org/apache/spark/internal/config/UI.scala | 26 ++
 docs/configuration.md | 10 +
 2 files changed, 36 insertions(+)

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-2.4 updated: [MINOR][SQL] Remove an ignored test from JsonSuite
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-2.4 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-2.4 by this push: new 7c237cc [MINOR][SQL] Remove an ignored test from JsonSuite 7c237cc is described below commit 7c237cc8f7d83d96aed282d799e8a42acdc9d623 Author: Maxim Gekk AuthorDate: Fri Mar 6 10:35:44 2020 +0900 [MINOR][SQL] Remove an ignored test from JsonSuite ### What changes were proposed in this pull request? Remove ignored and outdated test `Type conflict in primitive field values (Ignored)` from JsonSuite. ### Why are the changes needed? The test is not maintained for long time. It can be removed to reduce size of JsonSuite, and improve maintainability. ### Does this PR introduce any user-facing change? No ### How was this patch tested? By running the command `./build/sbt "test:testOnly *JsonV2Suite"` Closes #27795 from MaxGekk/remove-ignored-test-in-JsonSuite. Authored-by: Maxim Gekk Signed-off-by: HyukjinKwon (cherry picked from commit cf7c397ede05fd106697bd5cc8062f394623bf22) Signed-off-by: HyukjinKwon --- .../sql/execution/datasources/json/JsonSuite.scala | 53 -- 1 file changed, 53 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index 5ca430a..9fcd67a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -468,59 +468,6 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { ) } - ignore("Type conflict in primitive field values (Ignored)") { -val jsonDF = spark.read.json(primitiveFieldValueTypeConflict) -jsonDF.createOrReplaceTempView("jsonTable") - -// Right now, the analyzer does not promote strings in a boolean expression. -// Number and Boolean conflict: resolve the type as boolean in this query. -checkAnswer( - sql("select num_bool from jsonTable where NOT num_bool"), - Row(false) -) - -checkAnswer( - sql("select str_bool from jsonTable where NOT str_bool"), - Row(false) -) - -// Right now, the analyzer does not know that num_bool should be treated as a boolean. -// Number and Boolean conflict: resolve the type as boolean in this query. -checkAnswer( - sql("select num_bool from jsonTable where num_bool"), - Row(true) -) - -checkAnswer( - sql("select str_bool from jsonTable where str_bool"), - Row(false) -) - -// The plan of the following DSL is -// Project [(CAST(num_str#65:4, DoubleType) + 1.2) AS num#78] -// Filter (CAST(CAST(num_str#65:4, DoubleType), DecimalType) > 92233720368547758060) -//ExistingRdd [num_bool#61,num_num_1#62L,num_num_2#63,num_num_3#64,num_str#65,str_bool#66] -// We should directly cast num_str to DecimalType and also need to do the right type promotion -// in the Project. -checkAnswer( - jsonDF. -where('num_str >= BigDecimal("92233720368547758060")). -select(('num_str + 1.2).as("num")), - Row(new java.math.BigDecimal("92233720368547758071.2").doubleValue()) -) - -// The following test will fail. The type of num_str is StringType. -// So, to evaluate num_str + 1.2, we first need to use Cast to convert the type. -// In our test data, one value of num_str is 13.1. -// The result of (CAST(num_str#65:4, DoubleType) + 1.2) for this value is 14.299, -// which is not 14.3. 
-// Number and String conflict: resolve the type as number in this query. -checkAnswer( - sql("select num_str + 1.2 from jsonTable where num_str > 13"), - Row(BigDecimal("14.3")) :: Row(BigDecimal("92233720368547758071.2")) :: Nil -) - } - test("Type conflict in complex field values") { val jsonDF = spark.read.json(complexFieldValueTypeConflict) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [MINOR][SQL] Remove an ignored test from JsonSuite
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 6d8ee15 [MINOR][SQL] Remove an ignored test from JsonSuite 6d8ee15 is described below commit 6d8ee156adcac8fd2a442ab70da750f950a83661 Author: Maxim Gekk AuthorDate: Fri Mar 6 10:35:44 2020 +0900 [MINOR][SQL] Remove an ignored test from JsonSuite ### What changes were proposed in this pull request? Remove ignored and outdated test `Type conflict in primitive field values (Ignored)` from JsonSuite. ### Why are the changes needed? The test is not maintained for long time. It can be removed to reduce size of JsonSuite, and improve maintainability. ### Does this PR introduce any user-facing change? No ### How was this patch tested? By running the command `./build/sbt "test:testOnly *JsonV2Suite"` Closes #27795 from MaxGekk/remove-ignored-test-in-JsonSuite. Authored-by: Maxim Gekk Signed-off-by: HyukjinKwon (cherry picked from commit cf7c397ede05fd106697bd5cc8062f394623bf22) Signed-off-by: HyukjinKwon --- .../sql/execution/datasources/json/JsonSuite.scala | 53 -- 1 file changed, 53 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index df0bca7..fb3328c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -481,59 +481,6 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson ) } - ignore("Type conflict in primitive field values (Ignored)") { -val jsonDF = spark.read.json(primitiveFieldValueTypeConflict) -jsonDF.createOrReplaceTempView("jsonTable") - -// Right now, the analyzer does not promote strings in a boolean expression. -// Number and Boolean conflict: resolve the type as boolean in this query. -checkAnswer( - sql("select num_bool from jsonTable where NOT num_bool"), - Row(false) -) - -checkAnswer( - sql("select str_bool from jsonTable where NOT str_bool"), - Row(false) -) - -// Right now, the analyzer does not know that num_bool should be treated as a boolean. -// Number and Boolean conflict: resolve the type as boolean in this query. -checkAnswer( - sql("select num_bool from jsonTable where num_bool"), - Row(true) -) - -checkAnswer( - sql("select str_bool from jsonTable where str_bool"), - Row(false) -) - -// The plan of the following DSL is -// Project [(CAST(num_str#65:4, DoubleType) + 1.2) AS num#78] -// Filter (CAST(CAST(num_str#65:4, DoubleType), DecimalType) > 92233720368547758060) -//ExistingRdd [num_bool#61,num_num_1#62L,num_num_2#63,num_num_3#64,num_str#65,str_bool#66] -// We should directly cast num_str to DecimalType and also need to do the right type promotion -// in the Project. -checkAnswer( - jsonDF. -where('num_str >= BigDecimal("92233720368547758060")). -select(('num_str + 1.2).as("num")), - Row(new java.math.BigDecimal("92233720368547758071.2").doubleValue()) -) - -// The following test will fail. The type of num_str is StringType. -// So, to evaluate num_str + 1.2, we first need to use Cast to convert the type. -// In our test data, one value of num_str is 13.1. -// The result of (CAST(num_str#65:4, DoubleType) + 1.2) for this value is 14.299, -// which is not 14.3. 
-// Number and String conflict: resolve the type as number in this query. -checkAnswer( - sql("select num_str + 1.2 from jsonTable where num_str > 13"), - Row(BigDecimal("14.3")) :: Row(BigDecimal("92233720368547758071.2")) :: Nil -) - } - test("Type conflict in complex field values") { val jsonDF = spark.read.json(complexFieldValueTypeConflict) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (fc12165 -> cf7c397)
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

from fc12165  [SPARK-31036][SQL] Use stringArgs in Expression.toString to respect hidden parameters
 add cf7c397  [MINOR][SQL] Remove an ignored test from JsonSuite

No new revisions were added by this update.

Summary of changes:
 .../sql/execution/datasources/json/JsonSuite.scala | 53 --
 1 file changed, 53 deletions(-)

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31036][SQL] Use stringArgs in Expression.toString to respect hidden parameters
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 5220a1c [SPARK-31036][SQL] Use stringArgs in Expression.toString to respect hidden parameters 5220a1c is described below commit 5220a1c8756a8ddcf015001c7ab5d8fa02ab8692 Author: HyukjinKwon AuthorDate: Fri Mar 6 10:33:20 2020 +0900 [SPARK-31036][SQL] Use stringArgs in Expression.toString to respect hidden parameters ### What changes were proposed in this pull request? This PR proposes to respect hidden parameters by using `stringArgs` in `Expression.toString `. By this, we can show the strings properly in some cases such as `NonSQLExpression`. ### Why are the changes needed? To respect "hidden" arguments in the string representation. ### Does this PR introduce any user-facing change? Yes, for example, on the top of https://github.com/apache/spark/pull/27657, ```scala val identify = udf((input: Seq[Int]) => input) spark.range(10).select(identify(array("id"))).show() ``` shows hidden parameter `useStringTypeWhenEmpty`. ``` +-+ |UDF(array(id, false))| +-+ | [0]| | [1]| ... ``` whereas: ```scala spark.range(10).select(array("id")).show() ``` ``` +-+ |array(id)| +-+ | [0]| | [1]| ... ``` ### How was this patch tested? Manually tested as below: ```scala val identify = udf((input: Boolean) => input) spark.range(10).select(identify(exists(array(col("id")), _ % 2 === 0))).show() ``` Before: ``` +-+ |UDF(exists(array(id), lambdafunction(((lambda 'x % 2) = 0), lambda 'x, false), true))| +-+ | true| | false| | true| ... ``` After: ``` +---+ |UDF(exists(array(id), lambdafunction(((lambda 'x % 2) = 0), lambda 'x, false)))| +---+ | true| | false| | true| ... ``` Closes #27788 from HyukjinKwon/arguments-str-repr. 
Authored-by: HyukjinKwon Signed-off-by: HyukjinKwon (cherry picked from commit fc12165f48b2e1dfe04116ddaa6ff6e8650a18fb) Signed-off-by: HyukjinKwon --- .../scala/org/apache/spark/sql/catalyst/expressions/Expression.scala| 2 +- .../apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala| 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index 4632957..1599321 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -260,7 +260,7 @@ abstract class Expression extends TreeNode[Expression] { */ def prettyName: String = nodeName.toLowerCase(Locale.ROOT) - protected def flatArguments: Iterator[Any] = productIterator.flatMap { + protected def flatArguments: Iterator[Any] = stringArgs.flatMap { case t: Iterable[_] => t case single => single :: Nil } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala index 9dd4263..e91bd0c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala @@ -533,6 +533,8 @@ case class ArrayExists( SQLConf.get.getConf(SQLConf.LEGACY_ARRAY_EXISTS_FOLLOWS_THREE_VALUED_LOGIC)) } + override def stringArgs: Iterator[Any] = super.stringArgs.take(2) + override def nullable: Boolean = if (followThreeValuedL
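The fix above boils down to toString drawing its argument list from stringArgs (which a subclass can narrow) instead of productIterator. Below is a standalone Scala sketch of that pattern; it is deliberately not Spark's real Expression/TreeNode hierarchy, and the class and field names are made up.

    // Standalone illustration of the stringArgs pattern.
    abstract class Expr extends Product {
      // By default every constructor argument shows up in toString...
      protected def stringArgs: Iterator[Any] = productIterator
      def prettyName: String = getClass.getSimpleName.toLowerCase
      override def toString: String = stringArgs.mkString(s"$prettyName(", ", ", ")")
    }

    // ...but a subclass can hide "internal" parameters by overriding stringArgs.
    case class ArrayExistsLike(arg: String, func: String, followThreeValuedLogic: Boolean)
        extends Expr {
      override protected def stringArgs: Iterator[Any] = super.stringArgs.take(2)
    }

    object StringArgsDemo {
      def main(args: Array[String]): Unit = {
        // Prints "arrayexistslike(ids, isEven)"; the boolean flag stays hidden.
        println(ArrayExistsLike("ids", "isEven", followThreeValuedLogic = true))
      }
    }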
[spark] branch master updated (72b52a3 -> fc12165)
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 72b52a3 [SPARK-30563][SQL] Disable using commit coordinator with NoopDataSource add fc12165 [SPARK-31036][SQL] Use stringArgs in Expression.toString to respect hidden parameters No new revisions were added by this update. Summary of changes: .../scala/org/apache/spark/sql/catalyst/expressions/Expression.scala| 2 +- .../apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala| 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-2.4 updated: [MINOR][SQL] Remove an ignored test from JsonSuite
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-2.4 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-2.4 by this push: new 7c237cc [MINOR][SQL] Remove an ignored test from JsonSuite 7c237cc is described below commit 7c237cc8f7d83d96aed282d799e8a42acdc9d623 Author: Maxim Gekk AuthorDate: Fri Mar 6 10:35:44 2020 +0900 [MINOR][SQL] Remove an ignored test from JsonSuite ### What changes were proposed in this pull request? Remove ignored and outdated test `Type conflict in primitive field values (Ignored)` from JsonSuite. ### Why are the changes needed? The test is not maintained for long time. It can be removed to reduce size of JsonSuite, and improve maintainability. ### Does this PR introduce any user-facing change? No ### How was this patch tested? By running the command `./build/sbt "test:testOnly *JsonV2Suite"` Closes #27795 from MaxGekk/remove-ignored-test-in-JsonSuite. Authored-by: Maxim Gekk Signed-off-by: HyukjinKwon (cherry picked from commit cf7c397ede05fd106697bd5cc8062f394623bf22) Signed-off-by: HyukjinKwon --- .../sql/execution/datasources/json/JsonSuite.scala | 53 -- 1 file changed, 53 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index 5ca430a..9fcd67a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -468,59 +468,6 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { ) } - ignore("Type conflict in primitive field values (Ignored)") { -val jsonDF = spark.read.json(primitiveFieldValueTypeConflict) -jsonDF.createOrReplaceTempView("jsonTable") - -// Right now, the analyzer does not promote strings in a boolean expression. -// Number and Boolean conflict: resolve the type as boolean in this query. -checkAnswer( - sql("select num_bool from jsonTable where NOT num_bool"), - Row(false) -) - -checkAnswer( - sql("select str_bool from jsonTable where NOT str_bool"), - Row(false) -) - -// Right now, the analyzer does not know that num_bool should be treated as a boolean. -// Number and Boolean conflict: resolve the type as boolean in this query. -checkAnswer( - sql("select num_bool from jsonTable where num_bool"), - Row(true) -) - -checkAnswer( - sql("select str_bool from jsonTable where str_bool"), - Row(false) -) - -// The plan of the following DSL is -// Project [(CAST(num_str#65:4, DoubleType) + 1.2) AS num#78] -// Filter (CAST(CAST(num_str#65:4, DoubleType), DecimalType) > 92233720368547758060) -//ExistingRdd [num_bool#61,num_num_1#62L,num_num_2#63,num_num_3#64,num_str#65,str_bool#66] -// We should directly cast num_str to DecimalType and also need to do the right type promotion -// in the Project. -checkAnswer( - jsonDF. -where('num_str >= BigDecimal("92233720368547758060")). -select(('num_str + 1.2).as("num")), - Row(new java.math.BigDecimal("92233720368547758071.2").doubleValue()) -) - -// The following test will fail. The type of num_str is StringType. -// So, to evaluate num_str + 1.2, we first need to use Cast to convert the type. -// In our test data, one value of num_str is 13.1. -// The result of (CAST(num_str#65:4, DoubleType) + 1.2) for this value is 14.299, -// which is not 14.3. 
-// Number and String conflict: resolve the type as number in this query. -checkAnswer( - sql("select num_str + 1.2 from jsonTable where num_str > 13"), - Row(BigDecimal("14.3")) :: Row(BigDecimal("92233720368547758071.2")) :: Nil -) - } - test("Type conflict in complex field values") { val jsonDF = spark.read.json(complexFieldValueTypeConflict) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
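For context on what the removed (and long-ignored) test above was exercising, here is a hedged, minimal sketch of current behavior when a JSON field mixes numeric and string values: schema inference falls back to StringType for the conflicting field, which is part of why the old numeric-promotion expectations no longer applied. The dataset and session are illustrative, not the suite's TestJsonData fixtures.

```scala
import org.apache.spark.sql.SparkSession

object JsonTypeConflictSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("json-type-conflict")
      .getOrCreate()
    import spark.implicits._

    // One record stores num_str as a number, the other as a string.
    val conflicting = Seq(
      """{"num_str": 13.1}""",
      """{"num_str": "92233720368547758070"}"""
    ).toDS()

    val df = spark.read.json(conflicting)
    df.printSchema()            // num_str is inferred as string because the types conflict
    df.show(truncate = false)

    spark.stop()
  }
}
```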
[spark] branch branch-3.0 updated: [MINOR][SQL] Remove an ignored test from JsonSuite
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 6d8ee15 [MINOR][SQL] Remove an ignored test from JsonSuite 6d8ee15 is described below commit 6d8ee156adcac8fd2a442ab70da750f950a83661 Author: Maxim Gekk AuthorDate: Fri Mar 6 10:35:44 2020 +0900 [MINOR][SQL] Remove an ignored test from JsonSuite ### What changes were proposed in this pull request? Remove ignored and outdated test `Type conflict in primitive field values (Ignored)` from JsonSuite. ### Why are the changes needed? The test is not maintained for long time. It can be removed to reduce size of JsonSuite, and improve maintainability. ### Does this PR introduce any user-facing change? No ### How was this patch tested? By running the command `./build/sbt "test:testOnly *JsonV2Suite"` Closes #27795 from MaxGekk/remove-ignored-test-in-JsonSuite. Authored-by: Maxim Gekk Signed-off-by: HyukjinKwon (cherry picked from commit cf7c397ede05fd106697bd5cc8062f394623bf22) Signed-off-by: HyukjinKwon --- .../sql/execution/datasources/json/JsonSuite.scala | 53 -- 1 file changed, 53 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index df0bca7..fb3328c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -481,59 +481,6 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson ) } - ignore("Type conflict in primitive field values (Ignored)") { -val jsonDF = spark.read.json(primitiveFieldValueTypeConflict) -jsonDF.createOrReplaceTempView("jsonTable") - -// Right now, the analyzer does not promote strings in a boolean expression. -// Number and Boolean conflict: resolve the type as boolean in this query. -checkAnswer( - sql("select num_bool from jsonTable where NOT num_bool"), - Row(false) -) - -checkAnswer( - sql("select str_bool from jsonTable where NOT str_bool"), - Row(false) -) - -// Right now, the analyzer does not know that num_bool should be treated as a boolean. -// Number and Boolean conflict: resolve the type as boolean in this query. -checkAnswer( - sql("select num_bool from jsonTable where num_bool"), - Row(true) -) - -checkAnswer( - sql("select str_bool from jsonTable where str_bool"), - Row(false) -) - -// The plan of the following DSL is -// Project [(CAST(num_str#65:4, DoubleType) + 1.2) AS num#78] -// Filter (CAST(CAST(num_str#65:4, DoubleType), DecimalType) > 92233720368547758060) -//ExistingRdd [num_bool#61,num_num_1#62L,num_num_2#63,num_num_3#64,num_str#65,str_bool#66] -// We should directly cast num_str to DecimalType and also need to do the right type promotion -// in the Project. -checkAnswer( - jsonDF. -where('num_str >= BigDecimal("92233720368547758060")). -select(('num_str + 1.2).as("num")), - Row(new java.math.BigDecimal("92233720368547758071.2").doubleValue()) -) - -// The following test will fail. The type of num_str is StringType. -// So, to evaluate num_str + 1.2, we first need to use Cast to convert the type. -// In our test data, one value of num_str is 13.1. -// The result of (CAST(num_str#65:4, DoubleType) + 1.2) for this value is 14.299, -// which is not 14.3. 
-// Number and String conflict: resolve the type as number in this query. -checkAnswer( - sql("select num_str + 1.2 from jsonTable where num_str > 13"), - Row(BigDecimal("14.3")) :: Row(BigDecimal("92233720368547758071.2")) :: Nil -) - } - test("Type conflict in complex field values") { val jsonDF = spark.read.json(complexFieldValueTypeConflict) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (fc12165 -> cf7c397)
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from fc12165 [SPARK-31036][SQL] Use stringArgs in Expression.toString to respect hidden parameters add cf7c397 [MINOR][SQL] Remove an ignored test from JsonSuite No new revisions were added by this update. Summary of changes: .../sql/execution/datasources/json/JsonSuite.scala | 53 -- 1 file changed, 53 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-30563][SQL] Disable using commit coordinator with NoopDataSource
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 6a7aa0d [SPARK-30563][SQL] Disable using commit coordinator with NoopDataSource 6a7aa0d is described below commit 6a7aa0d1666aa57213ef16cc858dbf105f0810f4 Author: Peter Toth AuthorDate: Fri Mar 6 10:30:59 2020 +0900 [SPARK-30563][SQL] Disable using commit coordinator with NoopDataSource ### What changes were proposed in this pull request? This PR disables using commit coordinator with `NoopDataSource`. ### Why are the changes needed? No need for a coordinator in benchmarks. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Existing UTs. Closes #27791 from peter-toth/SPARK-30563-disalbe-commit-coordinator. Authored-by: Peter Toth Signed-off-by: HyukjinKwon (cherry picked from commit 72b52a3cdf2b8661c0f82db645e805ff6b085dfc) Signed-off-by: HyukjinKwon --- .../org/apache/spark/sql/execution/datasources/noop/NoopDataSource.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/noop/NoopDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/noop/NoopDataSource.scala index 4fad0a2..851cc51 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/noop/NoopDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/noop/NoopDataSource.scala @@ -61,6 +61,7 @@ private[noop] object NoopWriteBuilder extends WriteBuilder with SupportsTruncate private[noop] object NoopBatchWrite extends BatchWrite { override def createBatchWriterFactory(info: PhysicalWriteInfo): DataWriterFactory = NoopWriterFactory + override def useCommitCoordinator(): Boolean = false override def commit(messages: Array[WriterCommitMessage]): Unit = {} override def abort(messages: Array[WriterCommitMessage]): Unit = {} } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
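As a hedged usage sketch (not part of the patch itself): the noop sink exists mainly for benchmarking, where it forces full execution of the write-side plan while discarding every row, so the commit coordinator round trip buys nothing. Assuming a local SparkSession on Spark 3.0, a typical benchmark-style write looks like this:

```scala
import org.apache.spark.sql.SparkSession

object NoopSinkSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("noop-sink-demo")
      .getOrCreate()

    // The "noop" format runs the whole query but writes nothing, which is
    // exactly the benchmarking use case the commit coordinator was overhead for.
    spark.range(0, 10000000L)
      .selectExpr("id", "id % 7 AS bucket")
      .write
      .format("noop")
      .mode("overwrite")
      .save()

    spark.stop()
  }
}
```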
[spark] branch master updated (ffec7a1 -> 72b52a3)
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from ffec7a1 [SQL][DOCS][MINOR] Fix typos and wrong phrases in docs add 72b52a3 [SPARK-30563][SQL] Disable using commit coordinator with NoopDataSource No new revisions were added by this update. Summary of changes: .../org/apache/spark/sql/execution/datasources/noop/NoopDataSource.scala | 1 + 1 file changed, 1 insertion(+) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SQL][DOCS][MINOR] Fix typos and wrong phrases in docs
This is an automated email from the ASF dual-hosted git repository. gengliang pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new ed7924a [SQL][DOCS][MINOR] Fix typos and wrong phrases in docs ed7924a is described below commit ed7924a86e2c4dd6b38075de68038e1997b2b30e Author: Takeshi Yamamuro AuthorDate: Thu Mar 5 16:54:59 2020 -0800 [SQL][DOCS][MINOR] Fix typos and wrong phrases in docs ### What changes were proposed in this pull request? This PR intends to fix typos and phrases in the `/docs` directory. To find them, I run the Intellij typo checker. ### Why are the changes needed? For better documents. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? N/A Closes #27819 from maropu/TypoFix-20200306. Authored-by: Takeshi Yamamuro Signed-off-by: Gengliang Wang (cherry picked from commit ffec7a1964984a7f911ac0da125134c098f273ba) Signed-off-by: Gengliang Wang --- docs/sql-pyspark-pandas-with-arrow.md| 2 +- docs/sql-ref-ansi-compliance.md | 2 +- docs/sql-ref-null-semantics.md | 2 +- docs/sql-ref-syntax-aux-show-functions.md| 2 +- docs/sql-ref-syntax-ddl-alter-table.md | 2 +- docs/sql-ref-syntax-ddl-create-table-like.md | 2 +- docs/sql-ref-syntax-qry-select-limit.md | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/sql-pyspark-pandas-with-arrow.md b/docs/sql-pyspark-pandas-with-arrow.md index 63ba0ba..e8abb9f 100644 --- a/docs/sql-pyspark-pandas-with-arrow.md +++ b/docs/sql-pyspark-pandas-with-arrow.md @@ -91,7 +91,7 @@ specify the type hints of `pandas.Series` and `pandas.DataFrame` as below: -In the following sections, it describes the cominations of the supported type hints. For simplicity, +In the following sections, it describes the combinations of the supported type hints. For simplicity, `pandas.DataFrame` variant is omitted. ### Series to Series diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md index 267184a..27e60b4 100644 --- a/docs/sql-ref-ansi-compliance.md +++ b/docs/sql-ref-ansi-compliance.md @@ -60,7 +60,7 @@ The following subsections present behaviour changes in arithmetic operations, ty ### Arithmetic Operations In Spark SQL, arithmetic operations performed on numeric types (with the exception of decimal) are not checked for overflows by default. -This means that in case an operation causes overflows, the result is the same that the same operation returns in a Java/Scala program (e.g., if the sum of 2 integers is higher than the maximum value representable, the result is a negative number). +This means that in case an operation causes overflows, the result is the same with the corresponding operation in a Java/Scala program (e.g., if the sum of 2 integers is higher than the maximum value representable, the result is a negative number). On the other hand, Spark SQL returns null for decimal overflows. When `spark.sql.ansi.enabled` is set to `true` and an overflow occurs in numeric and interval arithmetic operations, it throws an arithmetic exception at runtime. diff --git a/docs/sql-ref-null-semantics.md b/docs/sql-ref-null-semantics.md index 3cbc15c..37b4081 100644 --- a/docs/sql-ref-null-semantics.md +++ b/docs/sql-ref-null-semantics.md @@ -605,7 +605,7 @@ SELECT name, age FROM unknown_age; In Spark, EXISTS and NOT EXISTS expressions are allowed inside a WHERE clause. These are boolean expressions which return either `TRUE` or `FALSE`. 
In other words, EXISTS is a membership condition and returns `TRUE` -when the subquery it refers to returns one or more rows. Similary, NOT EXISTS +when the subquery it refers to returns one or more rows. Similarly, NOT EXISTS is a non-membership condition and returns TRUE when no rows or zero rows are returned from the subquery. diff --git a/docs/sql-ref-syntax-aux-show-functions.md b/docs/sql-ref-syntax-aux-show-functions.md index 701d427..d6f9df9 100644 --- a/docs/sql-ref-syntax-aux-show-functions.md +++ b/docs/sql-ref-syntax-aux-show-functions.md @@ -22,7 +22,7 @@ license: | ### Description Returns the list of functions after applying an optional regex pattern. Given number of functions supported by Spark is quite large, this statement -in conjuction with [describe function](sql-ref-syntax-aux-describe-function.html) +in conjunction with [describe function](sql-ref-syntax-aux-describe-function.html) may be used to quickly find the function and understand its usage. The `LIKE` clause is optional and supported only for compatibility with other systems. diff --git a/docs/sql-ref-syntax-ddl-alter-table.md b/docs/sql-ref-syntax-ddl-alter-table.md index a921478..373fa8d 100644 --- a/docs/sql-ref-syntax-ddl-alter-table.md +++ b/do
[spark] branch master updated: [SQL][DOCS][MINOR] Fix typos and wrong phrases in docs
This is an automated email from the ASF dual-hosted git repository. gengliang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new ffec7a1 [SQL][DOCS][MINOR] Fix typos and wrong phrases in docs ffec7a1 is described below commit ffec7a1964984a7f911ac0da125134c098f273ba Author: Takeshi Yamamuro AuthorDate: Thu Mar 5 16:54:59 2020 -0800 [SQL][DOCS][MINOR] Fix typos and wrong phrases in docs ### What changes were proposed in this pull request? This PR intends to fix typos and phrases in the `/docs` directory. To find them, I run the Intellij typo checker. ### Why are the changes needed? For better documents. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? N/A Closes #27819 from maropu/TypoFix-20200306. Authored-by: Takeshi Yamamuro Signed-off-by: Gengliang Wang --- docs/sql-pyspark-pandas-with-arrow.md| 2 +- docs/sql-ref-ansi-compliance.md | 2 +- docs/sql-ref-null-semantics.md | 2 +- docs/sql-ref-syntax-aux-show-functions.md| 2 +- docs/sql-ref-syntax-ddl-alter-table.md | 2 +- docs/sql-ref-syntax-ddl-create-table-like.md | 2 +- docs/sql-ref-syntax-qry-select-limit.md | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/sql-pyspark-pandas-with-arrow.md b/docs/sql-pyspark-pandas-with-arrow.md index 63ba0ba..e8abb9f 100644 --- a/docs/sql-pyspark-pandas-with-arrow.md +++ b/docs/sql-pyspark-pandas-with-arrow.md @@ -91,7 +91,7 @@ specify the type hints of `pandas.Series` and `pandas.DataFrame` as below: -In the following sections, it describes the cominations of the supported type hints. For simplicity, +In the following sections, it describes the combinations of the supported type hints. For simplicity, `pandas.DataFrame` variant is omitted. ### Series to Series diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md index 267184a..27e60b4 100644 --- a/docs/sql-ref-ansi-compliance.md +++ b/docs/sql-ref-ansi-compliance.md @@ -60,7 +60,7 @@ The following subsections present behaviour changes in arithmetic operations, ty ### Arithmetic Operations In Spark SQL, arithmetic operations performed on numeric types (with the exception of decimal) are not checked for overflows by default. -This means that in case an operation causes overflows, the result is the same that the same operation returns in a Java/Scala program (e.g., if the sum of 2 integers is higher than the maximum value representable, the result is a negative number). +This means that in case an operation causes overflows, the result is the same with the corresponding operation in a Java/Scala program (e.g., if the sum of 2 integers is higher than the maximum value representable, the result is a negative number). On the other hand, Spark SQL returns null for decimal overflows. When `spark.sql.ansi.enabled` is set to `true` and an overflow occurs in numeric and interval arithmetic operations, it throws an arithmetic exception at runtime. diff --git a/docs/sql-ref-null-semantics.md b/docs/sql-ref-null-semantics.md index 3cbc15c..37b4081 100644 --- a/docs/sql-ref-null-semantics.md +++ b/docs/sql-ref-null-semantics.md @@ -605,7 +605,7 @@ SELECT name, age FROM unknown_age; In Spark, EXISTS and NOT EXISTS expressions are allowed inside a WHERE clause. These are boolean expressions which return either `TRUE` or `FALSE`. In other words, EXISTS is a membership condition and returns `TRUE` -when the subquery it refers to returns one or more rows. 
Similary, NOT EXISTS +when the subquery it refers to returns one or more rows. Similarly, NOT EXISTS is a non-membership condition and returns TRUE when no rows or zero rows are returned from the subquery. diff --git a/docs/sql-ref-syntax-aux-show-functions.md b/docs/sql-ref-syntax-aux-show-functions.md index 701d427..d6f9df9 100644 --- a/docs/sql-ref-syntax-aux-show-functions.md +++ b/docs/sql-ref-syntax-aux-show-functions.md @@ -22,7 +22,7 @@ license: | ### Description Returns the list of functions after applying an optional regex pattern. Given number of functions supported by Spark is quite large, this statement -in conjuction with [describe function](sql-ref-syntax-aux-describe-function.html) +in conjunction with [describe function](sql-ref-syntax-aux-describe-function.html) may be used to quickly find the function and understand its usage. The `LIKE` clause is optional and supported only for compatibility with other systems. diff --git a/docs/sql-ref-syntax-ddl-alter-table.md b/docs/sql-ref-syntax-ddl-alter-table.md index a921478..373fa8d 100644 --- a/docs/sql-ref-syntax-ddl-alter-table.md +++ b/docs/sql-ref-syntax-ddl-alter-table.md @@ -260,7 +260,7 @@ ALTER TABLE dbx.tab1 PARTITION (a='1', b='2') SET LOCATION
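The arithmetic-overflow passage reworded in sql-ref-ansi-compliance.md above is easy to check interactively. A minimal sketch, assuming a running SparkSession named `spark` (the session name and the literal values are illustrative, not part of the patch):

```scala
// Non-ANSI default: integer arithmetic is not checked for overflow, so the result wraps
// around exactly as the corresponding operation would in a Java/Scala program.
spark.sql("SELECT 2147483647 + 1").show()          // -2147483648

// With ANSI mode enabled, the same overflow is expected to raise an arithmetic exception
// at runtime instead of returning a wrapped value.
spark.conf.set("spark.sql.ansi.enabled", "true")
spark.sql("SELECT 2147483647 + 1").show()          // throws at runtime
```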
[spark] branch branch-3.0 updated: [SPARK-30994][BUILD][FOLLOW-UP] Change scope of xml-apis to include it and add xerces in SBT as dependency override
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 32dc6ac [SPARK-30994][BUILD][FOLLOW-UP] Change scope of xml-apis to include it and add xerces in SBT as dependency override 32dc6ac is described below commit 32dc6acdf5615cdd668fa65249d6d170692acb19 Author: HyukjinKwon AuthorDate: Fri Mar 6 09:39:02 2020 +0900 [SPARK-30994][BUILD][FOLLOW-UP] Change scope of xml-apis to include it and add xerces in SBT as dependency override ### What changes were proposed in this pull request? This PR propose 1. Explicitly include xml-apis. xml-apis is already the part of xerces 2.12.0 (https://repo1.maven.org/maven2/xerces/xercesImpl/2.12.0/xercesImpl-2.12.0.pom). However, we're excluding it by setting `scope` to `test`. This seems causing `spark-shell`, built from Maven, to fail. Seems like previously xml-apis wasn't reached for some reasons but after we upgrade, it seems requiring. Therefore, this PR proposes to include it. 2. Pins `xerces` version in SBT as well. Seems this dependency is resolved differently from Maven. Note that Hadoop 3 does not looks requiring this as they replaced xerces as of [HDFS-12221](https://issues.apache.org/jira/browse/HDFS-12221). ### Why are the changes needed? To make `spark-shell` working from Maven build, and uses the same xerces version. ### Does this PR introduce any user-facing change? No, it's master only. ### How was this patch tested? **1.** ```bash ./build/mvn -DskipTests -Psparkr -Phive clean package ./bin/spark-shell ``` Before: ``` Exception in thread "main" java.lang.NoClassDefFoundError: org/w3c/dom/ElementTraversal at java.lang.ClassLoader.defineClass1(Native Method) at java.lang.ClassLoader.defineClass(ClassLoader.java:763) at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142) at java.net.URLClassLoader.defineClass(URLClassLoader.java:468) at java.net.URLClassLoader.access$100(URLClassLoader.java:74) at java.net.URLClassLoader$1.run(URLClassLoader.java:369) at java.net.URLClassLoader$1.run(URLClassLoader.java:363) at java.security.AccessController.doPrivileged(Native Method) at java.net.URLClassLoader.findClass(URLClassLoader.java:362) at java.lang.ClassLoader.loadClass(ClassLoader.java:424) at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349) at java.lang.ClassLoader.loadClass(ClassLoader.java:357) at org.apache.xerces.parsers.AbstractDOMParser.startDocument(Unknown Source) at org.apache.xerces.xinclude.XIncludeHandler.startDocument(Unknown Source) at org.apache.xerces.impl.dtd.XMLDTDValidator.startDocument(Unknown Source) at org.apache.xerces.impl.XMLDocumentScannerImpl.startEntity(Unknown Source) at org.apache.xerces.impl.XMLVersionDetector.startDocumentParsing(Unknown Source) at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source) at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source) at org.apache.xerces.parsers.XMLParser.parse(Unknown Source) at org.apache.xerces.parsers.DOMParser.parse(Unknown Source) at org.apache.xerces.jaxp.DocumentBuilderImpl.parse(Unknown Source) at javax.xml.parsers.DocumentBuilder.parse(DocumentBuilder.java:150) at org.apache.hadoop.conf.Configuration.parse(Configuration.java:2482) at org.apache.hadoop.conf.Configuration.parse(Configuration.java:2470) at org.apache.hadoop.conf.Configuration.loadResource(Configuration.java:2541) at 
org.apache.hadoop.conf.Configuration.loadResources(Configuration.java:2494) at org.apache.hadoop.conf.Configuration.getProps(Configuration.java:2407) at org.apache.hadoop.conf.Configuration.set(Configuration.java:1143) at org.apache.hadoop.conf.Configuration.set(Configuration.java:1115) at org.apache.spark.deploy.SparkHadoopUtil$.org$apache$spark$deploy$SparkHadoopUtil$$appendS3AndSparkHadoopHiveConfigurations(SparkHadoopUtil.scala:456) at org.apache.spark.deploy.SparkHadoopUtil$.newConfiguration(SparkHadoopUtil.scala:427) at org.apache.spark.deploy.SparkSubmit.$anonfun$prepareSubmitEnvironment$2(SparkSubmit.scala:342) at scala.Option.getOrElse(Option.scala:189) at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:342) at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:871) at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180) at org.apache.spark.deploy.SparkSubmit.su
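For reference, the SBT pinning described in point 2 above amounts to a dependency override along these lines. This is a rough sketch only, with coordinates taken from the xerces 2.12.0 POM mentioned in the description rather than copied from the actual SBT build:

```scala
// Hypothetical build.sbt fragment, not the real project/SparkBuild.scala change:
// force a single xerces version so SBT resolves the same xercesImpl (and the
// xml-apis it brings in) as the Maven build does.
dependencyOverrides += "xerces" % "xercesImpl" % "2.12.0"
```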
[spark] branch master updated (fe126a6 -> 5b3277f)
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from fe126a6 [SPARK-31058][SQL][TEST-HIVE1.2] Consolidate the implementation of `quoteIfNeeded` add 5b3277f [SPARK-30994][BUILD][FOLLOW-UP] Change scope of xml-apis to include it and add xerces in SBT as dependency override No new revisions were added by this update. Summary of changes: dev/deps/spark-deps-hadoop-2.7-hive-1.2 | 1 + dev/deps/spark-deps-hadoop-2.7-hive-2.3 | 1 + pom.xml | 1 - project/SparkBuild.scala| 1 + 4 files changed, 3 insertions(+), 1 deletion(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31058][SQL][TEST-HIVE1.2] Consolidate the implementation of `quoteIfNeeded`
This is an automated email from the ASF dual-hosted git repository. dbtsai pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 853f69a [SPARK-31058][SQL][TEST-HIVE1.2] Consolidate the implementation of `quoteIfNeeded` 853f69a is described below commit 853f69a4cf4bf138afb9075325ad175bc5f2d334 Author: DB Tsai AuthorDate: Fri Mar 6 00:13:57 2020 + [SPARK-31058][SQL][TEST-HIVE1.2] Consolidate the implementation of `quoteIfNeeded` ### What changes were proposed in this pull request? There are two implementation of quoteIfNeeded. One is in `org.apache.spark.sql.connector.catalog.CatalogV2Implicits.quote` and the other is in `OrcFiltersBase.quoteAttributeNameIfNeeded`. This PR will consolidate them into one. ### Why are the changes needed? Simplify the codebase. ### Does this PR introduce any user-facing change? No ### How was this patch tested? Existing UTs. Closes #27814 from dbtsai/SPARK-31058. Authored-by: DB Tsai Signed-off-by: DB Tsai (cherry picked from commit fe126a6a05b0de8db68c8f890e876ce96bf194ca) Signed-off-by: DB Tsai --- .../spark/sql/connector/catalog/IdentifierImpl.java | 2 +- .../sql/connector/catalog/CatalogV2Implicits.scala | 10 +- .../spark/sql/connector/catalog/V1Table.scala | 13 +++-- .../catalyst/analysis/ResolveSessionCatalog.scala | 2 +- .../execution/datasources/orc/OrcFiltersBase.scala | 10 -- .../sql/execution/datasources/orc/OrcFilters.scala | 21 - .../sql/execution/datasources/orc/OrcFilters.scala | 21 - 7 files changed, 34 insertions(+), 45 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/IdentifierImpl.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/IdentifierImpl.java index a56007b..30596d9 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/IdentifierImpl.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/IdentifierImpl.java @@ -55,7 +55,7 @@ class IdentifierImpl implements Identifier { @Override public String toString() { return Stream.concat(Stream.of(namespace), Stream.of(name)) - .map(CatalogV2Implicits::quote) + .map(CatalogV2Implicits::quoteIfNeeded) .collect(Collectors.joining(".")); } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Implicits.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Implicits.scala index 3478af8..71bab62 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Implicits.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Implicits.scala @@ -84,15 +84,15 @@ private[sql] object CatalogV2Implicits { } implicit class NamespaceHelper(namespace: Array[String]) { -def quoted: String = namespace.map(quote).mkString(".") +def quoted: String = namespace.map(quoteIfNeeded).mkString(".") } implicit class IdentifierHelper(ident: Identifier) { def quoted: String = { if (ident.namespace.nonEmpty) { -ident.namespace.map(quote).mkString(".") + "." + quote(ident.name) +ident.namespace.map(quoteIfNeeded).mkString(".") + "." 
+ quoteIfNeeded(ident.name) } else { -quote(ident.name) +quoteIfNeeded(ident.name) } } @@ -122,10 +122,10 @@ private[sql] object CatalogV2Implicits { s"$quoted is not a valid TableIdentifier as it has more than 2 name parts.") } -def quoted: String = parts.map(quote).mkString(".") +def quoted: String = parts.map(quoteIfNeeded).mkString(".") } - def quote(part: String): String = { + def quoteIfNeeded(part: String): String = { if (part.contains(".") || part.contains("`")) { s"`${part.replace("`", "``")}`" } else { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/V1Table.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/V1Table.scala index 91e0c58..70fc968 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/V1Table.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/V1Table.scala @@ -24,6 +24,7 @@ import scala.collection.mutable import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.CatalogTable +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.quoteIfNeeded import org.apache.spark.sql.connector.expressions.{LogicalExpressions, Transform} import org.apache.spark.sql.types.StructType @@ -35,20 +36,12 @@ private[sql] case class V1Table(v1Table: CatalogTable) extends Table { def quoted: String = { identif
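The consolidated helper itself is small; below is a self-contained restatement of the quoting rule visible in the CatalogV2Implicits diff above, for readers who want to try it outside Spark:

```scala
// Quote a name part only when needed: parts containing a dot or a backtick are wrapped
// in backticks, with embedded backticks doubled; everything else is left untouched.
def quoteIfNeeded(part: String): String = {
  if (part.contains(".") || part.contains("`")) {
    s"`${part.replace("`", "``")}`"
  } else {
    part
  }
}

quoteIfNeeded("col1")                                     // col1
quoteIfNeeded("my.col")                                   // `my.col`
Seq("db", "weird`name").map(quoteIfNeeded).mkString(".")  // db.`weird``name`
```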
[spark] branch master updated (8d5ef2f -> fe126a6)
This is an automated email from the ASF dual-hosted git repository. dbtsai pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 8d5ef2f [SPARK-31052][TEST][CORE] Fix flaky test "DAGSchedulerSuite.shuffle fetch failed on speculative task, but original task succeed" add fe126a6 [SPARK-31058][SQL][TEST-HIVE1.2] Consolidate the implementation of `quoteIfNeeded` No new revisions were added by this update. Summary of changes: .../spark/sql/connector/catalog/IdentifierImpl.java | 2 +- .../sql/connector/catalog/CatalogV2Implicits.scala | 10 +- .../spark/sql/connector/catalog/V1Table.scala | 13 +++-- .../catalyst/analysis/ResolveSessionCatalog.scala | 2 +- .../execution/datasources/orc/OrcFiltersBase.scala | 10 -- .../sql/execution/datasources/orc/OrcFilters.scala | 21 - .../sql/execution/datasources/orc/OrcFilters.scala | 21 - 7 files changed, 34 insertions(+), 45 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated (80a8947 -> 515eb9d)
This is an automated email from the ASF dual-hosted git repository. gengliang pushed a change to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git. from 80a8947 [SPARK-31052][TEST][CORE] Fix flaky test "DAGSchedulerSuite.shuffle fetch failed on speculative task, but original task succeed" add 515eb9d [SPARK-31013][CORE][WEBUI] InMemoryStore: improve removeAllByIndexValues over natural key index No new revisions were added by this update. Summary of changes: .../org/apache/spark/util/kvstore/InMemoryStore.java | 16 +++- .../apache/spark/util/kvstore/InMemoryStoreSuite.java | 18 -- 2 files changed, 23 insertions(+), 11 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31052][TEST][CORE] Fix flaky test "DAGSchedulerSuite.shuffle fetch failed on speculative task, but original task succeed"
This is an automated email from the ASF dual-hosted git repository. jiangxb1987 pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 80a8947 [SPARK-31052][TEST][CORE] Fix flaky test "DAGSchedulerSuite.shuffle fetch failed on speculative task, but original task succeed" 80a8947 is described below commit 80a894785121777aedfe340d0acacbbd17630cb3 Author: yi.wu AuthorDate: Thu Mar 5 10:56:49 2020 -0800 [SPARK-31052][TEST][CORE] Fix flaky test "DAGSchedulerSuite.shuffle fetch failed on speculative task, but original task succeed" ### What changes were proposed in this pull request? This PR fix the flaky test in #27050. ### Why are the changes needed? `SparkListenerStageCompleted` is posted by `listenerBus` asynchronously. So, we should make sure listener has consumed the event before asserting completed stages. See [error message](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/119308/testReport/org.apache.spark.scheduler/DAGSchedulerSuite/shuffle_fetch_failed_on_speculative_task__but_original_task_succeed__SPARK_30388_/): ``` sbt.ForkMain$ForkError: org.scalatest.exceptions.TestFailedException: List(0, 1, 1) did not equal List(0, 1, 1, 0) at org.scalatest.Assertions.newAssertionFailedException(Assertions.scala:530) at org.scalatest.Assertions.newAssertionFailedException$(Assertions.scala:529) at org.scalatest.FunSuite.newAssertionFailedException(FunSuite.scala:1560) at org.scalatest.Assertions$AssertionsHelper.macroAssert(Assertions.scala:503) at org.apache.spark.scheduler.DAGSchedulerSuite.$anonfun$new$88(DAGSchedulerSuite.scala:1976) ``` ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Update test and test locally by no failure after running hundreds of times. Note, the failure is easy to reproduce when loop running the test for hundreds of times(e.g 200) Closes #27809 from Ngone51/fix_flaky_spark_30388. 
Authored-by: yi.wu Signed-off-by: Xingbo Jiang (cherry picked from commit 8d5ef2f766166cce3cc7a15a98ec016050ede4d8) Signed-off-by: Xingbo Jiang --- .../test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala| 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 72a2e4c..02bc216 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -1931,7 +1931,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi assertDataStructuresEmpty() } - test("shuffle fetch failed on speculative task, but original task succeed (SPARK-30388)") { + test("SPARK-30388: shuffle fetch failed on speculative task, but original task succeed") { var completedStage: List[Int] = Nil val listener = new SparkListener() { override def onStageCompleted(event: SparkListenerStageCompleted): Unit = { @@ -1945,6 +1945,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi val reduceRdd = new MyRDD(sc, 2, List(shuffleDep)) submit(reduceRdd, Array(0, 1)) completeShuffleMapStageSuccessfully(0, 0, 2) +sc.listenerBus.waitUntilEmpty() assert(completedStage === List(0)) // result task 0.0 succeed @@ -1960,6 +1961,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi info ) ) +sc.listenerBus.waitUntilEmpty() assert(completedStage === List(0, 1)) Thread.sleep(DAGScheduler.RESUBMIT_TIMEOUT * 2) @@ -1971,6 +1973,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // original result task 1.0 succeed runEvent(makeCompletionEvent(taskSets(1).tasks(1), Success, 42)) +sc.listenerBus.waitUntilEmpty() assert(completedStage === List(0, 1, 1, 0)) assert(scheduler.activeJobs.isEmpty) } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-31052][TEST][CORE] Fix flaky test "DAGSchedulerSuite.shuffle fetch failed on speculative task, but original task succeed"
This is an automated email from the ASF dual-hosted git repository. jiangxb1987 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 8d5ef2f [SPARK-31052][TEST][CORE] Fix flaky test "DAGSchedulerSuite.shuffle fetch failed on speculative task, but original task succeed" 8d5ef2f is described below commit 8d5ef2f766166cce3cc7a15a98ec016050ede4d8 Author: yi.wu AuthorDate: Thu Mar 5 10:56:49 2020 -0800 [SPARK-31052][TEST][CORE] Fix flaky test "DAGSchedulerSuite.shuffle fetch failed on speculative task, but original task succeed" ### What changes were proposed in this pull request? This PR fix the flaky test in #27050. ### Why are the changes needed? `SparkListenerStageCompleted` is posted by `listenerBus` asynchronously. So, we should make sure listener has consumed the event before asserting completed stages. See [error message](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/119308/testReport/org.apache.spark.scheduler/DAGSchedulerSuite/shuffle_fetch_failed_on_speculative_task__but_original_task_succeed__SPARK_30388_/): ``` sbt.ForkMain$ForkError: org.scalatest.exceptions.TestFailedException: List(0, 1, 1) did not equal List(0, 1, 1, 0) at org.scalatest.Assertions.newAssertionFailedException(Assertions.scala:530) at org.scalatest.Assertions.newAssertionFailedException$(Assertions.scala:529) at org.scalatest.FunSuite.newAssertionFailedException(FunSuite.scala:1560) at org.scalatest.Assertions$AssertionsHelper.macroAssert(Assertions.scala:503) at org.apache.spark.scheduler.DAGSchedulerSuite.$anonfun$new$88(DAGSchedulerSuite.scala:1976) ``` ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Update test and test locally by no failure after running hundreds of times. Note, the failure is easy to reproduce when loop running the test for hundreds of times(e.g 200) Closes #27809 from Ngone51/fix_flaky_spark_30388. 
Authored-by: yi.wu Signed-off-by: Xingbo Jiang --- .../test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala| 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 2b2fd32..4486389 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -1933,7 +1933,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi assertDataStructuresEmpty() } - test("shuffle fetch failed on speculative task, but original task succeed (SPARK-30388)") { + test("SPARK-30388: shuffle fetch failed on speculative task, but original task succeed") { var completedStage: List[Int] = Nil val listener = new SparkListener() { override def onStageCompleted(event: SparkListenerStageCompleted): Unit = { @@ -1947,6 +1947,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi val reduceRdd = new MyRDD(sc, 2, List(shuffleDep)) submit(reduceRdd, Array(0, 1)) completeShuffleMapStageSuccessfully(0, 0, 2) +sc.listenerBus.waitUntilEmpty() assert(completedStage === List(0)) // result task 0.0 succeed @@ -1962,6 +1963,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi info ) ) +sc.listenerBus.waitUntilEmpty() assert(completedStage === List(0, 1)) Thread.sleep(DAGScheduler.RESUBMIT_TIMEOUT * 2) @@ -1973,6 +1975,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // original result task 1.0 succeed runEvent(makeCompletionEvent(taskSets(1).tasks(1), Success, 42)) +sc.listenerBus.waitUntilEmpty() assert(completedStage === List(0, 1, 1, 0)) assert(scheduler.activeJobs.isEmpty) } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
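The pattern behind the fix, stripped of the DAGScheduler specifics, looks roughly like the following. This is a sketch that assumes a running SparkContext `sc` and, like the suite, code living in an `org.apache.spark` package (since `listenerBus` is `private[spark]`):

```scala
import org.apache.spark.scheduler.{SparkListener, SparkListenerStageCompleted}

// Record completed stage ids via a listener. Listener events are delivered asynchronously
// on the listener bus, so the bus must be drained before asserting on the recorded state.
var completedStages: List[Int] = Nil
sc.addSparkListener(new SparkListener {
  override def onStageCompleted(event: SparkListenerStageCompleted): Unit = {
    completedStages :+= event.stageInfo.stageId
  }
})

sc.parallelize(1 to 10).count()    // run a small job that completes at least one stage
sc.listenerBus.waitUntilEmpty()    // flush pending events -- the step the flaky test was missing
assert(completedStages.nonEmpty)
```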
[spark] branch branch-3.0 updated: [SPARK-31037][SQL] refine AQE config names
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 46b7f17 [SPARK-31037][SQL] refine AQE config names 46b7f17 is described below commit 46b7f1796bd0b96977ce9b473601033f397a3b18 Author: Wenchen Fan AuthorDate: Fri Mar 6 00:46:34 2020 +0800 [SPARK-31037][SQL] refine AQE config names When introducing AQE to others, I feel the config names are a bit incoherent and hard to use. This PR refines the config names: 1. remove the "shuffle" prefix. AQE is all about shuffle and we don't need to add the "shuffle" prefix everywhere. 2. `targetPostShuffleInputSize` is obscure, rename to `advisoryShufflePartitionSizeInBytes`. 3. `reducePostShufflePartitions` doesn't match the actual optimization, rename to `coalesceShufflePartitions` 4. `minNumPostShufflePartitions` is obscure, rename it `minPartitionNum` under the `coalesceShufflePartitions` namespace 5. `maxNumPostShufflePartitions` is confusing with the word "max", rename it `initialPartitionNum` 6. `skewedJoinOptimization` is too verbose. skew join is a well-known terminology in database area, we can just say `skewJoin` Make the config names easy to understand. deprecate the config `spark.sql.adaptive.shuffle.targetPostShuffleInputSize` N/A Closes #27793 from cloud-fan/aqe. Authored-by: Wenchen Fan Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/internal/SQLConf.scala| 142 +++-- .../spark/sql/execution/ShuffledRowRDD.scala | 2 +- .../execution/adaptive/AdaptiveSparkPlanExec.scala | 2 +- ...tions.scala => CoalesceShufflePartitions.scala} | 14 +- .../execution/adaptive/OptimizeSkewedJoin.scala| 14 +- .../execution/exchange/EnsureRequirements.scala| 4 +- scala => CoalesceShufflePartitionsSuite.scala} | 12 +- .../adaptive/AdaptiveQueryExecSuite.scala | 8 +- .../apache/spark/sql/internal/SQLConfSuite.scala | 26 ++-- .../spark/sql/sources/BucketedReadSuite.scala | 2 +- 10 files changed, 117 insertions(+), 109 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 3dbfc65..b2b3d12 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -356,8 +356,16 @@ object SQLConf { .checkValue(_ > 0, "The value of spark.sql.shuffle.partitions must be positive") .createWithDefault(200) + val SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE = +buildConf("spark.sql.adaptive.shuffle.targetPostShuffleInputSize") + .internal() + .doc("(Deprecated since Spark 3.0)") + .bytesConf(ByteUnit.BYTE) + .createWithDefaultString("64MB") + val ADAPTIVE_EXECUTION_ENABLED = buildConf("spark.sql.adaptive.enabled") -.doc("When true, enable adaptive query execution.") +.doc("When true, enable adaptive query execution, which re-optimizes the query plan in the " + + "middle of query execution, based on accurate runtime statistics.") .booleanConf .createWithDefault(false) @@ -365,90 +373,90 @@ object SQLConf { .internal() .doc("Adaptive query execution is skipped when the query does not have exchanges or " + "sub-queries. 
By setting this config to true (together with " + - s"'${ADAPTIVE_EXECUTION_ENABLED.key}' enabled), Spark will force apply adaptive query " + + s"'${ADAPTIVE_EXECUTION_ENABLED.key}' set to true), Spark will force apply adaptive query " + "execution for all supported queries.") .booleanConf .createWithDefault(false) - val REDUCE_POST_SHUFFLE_PARTITIONS_ENABLED = -buildConf("spark.sql.adaptive.shuffle.reducePostShufflePartitions") - .doc(s"When true and '${ADAPTIVE_EXECUTION_ENABLED.key}' is enabled, this enables reducing " + -"the number of post-shuffle partitions based on map output statistics.") - .booleanConf - .createWithDefault(true) + val ADVISORY_PARTITION_SIZE_IN_BYTES = +buildConf("spark.sql.adaptive.advisoryPartitionSizeInBytes") + .doc("The advisory size in bytes of the shuffle partition during adaptive optimization " + +s"(when ${ADAPTIVE_EXECUTION_ENABLED.key} is true). It takes effect when Spark " + +"coalesces small shuffle partitions or splits skewed shuffle partition.") + .fallbackConf(SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE) - val FETCH_SHUFFLE_BLOCKS_IN_BATCH_ENABLED = -buildConf("spark.sql.adaptive.shuffle.fetchShuffleBlocksInBatch") - .doc("Whether to fetch the continuous shuffle blocks in batch. Instead of fetching blocks " + -"one by one, fetching continuous shuffle
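In user code the renaming shows up only through the config keys. A short sketch using just the names that appear in this commit, assuming a SparkSession `spark`:

```scala
// Enable adaptive execution and set the advisory partition size under its new,
// un-prefixed name.
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.adaptive.advisoryPartitionSizeInBytes", "64MB")

// The old key is kept only as a deprecated fallback for the one above:
// spark.conf.set("spark.sql.adaptive.shuffle.targetPostShuffleInputSize", "64MB")
```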
[spark] branch master updated (2257ce2 -> ba86524)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 2257ce2 [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group add ba86524 [SPARK-31037][SQL] refine AQE config names No new revisions were added by this update. Summary of changes: .../org/apache/spark/sql/internal/SQLConf.scala| 149 +++-- .../spark/sql/execution/ShuffledRowRDD.scala | 2 +- .../execution/adaptive/AdaptiveSparkPlanExec.scala | 2 +- ...tions.scala => CoalesceShufflePartitions.scala} | 14 +- .../execution/adaptive/OptimizeSkewedJoin.scala| 14 +- .../execution/exchange/EnsureRequirements.scala| 4 +- scala => CoalesceShufflePartitionsSuite.scala} | 12 +- .../adaptive/AdaptiveQueryExecSuite.scala | 8 +- .../apache/spark/sql/internal/SQLConfSuite.scala | 26 ++-- .../spark/sql/sources/BucketedReadSuite.scala | 2 +- 10 files changed, 121 insertions(+), 112 deletions(-) rename sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/{ReduceNumShufflePartitions.scala => CoalesceShufflePartitions.scala} (90%) rename sql/core/src/test/scala/org/apache/spark/sql/execution/{ReduceNumShufflePartitionsSuite.scala => CoalesceShufflePartitionsSuite.scala} (96%) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new f34898c [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group f34898c is described below commit f34898c5e19c9a35c091eded9652cd5e3d661d19 Author: yi.wu AuthorDate: Thu Mar 5 21:31:26 2020 +0800 [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group ### What changes were proposed in this pull request? This is a bug fix of #27280. This PR fix the bug where `ShuffleBlockFetcherIterator` may forget to create request for the last block group. ### Why are the changes needed? When (all blocks).sum < `targetRemoteRequestSize` and (all blocks).length > `maxBlocksInFlightPerAddress` and (last block group).size < `maxBlocksInFlightPerAddress`, `ShuffleBlockFetcherIterator` will not create a request for the last group. Thus, it will lost data for the reduce task. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Updated test. Closes #27786 from Ngone51/fix_no_request_bug. Authored-by: yi.wu Signed-off-by: Wenchen Fan (cherry picked from commit 2257ce24437f05c417821c02e3e44c77c93f7211) Signed-off-by: Wenchen Fan --- .../storage/ShuffleBlockFetcherIterator.scala | 8 +- .../storage/ShuffleBlockFetcherIteratorSuite.scala | 91 ++ 2 files changed, 78 insertions(+), 21 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala index cd4c860..2a0447d 100644 --- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala @@ -339,14 +339,14 @@ final class ShuffleBlockFetcherIterator( + s"with ${blocks.size} blocks") } -def createFetchRequests(): Unit = { +def createFetchRequests(isLast: Boolean): Unit = { val mergedBlocks = mergeContinuousShuffleBlockIdsIfNeeded(curBlocks) curBlocks = new ArrayBuffer[FetchBlockInfo] if (mergedBlocks.length <= maxBlocksInFlightPerAddress) { createFetchRequest(mergedBlocks) } else { mergedBlocks.grouped(maxBlocksInFlightPerAddress).foreach { blocks => - if (blocks.length == maxBlocksInFlightPerAddress) { + if (blocks.length == maxBlocksInFlightPerAddress || isLast) { createFetchRequest(blocks) } else { // The last group does not exceed `maxBlocksInFlightPerAddress`. Put it back @@ -367,12 +367,12 @@ final class ShuffleBlockFetcherIterator( // For batch fetch, the actual block in flight should count for merged block. 
val mayExceedsMaxBlocks = !doBatchFetch && curBlocks.size >= maxBlocksInFlightPerAddress if (curRequestSize >= targetRemoteRequestSize || mayExceedsMaxBlocks) { -createFetchRequests() +createFetchRequests(isLast = false) } } // Add in the final request if (curBlocks.nonEmpty) { - createFetchRequests() + createFetchRequests(isLast = true) } } diff --git a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala index 2090a51..773629c 100644 --- a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala @@ -433,32 +433,86 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT assert(blockManager.hostLocalDirManager.get.getCachedHostLocalDirs().size === 1) } - test("fetch continuous blocks in batch respects maxSize and maxBlocks") { + test("fetch continuous blocks in batch should respect maxBytesInFlight") { val blockManager = mock(classOf[BlockManager]) val localBmId = BlockManagerId("test-client", "test-local-host", 1) doReturn(localBmId).when(blockManager).blockManagerId // Make sure remote blocks would return the merged block -val remoteBmId = BlockManagerId("test-client-1", "test-client-1", 2) -val remoteBlocks = Seq[BlockId]( +val remoteBmId1 = BlockManagerId("test-client-1", "test-client-1", 1) +val remoteBmId2 = BlockManagerId("test-client-2", "test-client-2", 2) +val remoteBlocks1 = (0 until 15).map(ShuffleBlockId(0, 3, _)) +val remoteBlocks2 = Seq[BlockId](ShuffleBlockId(0, 4, 0), ShuffleBlockId(0, 4, 1)) +val mergedRemoteBlocks = Map[BlockId, ManagedBuffer]( + ShuffleBlockBatchId(0, 3, 0,
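The condition being fixed is easier to see in isolation. Below is a toy model of the per-address grouping decision, with made-up names, not the actual iterator code:

```scala
// Split one address's blocks into requests of at most maxPerAddress blocks. A trailing
// group smaller than the limit is normally deferred to a later round; on the final round
// (isLast) it must still be sent -- dropping it was the bug fixed above.
def splitIntoRequests[B](blocks: Seq[B], maxPerAddress: Int, isLast: Boolean): (Seq[Seq[B]], Seq[B]) = {
  val groups = blocks.grouped(maxPerAddress).toSeq
  val (send, defer) = groups.partition(g => g.length == maxPerAddress || isLast)
  (send, defer.flatten)
}

splitIntoRequests(1 to 15, maxPerAddress = 10, isLast = false) // 1 request sent, 5 blocks deferred
splitIntoRequests(1 to 15, maxPerAddress = 10, isLast = true)  // 2 requests sent, nothing deferred
```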
[spark] branch master updated (1fd9a91 -> 2257ce2)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 1fd9a91 [SPARK-31005][SQL] Support time zone ids in casting strings to timestamps add 2257ce2 [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group No new revisions were added by this update. Summary of changes: .../storage/ShuffleBlockFetcherIterator.scala | 8 +- .../storage/ShuffleBlockFetcherIteratorSuite.scala | 91 ++ 2 files changed, 78 insertions(+), 21 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31005][SQL] Support time zone ids in casting strings to timestamps
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 2ff711c [SPARK-31005][SQL] Support time zone ids in casting strings to timestamps 2ff711c is described below commit 2ff711ca4f17004abf44186d24fbb0f35fa6336d Author: Maxim Gekk AuthorDate: Thu Mar 5 20:49:43 2020 +0800 [SPARK-31005][SQL] Support time zone ids in casting strings to timestamps ### What changes were proposed in this pull request? In the PR, I propose to change `DateTimeUtils.stringToTimestamp` to support any valid time zone id at the end of input string. After the changes, the function accepts zone ids in the formats: - no zone id. In that case, the function uses the local session time zone from the SQL config `spark.sql.session.timeZone` - -[h]h:[m]m - +[h]h:[m]m - Z - Short zone id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS - Zone ID starts with 'UTC+', 'UTC-', 'GMT+', 'GMT-', 'UT+' or 'UT-'. The ID is split in two, with a two or three letter prefix and a suffix starting with the sign. The suffix must be in the formats: - +|-h[h] - +|-hh[:]mm - +|-hh:mm:ss - +|-hhmmss - Region-based zone IDs in the form `{area}/{city}`, such as `Europe/Paris` or `America/New_York`. The default set of region ids is supplied by the IANA Time Zone Database (TZDB). ### Why are the changes needed? - To use `stringToTimestamp` as a substitution of removed `stringToTime`, see https://github.com/apache/spark/pull/27710#discussion_r385020173 - Improve UX of Spark SQL by allowing flexible formats of zone ids. Currently, Spark accepts only `Z` and zone offsets that can be inconvenient when a time zone offset is shifted due to daylight saving rules. For instance: ```sql spark-sql> select cast('2015-03-18T12:03:17.123456 Europe/Moscow' as timestamp); NULL ``` ### Does this PR introduce any user-facing change? Yes. After the changes, casting strings to timestamps allows time zone id at the end of the strings: ```sql spark-sql> select cast('2015-03-18T12:03:17.123456 Europe/Moscow' as timestamp); 2015-03-18 12:03:17.123456 ``` ### How was this patch tested? - Added new test cases to the `string to timestamp` test in `DateTimeUtilsSuite`. - Run `CastSuite` and `AnsiCastSuite`. Closes #27753 from MaxGekk/stringToTimestamp-uni-zoneId. 
Authored-by: Maxim Gekk Signed-off-by: Wenchen Fan (cherry picked from commit 1fd9a91c662a368e348ef96604a79929f814041c) Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/util/DateTimeUtils.scala| 57 +++--- .../sql/catalyst/util/DateTimeUtilsSuite.scala | 23 +++-- 2 files changed, 48 insertions(+), 32 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 448e4b3..6811bfc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -187,28 +187,28 @@ object DateTimeUtils { * `-[m]m` * `-[m]m-[d]d` * `-[m]m-[d]d ` - * `-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]` - * `-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z` - * `-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m` - * `-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m` - * `-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]` - * `-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z` - * `-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m` - * `-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m` - * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]` - * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z` - * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m` - * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m` - * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]` - * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z` - * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m` - * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m` + * `-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * + * where `zone_id` should have one of the forms: + * - Z - Zulu time zone UTC+0 + * - +|-[h]h:[m]m + * - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS + * - An id with one of the prefixes UT
[spark] branch master updated: [SPARK-31005][SQL] Support time zone ids in casting strings to timestamps
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 1fd9a91 [SPARK-31005][SQL] Support time zone ids in casting strings to timestamps 1fd9a91 is described below commit 1fd9a91c662a368e348ef96604a79929f814041c Author: Maxim Gekk AuthorDate: Thu Mar 5 20:49:43 2020 +0800 [SPARK-31005][SQL] Support time zone ids in casting strings to timestamps ### What changes were proposed in this pull request? In the PR, I propose to change `DateTimeUtils.stringToTimestamp` to support any valid time zone id at the end of input string. After the changes, the function accepts zone ids in the formats: - no zone id. In that case, the function uses the local session time zone from the SQL config `spark.sql.session.timeZone` - -[h]h:[m]m - +[h]h:[m]m - Z - Short zone id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS - Zone ID starts with 'UTC+', 'UTC-', 'GMT+', 'GMT-', 'UT+' or 'UT-'. The ID is split in two, with a two or three letter prefix and a suffix starting with the sign. The suffix must be in the formats: - +|-h[h] - +|-hh[:]mm - +|-hh:mm:ss - +|-hhmmss - Region-based zone IDs in the form `{area}/{city}`, such as `Europe/Paris` or `America/New_York`. The default set of region ids is supplied by the IANA Time Zone Database (TZDB). ### Why are the changes needed? - To use `stringToTimestamp` as a substitution of removed `stringToTime`, see https://github.com/apache/spark/pull/27710#discussion_r385020173 - Improve UX of Spark SQL by allowing flexible formats of zone ids. Currently, Spark accepts only `Z` and zone offsets that can be inconvenient when a time zone offset is shifted due to daylight saving rules. For instance: ```sql spark-sql> select cast('2015-03-18T12:03:17.123456 Europe/Moscow' as timestamp); NULL ``` ### Does this PR introduce any user-facing change? Yes. After the changes, casting strings to timestamps allows time zone id at the end of the strings: ```sql spark-sql> select cast('2015-03-18T12:03:17.123456 Europe/Moscow' as timestamp); 2015-03-18 12:03:17.123456 ``` ### How was this patch tested? - Added new test cases to the `string to timestamp` test in `DateTimeUtilsSuite`. - Run `CastSuite` and `AnsiCastSuite`. Closes #27753 from MaxGekk/stringToTimestamp-uni-zoneId. 
Authored-by: Maxim Gekk Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/util/DateTimeUtils.scala| 57 +++--- .../sql/catalyst/util/DateTimeUtilsSuite.scala | 23 +++-- 2 files changed, 48 insertions(+), 32 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 6b252ec..3a038a4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -185,28 +185,28 @@ object DateTimeUtils { * `-[m]m` * `-[m]m-[d]d` * `-[m]m-[d]d ` - * `-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]` - * `-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z` - * `-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m` - * `-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m` - * `-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]` - * `-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z` - * `-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m` - * `-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m` - * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]` - * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z` - * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m` - * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m` - * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]` - * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z` - * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m` - * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m` + * `-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * + * where `zone_id` should have one of the forms: + * - Z - Zulu time zone UTC+0 + * - +|-[h]h:[m]m + * - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS + * - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-, + * and a suffix in the formats: + * - +|-h[h] + * - +|-hh[:]
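The same behavior from the Scala side, for completeness; a sketch assuming a SparkSession `spark` on a build with this change applied:

```scala
// Region-based zone ids at the end of the input string are now accepted when casting
// strings to timestamps.
spark.sql("SELECT CAST('2015-03-18T12:03:17.123456 Europe/Moscow' AS TIMESTAMP)").show(false)

// Zone offsets and 'Z' keep working as before.
spark.sql("SELECT CAST('2015-03-18T12:03:17.123456+03:00' AS TIMESTAMP)").show(false)
```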
[spark] branch branch-3.0 updated: fix merge mistakes
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 49c07b7 fix merge mistakes 49c07b7 is described below commit 49c07b7335a89533bc25d6ef45f7877b43b6a98d Author: Wenchen Fan AuthorDate: Thu Mar 5 20:26:14 2020 +0800 fix merge mistakes --- sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 1 - .../apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala| 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index cc9c2ae..3dbfc65 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2112,7 +2112,6 @@ object SQLConf { "MapFromEntries, StringToMap, MapConcat and TransformKeys. When EXCEPTION, the query " + "fails if duplicated map keys are detected. When LAST_WIN, the map key that is inserted " + "at last takes precedence.") -.version("3.0.0") .stringConf .transform(_.toUpperCase(Locale.ROOT)) .checkValues(MapKeyDedupPolicy.values.map(_.toString)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index e5f1fa6..500b6cc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -609,7 +609,7 @@ class AdaptiveQueryExecSuite withSQLConf( SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", - SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZEDateFormatter.key -> "2000") { + SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key -> "2000") { withTempView("skewData1", "skewData2") { spark .range(0, 1000, 1, 10) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31038][SQL] Add checkValue for spark.sql.session.timeZone
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 1c165ee [SPARK-31038][SQL] Add checkValue for spark.sql.session.timeZone 1c165ee is described below commit 1c165eecd3948dc12db19827dfd326d32eb4e475 Author: Kent Yao AuthorDate: Thu Mar 5 19:38:20 2020 +0800 [SPARK-31038][SQL] Add checkValue for spark.sql.session.timeZone ### What changes were proposed in this pull request? The `spark.sql.session.timeZone` config can accept any string value including invalid time zone ids, then it will fail other queries that rely on the time zone. We should do the value checking in the set phase and fail fast if the zone value is invalid. ### Why are the changes needed? improve configuration ### Does this PR introduce any user-facing change? yes, will fail fast if the value is a wrong timezone id ### How was this patch tested? add ut Closes #27792 from yaooqinn/SPARK-31038. Authored-by: Kent Yao Signed-off-by: Wenchen Fan --- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 8 .../org/apache/spark/sql/internal/SQLConfSuite.scala | 19 +++ 2 files changed, 27 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 68a89b2..cc9c2ae 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -24,6 +24,7 @@ import java.util.zip.Deflater import scala.collection.JavaConverters._ import scala.collection.immutable +import scala.util.Try import scala.util.matching.Regex import org.apache.hadoop.fs.Path @@ -38,6 +39,7 @@ import org.apache.spark.sql.catalyst.analysis.{HintErrorLogger, Resolver} import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode import org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator import org.apache.spark.sql.catalyst.plans.logical.HintErrorHandler +import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.util.Utils @@ -1483,10 +1485,16 @@ object SQLConf { .doubleConf .createWithDefault(0.9) + private def isValidTimezone(zone: String): Boolean = { +Try { DateTimeUtils.getZoneId(zone) }.isSuccess + } + val SESSION_LOCAL_TIMEZONE = buildConf("spark.sql.session.timeZone") .doc("""The ID of session local timezone, e.g. 
"GMT", "America/Los_Angeles", etc.""") .stringConf + .checkValue(isValidTimezone, s"Cannot resolve the given timezone with" + +" ZoneId.of(_, ZoneId.SHORT_IDS)") .createWithDefaultFunction(() => TimeZone.getDefault.getID) val WINDOW_EXEC_BUFFER_IN_MEMORY_THRESHOLD = diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala index 61be367..48be211 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala @@ -348,4 +348,23 @@ class SQLConfSuite extends QueryTest with SharedSparkSession { } check(config2) } + + test("spark.sql.session.timeZone should only accept valid zone id") { +spark.conf.set(SQLConf.SESSION_LOCAL_TIMEZONE.key, "MIT") +assert(sql(s"set ${SQLConf.SESSION_LOCAL_TIMEZONE.key}").head().getString(1) === "MIT") +spark.conf.set(SQLConf.SESSION_LOCAL_TIMEZONE.key, "America/Chicago") +assert(sql(s"set ${SQLConf.SESSION_LOCAL_TIMEZONE.key}").head().getString(1) === + "America/Chicago") + +intercept[IllegalArgumentException] { + spark.conf.set(SQLConf.SESSION_LOCAL_TIMEZONE.key, "pst") +} +intercept[IllegalArgumentException] { + spark.conf.set(SQLConf.SESSION_LOCAL_TIMEZONE.key, "GMT+8:00") +} +val e = intercept[IllegalArgumentException] { + spark.conf.set(SQLConf.SESSION_LOCAL_TIMEZONE.key, "Asia/shanghai") +} +assert(e.getMessage === "Cannot resolve the given timezone with ZoneId.of(_, ZoneId.SHORT_IDS)") + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31019][SQL] make it clear that people can deduplicate map keys
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new daa140d [SPARK-31019][SQL] make it clear that people can deduplicate map keys daa140d is described below commit daa140df726f8521ae7bbaf4586277ae4f4aea7c Author: Wenchen Fan AuthorDate: Thu Mar 5 20:43:52 2020 +0900 [SPARK-31019][SQL] make it clear that people can deduplicate map keys rename the config and make it non-internal. Now we fail the query if duplicated map keys are detected, and provide a legacy config to deduplicate it. However, we must provide a way to get users out of this situation, instead of just rejecting to run the query. This exit strategy should always be there, while legacy config indicates that it may be removed someday. no, just rename a config which was added in 3.0 add more tests for the fail behavior. Closes #27772 from cloud-fan/map. Authored-by: Wenchen Fan Signed-off-by: HyukjinKwon --- docs/sql-migration-guide.md| 2 +- .../sql/catalyst/util/ArrayBasedMapBuilder.scala | 20 +++--- .../org/apache/spark/sql/internal/SQLConf.scala| 25 +--- .../expressions/CollectionExpressionsSuite.scala | 14 - .../catalyst/expressions/ComplexTypeSuite.scala| 15 - .../expressions/HigherOrderFunctionsSuite.scala| 5 +- .../catalyst/util/ArrayBasedMapBuilderSuite.scala | 71 ++ .../apache/spark/sql/DataFrameFunctionsSuite.scala | 13 +++- 8 files changed, 112 insertions(+), 53 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 0050061..6c73038 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -49,7 +49,7 @@ license: | - In Spark version 2.4 and earlier, float/double -0.0 is semantically equal to 0.0, but -0.0 and 0.0 are considered as different values when used in aggregate grouping keys, window partition keys and join keys. Since Spark 3.0, this bug is fixed. For example, `Seq(-0.0, 0.0).toDF("d").groupBy("d").count()` returns `[(0.0, 2)]` in Spark 3.0, and `[(0.0, 1), (-0.0, 1)]` in Spark 2.4 and earlier. - - In Spark version 2.4 and earlier, users can create a map with duplicated keys via built-in functions like `CreateMap`, `StringToMap`, etc. The behavior of map with duplicated keys is undefined, e.g. map look up respects the duplicated key appears first, `Dataset.collect` only keeps the duplicated key appears last, `MapKeys` returns duplicated keys, etc. Since Spark 3.0, new config `spark.sql.legacy.allowDuplicatedMapKeys` was added, with the default value `false`, Spark will throw Ru [...] + - In Spark version 2.4 and earlier, users can create a map with duplicated keys via built-in functions like `CreateMap`, `StringToMap`, etc. The behavior of map with duplicated keys is undefined, e.g. map look up respects the duplicated key appears first, `Dataset.collect` only keeps the duplicated key appears last, `MapKeys` returns duplicated keys, etc. Since Spark 3.0, Spark will throw RuntimeException while duplicated keys are found. Users can set `spark.sql.mapKeyDedupPolicy` to L [...] - In Spark version 2.4 and earlier, partition column value is converted as null if it can't be casted to corresponding user provided schema. Since 3.0, partition column value is validated with user provided schema. An exception is thrown if the validation fails. You can disable such validation by setting `spark.sql.sources.validatePartitionColumns` to `false`. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala index 40e75b5..0185b57 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala @@ -21,7 +21,6 @@ import scala.collection.mutable import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.SQLConf.LEGACY_ALLOW_DUPLICATED_MAP_KEY import org.apache.spark.sql.types._ import org.apache.spark.unsafe.array.ByteArrayMethods @@ -49,8 +48,7 @@ class ArrayBasedMapBuilder(keyType: DataType, valueType: DataType) extends Seria private lazy val keyGetter = InternalRow.getAccessor(keyType) private lazy val valueGetter = InternalRow.getAccessor(valueType) - private val allowDuplicatedMapKey = -SQLConf.get.getConf(LEGACY_ALLOW_DUPLICATED_MAP_KEY) + private val mapKeyDedupPolicy = SQLConf.get.getConf(SQLConf.MAP_KEY_DEDUP_POLICY) def put(key: Any, value: Any): Unit = { if (key == null) { @@ -67,13 +65,17 @@ class ArrayBasedMapBuil
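As a quick illustration of the behavior change this commit describes, here is a minimal sketch. The session-builder boilerplate and the sample query are mine; the config name `spark.sql.mapKeyDedupPolicy` and the fail-by-default RuntimeException come from the migration-guide text above, while the `LAST_WIN` value is my reading of the 3.0 config and should be treated as an assumption.

```
// Sketch only: Spark 3.0 duplicate-map-key behavior described in SPARK-31019.
import org.apache.spark.sql.SparkSession

object MapKeyDedupSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("map-key-dedup").getOrCreate()

    // With the default policy, building a map with a duplicated key is expected to fail at
    // runtime rather than silently keeping the first or last value:
    //   spark.sql("SELECT map('a', 1, 'a', 2)").collect()   // RuntimeException about duplicate key 'a'

    // Opting into deduplication (assumed policy value LAST_WIN: later entries win).
    spark.conf.set("spark.sql.mapKeyDedupPolicy", "LAST_WIN")
    spark.sql("SELECT map('a', 1, 'a', 2) AS m").show(false)  // expect a single-entry map: 'a' -> 2

    spark.stop()
  }
}
```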
[spark] branch master updated (f45ae7f -> 807ea41)
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

    from f45ae7f  [SPARK-31038][SQL] Add checkValue for spark.sql.session.timeZone
     add 807ea41  [SPARK-31019][SQL] make it clear that people can deduplicate map keys

No new revisions were added by this update.

Summary of changes:
 docs/sql-migration-guide.md                        | 2 +-
 .../sql/catalyst/util/ArrayBasedMapBuilder.scala   | 20 +++---
 .../org/apache/spark/sql/internal/SQLConf.scala    | 26
 .../expressions/CollectionExpressionsSuite.scala   | 14 -
 .../catalyst/expressions/ComplexTypeSuite.scala    | 15 -
 .../expressions/HigherOrderFunctionsSuite.scala    | 5 +-
 .../catalyst/util/ArrayBasedMapBuilderSuite.scala  | 71 ++
 .../apache/spark/sql/DataFrameFunctionsSuite.scala | 13 +++-
 8 files changed, 112 insertions(+), 54 deletions(-)

- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (9b602e2 -> f45ae7f)
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

    from 9b602e2  [SPARK-31046][SQL] Make more efficient and clean up AQE update UI code
     add f45ae7f  [SPARK-31038][SQL] Add checkValue for spark.sql.session.timeZone

No new revisions were added by this update.

Summary of changes:
 .../scala/org/apache/spark/sql/internal/SQLConf.scala | 8
 .../org/apache/spark/sql/internal/SQLConfSuite.scala  | 19 +++
 2 files changed, 27 insertions(+)

- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
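A small sketch of what a value check on `spark.sql.session.timeZone` means in practice. The invalid zone ID and the exact exception type are assumptions on my part; the point is that a malformed value should now be rejected when the config is set rather than surfacing later in query execution.

```
// Sketch only: setting spark.sql.session.timeZone with valid and (assumed) invalid zone IDs.
import org.apache.spark.sql.SparkSession

object SessionTimeZoneSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("session-tz").getOrCreate()

    // Region-based zone IDs keep working as before.
    spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
    println(spark.conf.get("spark.sql.session.timeZone"))

    // With a checkValue in place, an unresolvable ID like this one should fail fast here
    // (exception type assumed; likely an IllegalArgumentException from the conf validation).
    try spark.conf.set("spark.sql.session.timeZone", "Definitely/Not_A_Zone")
    catch { case e: Exception => println(s"rejected: ${e.getMessage}") }

    spark.stop()
  }
}
```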
[spark] branch branch-3.0 updated: [SPARK-31046][SQL] Make more efficient and clean up AQE update UI code
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 2247239 [SPARK-31046][SQL] Make more efficient and clean up AQE update UI code 2247239 is described below commit 2247239dd5fef9506f025c993c43fb8365e57ffd Author: maryannxue AuthorDate: Thu Mar 5 18:53:01 2020 +0800 [SPARK-31046][SQL] Make more efficient and clean up AQE update UI code ### What changes were proposed in this pull request? This PR avoids sending redundant metrics (those that have been included in previous update) as well as useless metrics (those in future stages) to Spark UI in AQE UI metrics update. ### Why are the changes needed? This change will make UI metrics update more efficient. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Manual test in Spark UI. Closes #27799 from maryannxue/aqe-ui-cleanup. Authored-by: maryannxue Signed-off-by: Wenchen Fan (cherry picked from commit 9b602e26d2a5252623a2ed66a1a2b382665fdab4) Signed-off-by: Wenchen Fan --- .../execution/adaptive/AdaptiveSparkPlanExec.scala | 36 +- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index c018ca4..f2ebe1a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -37,7 +37,6 @@ import org.apache.spark.sql.catalyst.trees.TreeNodeTag import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec._ import org.apache.spark.sql.execution.exchange._ -import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SparkListenerSQLAdaptiveSQLMetricUpdates, SQLPlanMetric} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.ThreadUtils @@ -133,21 +132,6 @@ case class AdaptiveSparkPlanExec( executedPlan.resetMetrics() } - private def collectSQLMetrics(plan: SparkPlan): Seq[SQLMetric] = { -val metrics = new mutable.ArrayBuffer[SQLMetric]() -plan.foreach { - case p: ShuffleQueryStageExec if (p.resultOption.isEmpty) => -collectSQLMetrics(p.plan).foreach(metrics += _) - case p: BroadcastQueryStageExec if (p.resultOption.isEmpty) => -collectSQLMetrics(p.plan).foreach(metrics += _) - case p: SparkPlan => -p.metrics.foreach { case metric => - metrics += metric._2 -} -} -metrics - } - private def getFinalPhysicalPlan(): SparkPlan = lock.synchronized { if (!isFinalPlan) { // Subqueries do not have their own execution IDs and therefore rely on the main query to @@ -163,7 +147,7 @@ case class AdaptiveSparkPlanExec( currentPhysicalPlan = result.newPlan if (result.newStages.nonEmpty) { stagesToReplace = result.newStages ++ stagesToReplace - executionId.foreach(onUpdatePlan) + executionId.foreach(onUpdatePlan(_, result.newStages.map(_.plan))) // Start materialization of all new stages and fail fast if any stages failed eagerly result.newStages.foreach { stage => @@ -232,7 +216,7 @@ case class AdaptiveSparkPlanExec( // Run the final plan when there's no more unfinished stages. 
currentPhysicalPlan = applyPhysicalRules(result.newPlan, queryStageOptimizerRules) isFinalPlan = true - executionId.foreach(onUpdatePlan) + executionId.foreach(onUpdatePlan(_, Seq(currentPhysicalPlan))) logDebug(s"Final plan: $currentPhysicalPlan") } currentPhysicalPlan @@ -496,14 +480,18 @@ case class AdaptiveSparkPlanExec( /** * Notify the listeners of the physical plan change. */ - private def onUpdatePlan(executionId: Long): Unit = { + private def onUpdatePlan(executionId: Long, newSubPlans: Seq[SparkPlan]): Unit = { if (isSubquery) { // When executing subqueries, we can't update the query plan in the UI as the // UI doesn't support partial update yet. However, the subquery may have been // optimized into a different plan and we must let the UI know the SQL metrics // of the new plan nodes, so that it can track the valid accumulator updates later // and display SQL metrics correctly. - onUpdateSQLMetrics(collectSQLMetrics(currentPhysicalPlan), executionId) + val newMetrics = newSubPlans.flatMap { p => +p.flatMap(_.metrics.values.map(m => SQLPlanMetric(m.name.get, m.id, m.met
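A toy sketch of the incremental-update idea in this change, not Spark internals: the `Metric` and `PlanNode` types here are made up for illustration. On each plan update, only metrics from the sub-plans created by the latest re-optimization round are reported, instead of re-collecting every metric reachable from the current plan.

```
// Toy model of "report only the metrics of newly created sub-plans".
final case class Metric(id: Long, name: String)
final case class PlanNode(metrics: Seq[Metric], children: Seq[PlanNode]) {
  def selfAndDescendants: Seq[PlanNode] = this +: children.flatMap(_.selfAndDescendants)
}

object IncrementalMetricUpdates {
  // Old shape of the update: walk the whole current plan and resend everything.
  def allMetrics(currentPlan: PlanNode): Seq[Metric] =
    currentPlan.selfAndDescendants.flatMap(_.metrics)

  // New shape of the update: only sub-plans added by the latest round contribute, so metrics
  // already reported earlier and metrics of not-yet-created stages are skipped.
  def newMetrics(newSubPlans: Seq[PlanNode]): Seq[Metric] =
    newSubPlans.flatMap(_.selfAndDescendants.flatMap(_.metrics))
}
```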
[spark] branch master updated (66b4fd0 -> 9b602e2)
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

    from 66b4fd0  [SPARK-31024][SQL] Allow specifying session catalog name `spark_catalog` in qualified column names for v1 tables
     add 9b602e2  [SPARK-31046][SQL] Make more efficient and clean up AQE update UI code

No new revisions were added by this update.

Summary of changes:
 .../execution/adaptive/AdaptiveSparkPlanExec.scala | 36 +-
 1 file changed, 8 insertions(+), 28 deletions(-)

- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31024][SQL] Allow specifying session catalog name `spark_catalog` in qualified column names for v1 tables
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 4fcb5ae [SPARK-31024][SQL] Allow specifying session catalog name `spark_catalog` in qualified column names for v1 tables 4fcb5ae is described below commit 4fcb5ae22623de96247fcfe7341be0af7ed2471f Author: Terry Kim AuthorDate: Thu Mar 5 18:33:59 2020 +0800 [SPARK-31024][SQL] Allow specifying session catalog name `spark_catalog` in qualified column names for v1 tables ### What changes were proposed in this pull request? Currently, the user cannot specify the session catalog name (`spark_catalog`) in qualified column names for v1 tables: ``` SELECT spark_catalog.default.t.i FROM spark_catalog.default.t ``` fails with `cannot resolve 'spark_catalog.default.t.i`. This is inconsistent with v2 table behavior where catalog name can be used: ``` SELECT testcat.ns1.tbl.id FROM testcat.ns1.tbl.id ``` This PR proposes to fix the inconsistency and allow the user to specify session catalog name in column names for v1 tables. ### Why are the changes needed? Fixing an inconsistent behavior. ### Does this PR introduce any user-facing change? Yes, now the following query works: ``` SELECT spark_catalog.default.t.i FROM spark_catalog.default.t ``` ### How was this patch tested? Added new tests. Closes #27776 from imback82/spark_catalog_col_name_resolution. Authored-by: Terry Kim Signed-off-by: Wenchen Fan (cherry picked from commit 66b4fd040e97cb6de6536a5545017278879c98fb) Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/analysis/Analyzer.scala | 2 +- .../sql/catalyst/catalog/SessionCatalog.scala | 6 +- .../spark/sql/catalyst/expressions/package.scala | 102 - .../sql/catalyst/catalog/SessionCatalogSuite.scala | 6 +- .../results/columnresolution-negative.sql.out | 26 +++--- .../results/postgreSQL/create_view.sql.out | 2 +- .../sql-tests/results/postgreSQL/join.sql.out | 2 +- .../results/postgreSQL/select_having.sql.out | 2 +- .../results/postgreSQL/window_part3.sql.out| 4 +- .../results/udf/postgreSQL/udf-join.sql.out| 2 +- .../udf/postgreSQL/udf-select_having.sql.out | 2 +- .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 4 +- .../spark/sql/connector/DataSourceV2SQLSuite.scala | 8 +- .../spark/sql/hive/HiveMetastoreCatalogSuite.scala | 2 +- 14 files changed, 111 insertions(+), 59 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 254dd44..3cb754d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -935,7 +935,7 @@ class Analyzer( v1SessionCatalog.getRelation(v1Table.v1Table) case table => SubqueryAlias( -ident.asMultipartIdentifier, +catalog.name +: ident.asMultipartIdentifier, DataSourceV2Relation.create(table, Some(catalog), Some(ident))) } val key = catalog.name +: ident.namespace :+ ident.name diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index c80d9d2..3a63aff 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -38,6 +38,7 @@ import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionInfo, Im import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParserInterface} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias, View} import org.apache.spark.sql.catalyst.util.StringUtils +import org.apache.spark.sql.connector.catalog.CatalogManager import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf.GLOBAL_TEMP_DATABASE import org.apache.spark.sql.types.StructType @@ -758,6 +759,7 @@ class SessionCatalog( val name = metadata.identifier val db = formatDatabaseName(name.database.getOrElse(currentDb)) val table = formatTableName(name.table) +val multiParts = Seq(CatalogManager.SESSION_CATALOG_NAME, db, table) if (metadata.tableType == CatalogTableType.VIEW) { val viewText = metadata.viewText.getOrElse(sys.error("Invalid view without text.")) @@ -769,9 +771,9 @@ class Session
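The SQL from the commit message above can be exercised end to end with a small sketch like the following. The table name, schema, and storage format are mine; the `spark_catalog`-qualified SELECT is taken from the PR description.

```
// Sketch only: qualify a v1 table's columns with the session catalog name spark_catalog.
import org.apache.spark.sql.SparkSession

object SparkCatalogQualifiedColumnSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("spark-catalog-columns").getOrCreate()

    spark.sql("CREATE TABLE IF NOT EXISTS default.t (i INT) USING parquet")
    spark.sql("INSERT INTO default.t VALUES (1)")

    // Before this change the fully qualified column failed with
    // "cannot resolve 'spark_catalog.default.t.i'"; afterwards both queries resolve.
    spark.sql("SELECT spark_catalog.default.t.i FROM spark_catalog.default.t").show()
    spark.sql("SELECT default.t.i FROM spark_catalog.default.t").show()

    spark.stop()
  }
}
```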
[spark] branch master updated (0a22f19 -> 66b4fd0)
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git.

    from 0a22f19  [SPARK-31050][TEST] Disable flaky `Roundtrip` test in KafkaDelegationTokenSuite
     add 66b4fd0  [SPARK-31024][SQL] Allow specifying session catalog name `spark_catalog` in qualified column names for v1 tables

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/catalyst/analysis/Analyzer.scala     | 2 +-
 .../sql/catalyst/catalog/SessionCatalog.scala      | 6 +-
 .../spark/sql/catalyst/expressions/package.scala   | 102 -
 .../sql/catalyst/catalog/SessionCatalogSuite.scala | 6 +-
 .../results/columnresolution-negative.sql.out      | 26 +++---
 .../results/postgreSQL/create_view.sql.out         | 2 +-
 .../sql-tests/results/postgreSQL/join.sql.out      | 2 +-
 .../results/postgreSQL/select_having.sql.out       | 2 +-
 .../results/postgreSQL/window_part3.sql.out        | 4 +-
 .../results/udf/postgreSQL/udf-join.sql.out        | 2 +-
 .../udf/postgreSQL/udf-select_having.sql.out       | 2 +-
 .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 4 +-
 .../spark/sql/connector/DataSourceV2SQLSuite.scala | 8 +-
 .../spark/sql/hive/HiveMetastoreCatalogSuite.scala | 2 +-
 14 files changed, 111 insertions(+), 59 deletions(-)

- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31050][TEST] Disable flaky `Roundtrip` test in KafkaDelegationTokenSuite
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 9cea92b [SPARK-31050][TEST] Disable flaky `Roundtrip` test in KafkaDelegationTokenSuite 9cea92b is described below commit 9cea92b6f2c2fc8e0effcec710e6ff6e8a7c965f Author: yi.wu AuthorDate: Thu Mar 5 00:21:32 2020 -0800 [SPARK-31050][TEST] Disable flaky `Roundtrip` test in KafkaDelegationTokenSuite ### What changes were proposed in this pull request? Disable test `KafkaDelegationTokenSuite`. ### Why are the changes needed? `KafkaDelegationTokenSuite` is too flaky. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Pass Jenkins. Closes #27789 from Ngone51/retry_kafka. Authored-by: yi.wu Signed-off-by: Dongjoon Hyun (cherry picked from commit 0a22f1966466629cb745d000a0608d521fece093) Signed-off-by: Dongjoon Hyun --- .../scala/org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala index 3064838..79239e5 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala @@ -62,7 +62,7 @@ class KafkaDelegationTokenSuite extends StreamTest with SharedSparkSession with } } - test("Roundtrip") { + ignore("Roundtrip") { val hadoopConf = new Configuration() val manager = new HadoopDelegationTokenManager(spark.sparkContext.conf, hadoopConf, null) val credentials = new Credentials() - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
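For readers unfamiliar with ScalaTest, `ignore` in the diff above is a drop-in replacement for `test` that keeps the body compiling but skips execution and reports the case as ignored. A minimal sketch outside Spark's own test helpers (the suite name and flaky assertion are invented; the import path is for ScalaTest 3.1+, older versions use `org.scalatest.FunSuite`):

```
// Sketch only: ScalaTest's ignore keeps a flaky test compiled but not executed.
import org.scalatest.funsuite.AnyFunSuite

class FlakyExampleSuite extends AnyFunSuite {
  test("stable behaviour") {
    assert(1 + 1 == 2)
  }

  // Reported as ignored in the test run instead of passing or failing.
  ignore("Roundtrip") {
    assert(scala.util.Random.nextInt(2) == 0) // stand-in for a flaky assertion
  }
}
```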
[spark] branch master updated: [SPARK-31050][TEST] Disable flaky `Roundtrip` test in KafkaDelegationTokenSuite
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 0a22f19 [SPARK-31050][TEST] Disable flaky `Roundtrip` test in KafkaDelegationTokenSuite 0a22f19 is described below commit 0a22f1966466629cb745d000a0608d521fece093 Author: yi.wu AuthorDate: Thu Mar 5 00:21:32 2020 -0800 [SPARK-31050][TEST] Disable flaky `Roundtrip` test in KafkaDelegationTokenSuite ### What changes were proposed in this pull request? Disable test `KafkaDelegationTokenSuite`. ### Why are the changes needed? `KafkaDelegationTokenSuite` is too flaky. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Pass Jenkins. Closes #27789 from Ngone51/retry_kafka. Authored-by: yi.wu Signed-off-by: Dongjoon Hyun --- .../scala/org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala index 3064838..79239e5 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala @@ -62,7 +62,7 @@ class KafkaDelegationTokenSuite extends StreamTest with SharedSparkSession with } } - test("Roundtrip") { + ignore("Roundtrip") { val hadoopConf = new Configuration() val manager = new HadoopDelegationTokenManager(spark.sparkContext.conf, hadoopConf, null) val credentials = new Credentials() - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-30668][SQL][FOLLOWUP] Raise exception instead of silent change for new DateFormatter
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 253fbd5 [SPARK-30668][SQL][FOLLOWUP] Raise exception instead of silent change for new DateFormatter 253fbd5 is described below commit 253fbd54e7fe3dda0672ebc7143b943af8387a2c Author: Yuanjian Li AuthorDate: Thu Mar 5 15:29:39 2020 +0800 [SPARK-30668][SQL][FOLLOWUP] Raise exception instead of silent change for new DateFormatter This is a follow-up work for #27441. For the cases of new TimestampFormatter return null while legacy formatter can return a value, we need to throw an exception instead of silent change. The legacy config will be referenced in the error message. Avoid silent result change for new behavior in 3.0. Yes, an exception is thrown when we detect legacy formatter can parse the string and the new formatter return null. Extend existing UT. Closes #27537 from xuanyuanking/SPARK-30668-follow. Authored-by: Yuanjian Li Signed-off-by: Wenchen Fan (cherry picked from commit 7db0af578585ecaeee9fd23f8189292289b52a97) Signed-off-by: Wenchen Fan --- .../scala/org/apache/spark/SparkException.scala| 7 +++ .../spark/sql/catalyst/csv/UnivocityParser.scala | 2 + .../catalyst/expressions/datetimeExpressions.scala | 3 + .../spark/sql/catalyst/json/JacksonParser.scala| 2 + .../spark/sql/catalyst/util/DateFormatter.scala| 60 +++--- .../catalyst/util/DateTimeFormatterHelper.scala| 26 +++- .../sql/catalyst/util/TimestampFormatter.scala | 73 +- .../org/apache/spark/sql/internal/SQLConf.scala| 16 - .../expressions/DateExpressionsSuite.scala | 39 +--- .../sql/catalyst/json/JsonInferSchemaSuite.scala | 16 ++--- .../org/apache/spark/sql/CsvFunctionsSuite.scala | 20 -- .../org/apache/spark/sql/DateFunctionsSuite.scala | 58 ++--- .../adaptive/AdaptiveQueryExecSuite.scala | 2 +- .../sql/execution/datasources/csv/CSVSuite.scala | 22 ++- .../sql/execution/datasources/json/JsonSuite.scala | 22 ++- 15 files changed, 271 insertions(+), 97 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkException.scala b/core/src/main/scala/org/apache/spark/SparkException.scala index 4ad9a0c..81c087e 100644 --- a/core/src/main/scala/org/apache/spark/SparkException.scala +++ b/core/src/main/scala/org/apache/spark/SparkException.scala @@ -43,3 +43,10 @@ private[spark] case class SparkUserAppException(exitCode: Int) */ private[spark] case class ExecutorDeadException(message: String) extends SparkException(message) + +/** + * Exception thrown when Spark returns different result after upgrading to a new version. 
+ */ +private[spark] class SparkUpgradeException(version: String, message: String, cause: Throwable) + extends SparkException("You may get a different result due to the upgrading of Spark" + +s" $version: $message", cause) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala index f829e6b..dd8537b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala @@ -23,6 +23,7 @@ import scala.util.control.NonFatal import com.univocity.parsers.csv.CsvParser +import org.apache.spark.SparkUpgradeException import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{ExprUtils, GenericInternalRow} @@ -285,6 +286,7 @@ class UnivocityParser( } } } catch { +case e: SparkUpgradeException => throw e case NonFatal(e) => badRecordException = badRecordException.orElse(Some(e)) row.setNullAt(i) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 05074d9..d92d700 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -26,6 +26,7 @@ import scala.util.control.NonFatal import org.apache.commons.text.StringEscapeUtils +import org.apache.spark.SparkUpgradeException import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen._ @@ -787,6 +788,7 @@ abstract class ToTimestamp formatter.parse( t.asInsta
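A hedged sketch of the user-visible effect described above: when the new java.time-based formatter cannot parse a value that the legacy formatter could, Spark 3.0 raises an upgrade exception pointing at a legacy config instead of silently returning a different result. The `SparkUpgradeException` class is from the diff above; the config name `spark.sql.legacy.timeParserPolicy`, the sample input, and the pattern are my assumptions about a case the legacy parser accepts.

```
// Sketch only: restoring legacy datetime parsing when the 3.0 parser rejects an input.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.to_timestamp

object LegacyTimeParserSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("legacy-time-parser").getOrCreate()
    import spark.implicits._

    // A single-digit month/day that the lenient legacy SimpleDateFormat accepted but the
    // strict java.time pattern "yyyy-MM-dd HH:mm:ss" does not.
    val df = Seq("2020-3-5 15:29:39").toDF("raw")

    // With the default policy this is expected to fail with the upgrade exception rather
    // than silently returning a different result:
    //   df.select(to_timestamp($"raw", "yyyy-MM-dd HH:mm:ss")).show()

    // Assumed escape hatch: fall back to the pre-3.0 parser behavior.
    spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
    df.select(to_timestamp($"raw", "yyyy-MM-dd HH:mm:ss")).show(false)

    spark.stop()
  }
}
```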