svn commit: r29775 - in /dev/spark/2.4.1-SNAPSHOT-2018_09_28_22_02-ec2c17a-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell Date: Sat Sep 29 05:19:01 2018 New Revision: 29775 Log: Apache Spark 2.4.1-SNAPSHOT-2018_09_28_22_02-ec2c17a docs [This commit notification would consist of 1472 parts, which exceeds the limit of 50 parts, so it was shortened to this summary.] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r29774 - in /dev/spark/2.3.3-SNAPSHOT-2018_09_28_22_02-eb78380-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell Date: Sat Sep 29 05:17:19 2018 New Revision: 29774 Log: Apache Spark 2.3.3-SNAPSHOT-2018_09_28_22_02-eb78380 docs [This commit notification would consist of 1443 parts, which exceeds the limit of 50 parts, so it was shortened to this summary.] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25570][SQL][TEST] Replace 2.3.1 with 2.3.2 in HiveExternalCatalogVersionsSuite
Repository: spark Updated Branches: refs/heads/branch-2.3 f13565b6e -> eb78380c0 [SPARK-25570][SQL][TEST] Replace 2.3.1 with 2.3.2 in HiveExternalCatalogVersionsSuite ## What changes were proposed in this pull request? This PR aims to prevent test slowdowns at `HiveExternalCatalogVersionsSuite` by using the latest Apache Spark 2.3.2 link because the Apache mirrors will remove the old Spark 2.3.1 binaries eventually. `HiveExternalCatalogVersionsSuite` will not fail because [SPARK-24813](https://issues.apache.org/jira/browse/SPARK-24813) implements a fallback logic. However, it will cause many trials and fallbacks in all builds over `branch-2.3/branch-2.4/master`. We had better fix this issue. ## How was this patch tested? Pass the Jenkins with the updated version. Closes #22587 from dongjoon-hyun/SPARK-25570. Authored-by: Dongjoon Hyun Signed-off-by: hyukjinkwon (cherry picked from commit 1e437835e96c4417117f44c29eba5ebc0112926f) Signed-off-by: hyukjinkwon Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eb78380c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eb78380c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eb78380c Branch: refs/heads/branch-2.3 Commit: eb78380c0e1e620e996435a4c08acb652c868795 Parents: f13565b Author: Dongjoon Hyun Authored: Sat Sep 29 11:43:58 2018 +0800 Committer: hyukjinkwon Committed: Sat Sep 29 11:44:27 2018 +0800 -- .../apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/eb78380c/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index 5103aa8..af15da6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -203,7 +203,7 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { object PROCESS_TABLES extends QueryTest with SQLTestUtils { // Tests the latest version of every release line. - val testingVersions = Seq("2.1.3", "2.2.2", "2.3.1") + val testingVersions = Seq("2.1.3", "2.2.2", "2.3.2") protected var spark: SparkSession = _ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25570][SQL][TEST] Replace 2.3.1 with 2.3.2 in HiveExternalCatalogVersionsSuite
Repository: spark Updated Branches: refs/heads/branch-2.4 7614313c9 -> ec2c17abf [SPARK-25570][SQL][TEST] Replace 2.3.1 with 2.3.2 in HiveExternalCatalogVersionsSuite ## What changes were proposed in this pull request? This PR aims to prevent test slowdowns at `HiveExternalCatalogVersionsSuite` by using the latest Apache Spark 2.3.2 link because the Apache mirrors will remove the old Spark 2.3.1 binaries eventually. `HiveExternalCatalogVersionsSuite` will not fail because [SPARK-24813](https://issues.apache.org/jira/browse/SPARK-24813) implements a fallback logic. However, it will cause many trials and fallbacks in all builds over `branch-2.3/branch-2.4/master`. We had better fix this issue. ## How was this patch tested? Pass the Jenkins with the updated version. Closes #22587 from dongjoon-hyun/SPARK-25570. Authored-by: Dongjoon Hyun Signed-off-by: hyukjinkwon (cherry picked from commit 1e437835e96c4417117f44c29eba5ebc0112926f) Signed-off-by: hyukjinkwon Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ec2c17ab Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ec2c17ab Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ec2c17ab Branch: refs/heads/branch-2.4 Commit: ec2c17abf43d304fab26dde3ae624f553cdbd32e Parents: 7614313 Author: Dongjoon Hyun Authored: Sat Sep 29 11:43:58 2018 +0800 Committer: hyukjinkwon Committed: Sat Sep 29 11:44:12 2018 +0800 -- .../apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ec2c17ab/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index 25df333..46b66c1 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -203,7 +203,7 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { object PROCESS_TABLES extends QueryTest with SQLTestUtils { // Tests the latest version of every release line. - val testingVersions = Seq("2.1.3", "2.2.2", "2.3.1") + val testingVersions = Seq("2.1.3", "2.2.2", "2.3.2") protected var spark: SparkSession = _ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25570][SQL][TEST] Replace 2.3.1 with 2.3.2 in HiveExternalCatalogVersionsSuite
Repository: spark Updated Branches: refs/heads/master e99ba8d7c -> 1e437835e [SPARK-25570][SQL][TEST] Replace 2.3.1 with 2.3.2 in HiveExternalCatalogVersionsSuite ## What changes were proposed in this pull request? This PR aims to prevent test slowdowns at `HiveExternalCatalogVersionsSuite` by using the latest Apache Spark 2.3.2 link because the Apache mirrors will remove the old Spark 2.3.1 binaries eventually. `HiveExternalCatalogVersionsSuite` will not fail because [SPARK-24813](https://issues.apache.org/jira/browse/SPARK-24813) implements a fallback logic. However, it will cause many trials and fallbacks in all builds over `branch-2.3/branch-2.4/master`. We had better fix this issue. ## How was this patch tested? Pass the Jenkins with the updated version. Closes #22587 from dongjoon-hyun/SPARK-25570. Authored-by: Dongjoon Hyun Signed-off-by: hyukjinkwon Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1e437835 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1e437835 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1e437835 Branch: refs/heads/master Commit: 1e437835e96c4417117f44c29eba5ebc0112926f Parents: e99ba8d Author: Dongjoon Hyun Authored: Sat Sep 29 11:43:58 2018 +0800 Committer: hyukjinkwon Committed: Sat Sep 29 11:43:58 2018 +0800 -- .../apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1e437835/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index a7d6972..fd4985d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -206,7 +206,7 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { object PROCESS_TABLES extends QueryTest with SQLTestUtils { // Tests the latest version of every release line. - val testingVersions = Seq("2.1.3", "2.2.2", "2.3.1") + val testingVersions = Seq("2.1.3", "2.2.2", "2.3.2") protected var spark: SparkSession = _ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
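For context on the fallback behaviour mentioned above (SPARK-24813): the suite downloads each tested release, preferring a fast mirror and falling back to the Apache archive when the mirror has already dropped the binary. The sketch below illustrates that idea only; the URLs and helper names are hypothetical, not the suite's actual code.

```scala
import java.net.URL
import scala.util.Try

// Illustrative only: resolve a Spark release tarball, preferring a mirror and
// falling back to the long-term archive, in the spirit of SPARK-24813.
object MirrorFallbackSketch {
  private def candidateUrls(version: String): Seq[String] = Seq(
    // Hypothetical mirror location; mirrors drop old point releases over time.
    s"https://downloads.apache.org/spark/spark-$version/spark-$version-bin-hadoop2.7.tgz",
    // The archive keeps every release but is slower, so it is tried last.
    s"https://archive.apache.org/dist/spark/spark-$version/spark-$version-bin-hadoop2.7.tgz"
  )

  // Return the first URL that can actually be opened, or None if the release is gone.
  def resolve(version: String): Option[String] =
    candidateUrls(version).find { url =>
      Try { val in = new URL(url).openStream(); in.close(); true }.getOrElse(false)
    }

  def main(args: Array[String]): Unit = {
    // Testing against 2.3.2 (still on the mirrors) avoids a failed first attempt
    // and fallback on every build, which is the slowdown this commit removes.
    println(resolve("2.3.2"))
  }
}
```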
spark git commit: [SPARK-25262][DOC][FOLLOWUP] Fix missing markup tag
Repository: spark Updated Branches: refs/heads/master 5d726b865 -> e99ba8d7c [SPARK-25262][DOC][FOLLOWUP] Fix missing markup tag ## What changes were proposed in this pull request? This adds a missing end markup tag. This should go `master` branch only. ## How was this patch tested? This is a doc-only change. Manual via `SKIP_API=1 jekyll build`. Closes #22584 from dongjoon-hyun/SPARK-25262. Authored-by: Dongjoon Hyun Signed-off-by: hyukjinkwon Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e99ba8d7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e99ba8d7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e99ba8d7 Branch: refs/heads/master Commit: e99ba8d7c8ec4b4cdd63fd1621f54be993bb0404 Parents: 5d726b8 Author: Dongjoon Hyun Authored: Sat Sep 29 11:23:37 2018 +0800 Committer: hyukjinkwon Committed: Sat Sep 29 11:23:37 2018 +0800 -- docs/running-on-kubernetes.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e99ba8d7/docs/running-on-kubernetes.md -- diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md index 840e306..c7aea27 100644 --- a/docs/running-on-kubernetes.md +++ b/docs/running-on-kubernetes.md @@ -800,7 +800,7 @@ specific to Spark on Kubernetes. spark.kubernetes.local.dirs.tmpfs - false + false Configure the emptyDir volumes used to back SPARK_LOCAL_DIRS within the Spark driver and executor pods to use tmpfs backing i.e. RAM. See Local Storage earlier on this page for more discussion of this. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r29772 - in /dev/spark/2.5.0-SNAPSHOT-2018_09_28_20_02-5d726b8-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell Date: Sat Sep 29 03:17:05 2018 New Revision: 29772 Log: Apache Spark 2.5.0-SNAPSHOT-2018_09_28_20_02-5d726b8 docs [This commit notification would consist of 1485 parts, which exceeds the limit of 50 parts, so it was shortened to this summary.] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r29771 - in /dev/spark/2.4.1-SNAPSHOT-2018_09_28_18_02-7614313-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell Date: Sat Sep 29 01:16:37 2018 New Revision: 29771 Log: Apache Spark 2.4.1-SNAPSHOT-2018_09_28_18_02-7614313 docs [This commit notification would consist of 1472 parts, which exceeds the limit of 50 parts, so it was shortened to this summary.] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25559][SQL] Remove the unsupported predicates in Parquet when possible
Repository: spark Updated Branches: refs/heads/master 9362c5cc2 -> 5d726b865 [SPARK-25559][SQL] Remove the unsupported predicates in Parquet when possible ## What changes were proposed in this pull request? Currently, in `ParquetFilters`, if one of the children predicates is not supported by Parquet, the entire predicates will be thrown away. In fact, if the unsupported predicate is in the top level `And` condition or in the child before hitting `Not` or `Or` condition, it can be safely removed. ## How was this patch tested? Tests are added. Closes #22574 from dbtsai/removeUnsupportedPredicatesInParquet. Lead-authored-by: DB Tsai Co-authored-by: Dongjoon Hyun Co-authored-by: DB Tsai Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5d726b86 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5d726b86 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5d726b86 Branch: refs/heads/master Commit: 5d726b865948f993911fd5b9730b25cfa94e16c7 Parents: 9362c5c Author: DB Tsai Authored: Fri Sep 28 17:46:11 2018 -0700 Committer: Dongjoon Hyun Committed: Fri Sep 28 17:46:11 2018 -0700 -- .../datasources/parquet/ParquetFilters.scala| 38 +++-- .../parquet/ParquetFilterSuite.scala| 147 ++- 2 files changed, 172 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5d726b86/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index 0c286de..44a0d20 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -394,7 +394,13 @@ private[parquet] class ParquetFilters( */ def createFilter(schema: MessageType, predicate: sources.Filter): Option[FilterPredicate] = { val nameToParquetField = getFieldMap(schema) +createFilterHelper(nameToParquetField, predicate, canRemoveOneSideInAnd = true) + } + private def createFilterHelper( + nameToParquetField: Map[String, ParquetField], + predicate: sources.Filter, + canRemoveOneSideInAnd: Boolean): Option[FilterPredicate] = { // Decimal type must make sure that filter value's scale matched the file. // If doesn't matched, which would cause data corruption. def isDecimalMatched(value: Any, decimalMeta: DecimalMetadata): Boolean = value match { @@ -488,26 +494,36 @@ private[parquet] class ParquetFilters( .map(_(nameToParquetField(name).fieldName, value)) case sources.And(lhs, rhs) => -// At here, it is not safe to just convert one side if we do not understand the -// other side. Here is an example used to explain the reason. +// At here, it is not safe to just convert one side and remove the other side +// if we do not understand what the parent filters are. +// +// Here is an example used to explain the reason. // Let's say we have NOT(a = 2 AND b in ('1')) and we do not understand how to // convert b in ('1'). If we only convert a = 2, we will end up with a filter // NOT(a = 2), which will generate wrong results. -// Pushing one side of AND down is only safe to do at the top level. -// You can see ParquetRelation's initializeLocalJobFunc method as an example. 
-for { - lhsFilter <- createFilter(schema, lhs) - rhsFilter <- createFilter(schema, rhs) -} yield FilterApi.and(lhsFilter, rhsFilter) +// +// Pushing one side of AND down is only safe to do at the top level or in the child +// AND before hitting NOT or OR conditions, and in this case, the unsupported predicate +// can be safely removed. +val lhsFilterOption = createFilterHelper(nameToParquetField, lhs, canRemoveOneSideInAnd) +val rhsFilterOption = createFilterHelper(nameToParquetField, rhs, canRemoveOneSideInAnd) + +(lhsFilterOption, rhsFilterOption) match { + case (Some(lhsFilter), Some(rhsFilter)) => Some(FilterApi.and(lhsFilter, rhsFilter)) + case (Some(lhsFilter), None) if canRemoveOneSideInAnd => Some(lhsFilter) + case (None, Some(rhsFilter)) if canRemoveOneSideInAnd => Some(rhsFilter) + case _ => None +} case sources.Or(lhs, rhs) => for { - lhsFilter <- createFilter(schema, lhs) - rhsFilter <- c
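To make the AND/OR/NOT reasoning above concrete, here is a self-contained sketch on a toy filter algebra. It is not the real ParquetFilters code; the type names and the choice of unsupported leaf are illustrative, but the boolean flag mirrors the canRemoveOneSideInAnd parameter introduced by the patch.

```scala
// Toy filter algebra used only to illustrate when one side of an AND may be dropped.
sealed trait Filter
case class Eq(col: String, v: Any) extends Filter
case class In(col: String, vs: Seq[Any]) extends Filter // pretend this leaf is unsupported
case class And(l: Filter, r: Filter) extends Filter
case class Or(l: Filter, r: Filter) extends Filter
case class Not(c: Filter) extends Filter

object PushdownSketch {
  // Returns the pushable part of the filter, if any. The flag is true only at the top
  // level and inside AND chains not wrapped by NOT/OR, mirroring the patch's rule.
  def convert(f: Filter, canDropOneSideInAnd: Boolean): Option[Filter] = f match {
    case e: Eq => Some(e)
    case _: In => None // unsupported leaf
    case And(l, r) =>
      (convert(l, canDropOneSideInAnd), convert(r, canDropOneSideInAnd)) match {
        case (Some(lf), Some(rf)) => Some(And(lf, rf))
        case (Some(lf), None) if canDropOneSideInAnd => Some(lf)
        case (None, Some(rf)) if canDropOneSideInAnd => Some(rf)
        case _ => None
      }
    case Or(l, r) =>
      // The patch stays conservative below OR: both sides must convert, and
      // children lose the right to drop one side of an AND.
      for {
        lf <- convert(l, canDropOneSideInAnd = false)
        rf <- convert(r, canDropOneSideInAnd = false)
      } yield Or(lf, rf)
    case Not(c) =>
      // NOT(a = 2 AND b IN ('1')) must not become NOT(a = 2), hence the false flag.
      convert(c, canDropOneSideInAnd = false).map(Not(_))
  }

  def main(args: Array[String]): Unit = {
    val top = And(Eq("a", 2), In("b", Seq("1")))
    println(convert(top, canDropOneSideInAnd = true))      // Some(Eq(a,2)): safe to drop
    println(convert(Not(top), canDropOneSideInAnd = true)) // None: dropping would be wrong
  }
}
```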
spark git commit: [SPARK-25449][CORE] Heartbeat shouldn't include accumulators for zero metrics
Repository: spark Updated Branches: refs/heads/master a28146568 -> 9362c5cc2 [SPARK-25449][CORE] Heartbeat shouldn't include accumulators for zero metrics ## What changes were proposed in this pull request? Heartbeat shouldn't include accumulators for zero metrics. Heartbeats sent from executors to the driver every 10 seconds contain metrics and are generally on the order of a few KBs. However, for large jobs with lots of tasks, heartbeats can be on the order of tens of MBs, causing tasks to die with heartbeat failures. We can mitigate this by not sending zero metrics to the driver. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Please review http://spark.apache.org/contributing.html before opening a pull request. Closes #22473 from mukulmurthy/25449-heartbeat. Authored-by: Mukul Murthy Signed-off-by: Shixiong Zhu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9362c5cc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9362c5cc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9362c5cc Branch: refs/heads/master Commit: 9362c5cc273fdd09f9b3b512e2f6b64bcefc25ab Parents: a281465 Author: Mukul Murthy Authored: Fri Sep 28 16:34:17 2018 -0700 Committer: Shixiong Zhu Committed: Fri Sep 28 16:34:17 2018 -0700 -- .../main/scala/org/apache/spark/SparkConf.scala | 11 +- .../scala/org/apache/spark/SparkContext.scala | 2 +- .../org/apache/spark/executor/Executor.scala| 40 +-- .../apache/spark/internal/config/package.scala | 14 +++ .../apache/spark/executor/ExecutorSuite.scala | 111 +-- .../MesosCoarseGrainedSchedulerBackend.scala| 3 +- 6 files changed, 154 insertions(+), 27 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9362c5cc/core/src/main/scala/org/apache/spark/SparkConf.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index e0f98f1..81aa31d 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -609,13 +609,14 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria require(!encryptionEnabled || get(NETWORK_AUTH_ENABLED), s"${NETWORK_AUTH_ENABLED.key} must be enabled when enabling encryption.") -val executorTimeoutThreshold = getTimeAsSeconds("spark.network.timeout", "120s") -val executorHeartbeatInterval = getTimeAsSeconds("spark.executor.heartbeatInterval", "10s") +val executorTimeoutThresholdMs = + getTimeAsSeconds("spark.network.timeout", "120s") * 1000 +val executorHeartbeatIntervalMs = get(EXECUTOR_HEARTBEAT_INTERVAL) // If spark.executor.heartbeatInterval bigger than spark.network.timeout, // it will almost always cause ExecutorLostFailure. See SPARK-22754. 
-require(executorTimeoutThreshold > executorHeartbeatInterval, "The value of " + - s"spark.network.timeout=${executorTimeoutThreshold}s must be no less than the value of " + - s"spark.executor.heartbeatInterval=${executorHeartbeatInterval}s.") +require(executorTimeoutThresholdMs > executorHeartbeatIntervalMs, "The value of " + + s"spark.network.timeout=${executorTimeoutThresholdMs}ms must be no less than the value of " + + s"spark.executor.heartbeatInterval=${executorHeartbeatIntervalMs}ms.") } /** http://git-wip-us.apache.org/repos/asf/spark/blob/9362c5cc/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index d943087..0a66dae 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -499,7 +499,7 @@ class SparkContext(config: SparkConf) extends Logging { // create and start the heartbeater for collecting memory metrics _heartbeater = new Heartbeater(env.memoryManager, reportHeartBeat, "driver-heartbeater", - conf.getTimeAsMs("spark.executor.heartbeatInterval", "10s")) + conf.get(EXECUTOR_HEARTBEAT_INTERVAL)) _heartbeater.start() // start TaskScheduler after taskScheduler sets DAGScheduler reference in DAGScheduler's http://git-wip-us.apache.org/repos/asf/spark/blob/9362c5cc/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org
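The essence of the change, dropping zero-valued accumulator updates from the heartbeat payload, can be shown with a small standalone sketch. The case class and field names are simplified stand-ins, not Spark's executor internals, and the configuration switch is represented as a plain boolean.

```scala
// Simplified stand-in for an accumulator update reported in an executor heartbeat.
case class AccumulatorUpdate(id: Long, name: String, value: Long) {
  def isZero: Boolean = value == 0L
}

object HeartbeatSketch {
  // Before the change every update was sent; afterwards zero-valued metrics can be
  // skipped, which keeps heartbeats small for jobs with very many tasks.
  def buildPayload(
      updates: Seq[AccumulatorUpdate],
      excludeZeroMetrics: Boolean): Seq[AccumulatorUpdate] =
    if (excludeZeroMetrics) updates.filterNot(_.isZero) else updates

  def main(args: Array[String]): Unit = {
    val updates = Seq(
      AccumulatorUpdate(1, "shuffle.bytesWritten", 0L),
      AccumulatorUpdate(2, "input.recordsRead", 12345L),
      AccumulatorUpdate(3, "memory.bytesSpilled", 0L)
    )
    // Only the non-zero metric survives, so the serialized heartbeat stays small.
    println(buildPayload(updates, excludeZeroMetrics = true))
  }
}
```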
svn commit: r29770 - in /dev/spark/2.5.0-SNAPSHOT-2018_09_28_16_02-a281465-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell Date: Fri Sep 28 23:17:21 2018 New Revision: 29770 Log: Apache Spark 2.5.0-SNAPSHOT-2018_09_28_16_02-a281465 docs [This commit notification would consist of 1485 parts, which exceeds the limit of 50 parts, so it was shortened to this summary.] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25429][SQL] Use Set instead of Array to improve lookup performance
Repository: spark Updated Branches: refs/heads/master 7deef7a49 -> a28146568 [SPARK-25429][SQL] Use Set instead of Array to improve lookup performance ## What changes were proposed in this pull request? Use `Set` instead of `Array` to improve `accumulatorIds.contains(acc.id)` performance. This PR close https://github.com/apache/spark/pull/22420 ## How was this patch tested? manual tests. Benchmark code: ```scala def benchmark(func: () => Unit): Long = { val start = System.currentTimeMillis() func() val end = System.currentTimeMillis() end - start } val range = Range(1, 100) val set = range.toSet val array = range.toArray for (i <- 0 until 5) { val setExecutionTime = benchmark(() => for (i <- 0 until 500) { set.contains(scala.util.Random.nextInt()) }) val arrayExecutionTime = benchmark(() => for (i <- 0 until 500) { array.contains(scala.util.Random.nextInt()) }) println(s"set execution time: $setExecutionTime, array execution time: $arrayExecutionTime") } ``` Benchmark result: ``` set execution time: 4, array execution time: 2760 set execution time: 1, array execution time: 1911 set execution time: 3, array execution time: 2043 set execution time: 12, array execution time: 2214 set execution time: 6, array execution time: 1770 ``` Closes #22579 from wangyum/SPARK-25429. Authored-by: Yuming Wang Signed-off-by: gatorsmile Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a2814656 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a2814656 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a2814656 Branch: refs/heads/master Commit: a281465686e8099bb2c0fa4f2ef4822b6e634269 Parents: 7deef7a Author: Yuming Wang Authored: Fri Sep 28 15:08:15 2018 -0700 Committer: gatorsmile Committed: Fri Sep 28 15:08:15 2018 -0700 -- .../apache/spark/sql/execution/ui/SQLAppStatusListener.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a2814656/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala index d254af4..1199eec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala @@ -81,9 +81,9 @@ class SQLAppStatusListener( // Record the accumulator IDs for the stages of this job, so that the code that keeps // track of the metrics knows which accumulators to look at. -val accumIds = exec.metrics.map(_.accumulatorId).sorted.toList +val accumIds = exec.metrics.map(_.accumulatorId).toSet event.stageIds.foreach { id => - stageMetrics.put(id, new LiveStageMetrics(id, 0, accumIds.toArray, new ConcurrentHashMap())) + stageMetrics.put(id, new LiveStageMetrics(id, 0, accumIds, new ConcurrentHashMap())) } exec.jobs = exec.jobs + (jobId -> JobExecutionStatus.RUNNING) @@ -382,7 +382,7 @@ private class LiveExecutionData(val executionId: Long) extends LiveEntity { private class LiveStageMetrics( val stageId: Int, var attemptId: Int, -val accumulatorIds: Array[Long], +val accumulatorIds: Set[Long], val taskMetrics: ConcurrentHashMap[Long, LiveTaskMetrics]) private class LiveTaskMetrics( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25458][SQL] Support FOR ALL COLUMNS in ANALYZE TABLE
Repository: spark Updated Branches: refs/heads/master b7d80349b -> 7deef7a49 [SPARK-25458][SQL] Support FOR ALL COLUMNS in ANALYZE TABLE ## What changes were proposed in this pull request? **Description from the JIRA :** Currently, to collect the statistics of all the columns, users need to specify the names of all the columns when calling the command "ANALYZE TABLE ... FOR COLUMNS...". This is not user friendly. Instead, we can introduce the following SQL command to achieve it without specifying the column names. ``` ANALYZE TABLE [db_name.]tablename COMPUTE STATISTICS FOR ALL COLUMNS; ``` ## How was this patch tested? Added new tests in SparkSqlParserSuite and StatisticsSuite Closes #22566 from dilipbiswal/SPARK-25458. Authored-by: Dilip Biswal Signed-off-by: gatorsmile Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7deef7a4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7deef7a4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7deef7a4 Branch: refs/heads/master Commit: 7deef7a49b95c5de5af10419ece8c6a36d96ac61 Parents: b7d8034 Author: Dilip Biswal Authored: Fri Sep 28 15:03:06 2018 -0700 Committer: gatorsmile Committed: Fri Sep 28 15:03:06 2018 -0700 -- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 2 +- .../spark/sql/execution/SparkSqlParser.scala| 26 ++--- .../command/AnalyzeColumnCommand.scala | 61 .../sql/execution/SparkSqlParserSuite.scala | 14 - .../apache/spark/sql/hive/StatisticsSuite.scala | 47 ++- 5 files changed, 115 insertions(+), 35 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7deef7a4/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 94283f5..16665eb 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -99,7 +99,7 @@ statement | CREATE TABLE (IF NOT EXISTS)? target=tableIdentifier LIKE source=tableIdentifier locationSpec? #createTableLike | ANALYZE TABLE tableIdentifier partitionSpec? COMPUTE STATISTICS -(identifier | FOR COLUMNS identifierSeq)? 
#analyze +(identifier | FOR COLUMNS identifierSeq | FOR ALL COLUMNS)?#analyze | ALTER TABLE tableIdentifier ADD COLUMNS '(' columns=colTypeList ')' #addTableColumns | ALTER (TABLE | VIEW) from=tableIdentifier http://git-wip-us.apache.org/repos/asf/spark/blob/7deef7a4/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 89cb637..4ed14d3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -102,15 +102,29 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { * {{{ * ANALYZE TABLE [db_name.]tablename COMPUTE STATISTICS FOR COLUMNS column1, column2; * }}} + * + * Example SQL for analyzing all columns of a table: + * {{{ + * ANALYZE TABLE [db_name.]tablename COMPUTE STATISTICS FOR ALL COLUMNS; + * }}} */ override def visitAnalyze(ctx: AnalyzeContext): LogicalPlan = withOrigin(ctx) { +def checkPartitionSpec(): Unit = { + if (ctx.partitionSpec != null) { +logWarning("Partition specification is ignored when collecting column statistics: " + + ctx.partitionSpec.getText) + } +} if (ctx.identifier != null && ctx.identifier.getText.toLowerCase(Locale.ROOT) != "noscan") { throw new ParseException(s"Expected `NOSCAN` instead of `${ctx.identifier.getText}`", ctx) } val table = visitTableIdentifier(ctx.tableIdentifier) -if (ctx.identifierSeq() == null) { +if (ctx.ALL() != null) { + checkPartitionSpec() + AnalyzeColumnCommand(table, None, allColumns = true) +} else if (ctx.identifierSeq() == null) { if (ctx.partitionSpec != null) { AnalyzePartitionCommand(table, visitPartitionSpec(ctx.partitionSpec), noscan = ctx.identifier != null) @@ -118,13 +132,9 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { AnalyzeTableCommand(table, noscan = ct
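A possible way to exercise the new syntax from a Scala session is sketched below. The table, columns, and session settings are placeholders for illustration; only the ANALYZE statement itself reflects the grammar added by this patch.

```scala
import org.apache.spark.sql.SparkSession

object AnalyzeAllColumnsExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("analyze-all-columns")
      .enableHiveSupport()
      .getOrCreate()

    // Hypothetical table used only for illustration.
    spark.sql("CREATE TABLE IF NOT EXISTS course_sales (course STRING, year INT, earnings DOUBLE)")

    // New form added by this patch: no need to enumerate every column by hand.
    spark.sql("ANALYZE TABLE course_sales COMPUTE STATISTICS FOR ALL COLUMNS")

    // Column-level statistics can then be inspected per column.
    spark.sql("DESC EXTENDED course_sales earnings").show(truncate = false)

    spark.stop()
  }
}
```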
spark git commit: [SPARK-25542][CORE][TEST] Move flaky test in OpenHashMapSuite to OpenHashSetSuite and make it against OpenHashSet
Repository: spark Updated Branches: refs/heads/branch-2.4 81391c274 -> 7614313c9 [SPARK-25542][CORE][TEST] Move flaky test in OpenHashMapSuite to OpenHashSetSuite and make it against OpenHashSet ## What changes were proposed in this pull request? The specified test in OpenHashMapSuite to test large items is somehow flaky to throw OOM. By considering the original work #6763 that added this test, the test can be against OpenHashSetSuite. And by doing this should be to save memory because OpenHashMap allocates two more arrays when growing the map/set. ## How was this patch tested? Existing tests. Closes #22569 from viirya/SPARK-25542. Authored-by: Liang-Chi Hsieh Signed-off-by: Dongjoon Hyun (cherry picked from commit b7d80349b0e367d78cab238e62c2ec353f0f12b3) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7614313c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7614313c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7614313c Branch: refs/heads/branch-2.4 Commit: 7614313c9443712553332962d62dfe5aacc7ed34 Parents: 81391c2 Author: Liang-Chi Hsieh Authored: Fri Sep 28 14:29:56 2018 -0700 Committer: Dongjoon Hyun Committed: Fri Sep 28 14:30:12 2018 -0700 -- .../spark/util/collection/OpenHashMapSuite.scala | 10 -- .../spark/util/collection/OpenHashSetSuite.scala | 13 + 2 files changed, 13 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7614313c/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala index 151235d..68bcc5e 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala @@ -185,16 +185,6 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers { assert(map.contains(null)) } - test("support for more than 12M items") { -val cnt = 1200 // 12M -val map = new OpenHashMap[Int, Int](cnt) -for (i <- 0 until cnt) { - map(i) = 1 -} -val numInvalidValues = map.iterator.count(_._2 == 0) -assertResult(0)(numInvalidValues) - } - test("distinguish between the 0/0.0/0L and null") { val specializedMap1 = new OpenHashMap[String, Long] specializedMap1("a") = null.asInstanceOf[Long] http://git-wip-us.apache.org/repos/asf/spark/blob/7614313c/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala index b887f93..44d2118 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala @@ -255,4 +255,17 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers { val set = new OpenHashSet[Long](0) assert(set.size === 0) } + + test("support for more than 12M items") { +val cnt = 1200 // 12M +val set = new OpenHashSet[Int](cnt) +for (i <- 0 until cnt) { + set.add(i) + assert(set.contains(i)) + + val pos1 = set.getPos(i) + val pos2 = set.addWithoutResize(i) & OpenHashSet.POSITION_MASK + assert(pos1 == pos2) +} + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25542][CORE][TEST] Move flaky test in OpenHashMapSuite to OpenHashSetSuite and make it against OpenHashSet
Repository: spark Updated Branches: refs/heads/master 0b33f0868 -> b7d80349b [SPARK-25542][CORE][TEST] Move flaky test in OpenHashMapSuite to OpenHashSetSuite and make it against OpenHashSet ## What changes were proposed in this pull request? The specified test in OpenHashMapSuite to test large items is somehow flaky to throw OOM. By considering the original work #6763 that added this test, the test can be against OpenHashSetSuite. And by doing this should be to save memory because OpenHashMap allocates two more arrays when growing the map/set. ## How was this patch tested? Existing tests. Closes #22569 from viirya/SPARK-25542. Authored-by: Liang-Chi Hsieh Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b7d80349 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b7d80349 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b7d80349 Branch: refs/heads/master Commit: b7d80349b0e367d78cab238e62c2ec353f0f12b3 Parents: 0b33f08 Author: Liang-Chi Hsieh Authored: Fri Sep 28 14:29:56 2018 -0700 Committer: Dongjoon Hyun Committed: Fri Sep 28 14:29:56 2018 -0700 -- .../spark/util/collection/OpenHashMapSuite.scala | 10 -- .../spark/util/collection/OpenHashSetSuite.scala | 13 + 2 files changed, 13 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b7d80349/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala index 151235d..68bcc5e 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala @@ -185,16 +185,6 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers { assert(map.contains(null)) } - test("support for more than 12M items") { -val cnt = 1200 // 12M -val map = new OpenHashMap[Int, Int](cnt) -for (i <- 0 until cnt) { - map(i) = 1 -} -val numInvalidValues = map.iterator.count(_._2 == 0) -assertResult(0)(numInvalidValues) - } - test("distinguish between the 0/0.0/0L and null") { val specializedMap1 = new OpenHashMap[String, Long] specializedMap1("a") = null.asInstanceOf[Long] http://git-wip-us.apache.org/repos/asf/spark/blob/b7d80349/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala index b887f93..44d2118 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala @@ -255,4 +255,17 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers { val set = new OpenHashSet[Long](0) assert(set.size === 0) } + + test("support for more than 12M items") { +val cnt = 1200 // 12M +val set = new OpenHashSet[Int](cnt) +for (i <- 0 until cnt) { + set.add(i) + assert(set.contains(i)) + + val pos1 = set.getPos(i) + val pos2 = set.addWithoutResize(i) & OpenHashSet.POSITION_MASK + assert(pos1 == pos2) +} + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-23285][DOC][FOLLOWUP] Fix missing markup tag
Repository: spark Updated Branches: refs/heads/branch-2.4 b2a1e2f8d -> 81391c274 [SPARK-23285][DOC][FOLLOWUP] Fix missing markup tag ## What changes were proposed in this pull request? This adds a missing markup tag. This should go to `master/branch-2.4`. ## How was this patch tested? Manual via `SKIP_API=1 jekyll build`. Closes #22585 from dongjoon-hyun/SPARK-23285. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun (cherry picked from commit 0b33f08683a41f6f3a6ec02c327010c0722cc1d1) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/81391c27 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/81391c27 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/81391c27 Branch: refs/heads/branch-2.4 Commit: 81391c274eb371dbf4cfed0afca47806f6fcfd00 Parents: b2a1e2f Author: Dongjoon Hyun Authored: Fri Sep 28 14:10:24 2018 -0700 Committer: Dongjoon Hyun Committed: Fri Sep 28 14:10:47 2018 -0700 -- docs/running-on-kubernetes.md | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/81391c27/docs/running-on-kubernetes.md -- diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md index c83dad6..fc7c9a5 100644 --- a/docs/running-on-kubernetes.md +++ b/docs/running-on-kubernetes.md @@ -678,6 +678,7 @@ specific to Spark on Kubernetes. Example values include 0.1, 500m, 1.5, 5, etc., with the definition of cpu units documented in [CPU units](https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/#cpu-units). This is distinct from spark.executor.cores: it is only used and takes precedence over spark.executor.cores for specifying the executor pod cpu request if set. Task parallelism, e.g., number of tasks an executor can run concurrently is not affected by this. + spark.kubernetes.executor.limit.cores - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-23285][DOC][FOLLOWUP] Fix missing markup tag
Repository: spark Updated Branches: refs/heads/master e120a38c0 -> 0b33f0868 [SPARK-23285][DOC][FOLLOWUP] Fix missing markup tag ## What changes were proposed in this pull request? This adds a missing markup tag. This should go to `master/branch-2.4`. ## How was this patch tested? Manual via `SKIP_API=1 jekyll build`. Closes #22585 from dongjoon-hyun/SPARK-23285. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0b33f086 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0b33f086 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0b33f086 Branch: refs/heads/master Commit: 0b33f08683a41f6f3a6ec02c327010c0722cc1d1 Parents: e120a38 Author: Dongjoon Hyun Authored: Fri Sep 28 14:10:24 2018 -0700 Committer: Dongjoon Hyun Committed: Fri Sep 28 14:10:24 2018 -0700 -- docs/running-on-kubernetes.md | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0b33f086/docs/running-on-kubernetes.md -- diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md index 4ae7aca..840e306 100644 --- a/docs/running-on-kubernetes.md +++ b/docs/running-on-kubernetes.md @@ -691,6 +691,7 @@ specific to Spark on Kubernetes. Example values include 0.1, 500m, 1.5, 5, etc., with the definition of cpu units documented in [CPU units](https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/#cpu-units). This is distinct from spark.executor.cores: it is only used and takes precedence over spark.executor.cores for specifying the executor pod cpu request if set. Task parallelism, e.g., number of tasks an executor can run concurrently is not affected by this. + spark.kubernetes.executor.limit.cores - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r29762 - in /dev/spark/2.5.0-SNAPSHOT-2018_09_28_04_02-e120a38-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell Date: Fri Sep 28 11:17:10 2018 New Revision: 29762 Log: Apache Spark 2.5.0-SNAPSHOT-2018_09_28_04_02-e120a38 docs [This commit notification would consist of 1485 parts, which exceeds the limit of 50 parts, so it was shortened to this summary.] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r29759 - in /dev/spark/2.4.1-SNAPSHOT-2018_09_28_02_03-b2a1e2f-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell Date: Fri Sep 28 09:17:41 2018 New Revision: 29759 Log: Apache Spark 2.4.1-SNAPSHOT-2018_09_28_02_03-b2a1e2f docs [This commit notification would consist of 1472 parts, which exceeds the limit of 50 parts, so it was shortened to this summary.] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25505][SQL] The output order of grouping columns in Pivot is different from the input order
Repository: spark Updated Branches: refs/heads/branch-2.4 a43a082e0 -> b2a1e2f8d [SPARK-25505][SQL] The output order of grouping columns in Pivot is different from the input order ## What changes were proposed in this pull request? The grouping columns from a Pivot query are inferred as "input columns - pivot columns - pivot aggregate columns", where input columns are the output of the child relation of Pivot. The grouping columns will be the leading columns in the pivot output and they should preserve the same order as specified by the input. For example, ``` SELECT * FROM ( SELECT course, earnings, "a" as a, "z" as z, "b" as b, "y" as y, "c" as c, "x" as x, "d" as d, "w" as w FROM courseSales ) PIVOT ( sum(earnings) FOR course IN ('dotNET', 'Java') ) ``` The output columns should be "a, z, b, y, c, x, d, w, ..." but now it is "a, b, c, d, w, x, y, z, ..." The fix is to use the child plan's `output` instead of `outputSet` so that the order can be preserved. ## How was this patch tested? Added UT. Closes #22519 from maryannxue/spark-25505. Authored-by: maryannxue Signed-off-by: gatorsmile (cherry picked from commit e120a38c0cdfb569c9151bef4d53e98175da2b25) Signed-off-by: gatorsmile Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b2a1e2f8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b2a1e2f8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b2a1e2f8 Branch: refs/heads/branch-2.4 Commit: b2a1e2f8d47876658debe95bffd9a848ccaa5819 Parents: a43a082 Author: maryannxue Authored: Fri Sep 28 00:09:06 2018 -0700 Committer: gatorsmile Committed: Fri Sep 28 00:09:21 2018 -0700 -- .../spark/sql/catalyst/analysis/Analyzer.scala | 7 +-- .../src/test/resources/sql-tests/inputs/pivot.sql | 10 ++ .../test/resources/sql-tests/results/pivot.sql.out | 17 - 3 files changed, 31 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b2a1e2f8/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index e3b1712..d303b43 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -554,8 +554,11 @@ class Analyzer( Cast(value, pivotColumn.dataType, Some(conf.sessionLocalTimeZone)).eval(EmptyRow) } // Group-by expressions coming from SQL are implicit and need to be deduced. 
-val groupByExprs = groupByExprsOpt.getOrElse( - (child.outputSet -- aggregates.flatMap(_.references) -- pivotColumn.references).toSeq) +val groupByExprs = groupByExprsOpt.getOrElse { + val pivotColAndAggRefs = +(pivotColumn.references ++ aggregates.flatMap(_.references)).toSet + child.output.filterNot(pivotColAndAggRefs.contains) +} val singleAgg = aggregates.size == 1 def outputName(value: Expression, aggregate: Expression): String = { val stringValue = value match { http://git-wip-us.apache.org/repos/asf/spark/blob/b2a1e2f8/sql/core/src/test/resources/sql-tests/inputs/pivot.sql -- diff --git a/sql/core/src/test/resources/sql-tests/inputs/pivot.sql b/sql/core/src/test/resources/sql-tests/inputs/pivot.sql index 1f607b3..81547ab 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pivot.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/pivot.sql @@ -287,3 +287,13 @@ PIVOT ( sum(earnings) FOR (course, m) IN (('dotNET', map('1', 1)), ('Java', map('2', 2))) ); + +-- grouping columns output in the same order as input +SELECT * FROM ( + SELECT course, earnings, "a" as a, "z" as z, "b" as b, "y" as y, "c" as c, "x" as x, "d" as d, "w" as w + FROM courseSales +) +PIVOT ( + sum(earnings) + FOR course IN ('dotNET', 'Java') +); http://git-wip-us.apache.org/repos/asf/spark/blob/b2a1e2f8/sql/core/src/test/resources/sql-tests/results/pivot.sql.out -- diff --git a/sql/core/src/test/resources/sql-tests/results/pivot.sql.out b/sql/core/src/test/resources/sql-tests/results/pivot.sql.out index 2dd9293..487883a 100644 --- a/sql/core/src/test/resources/sql-tests/results/pivot.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/pivot.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 31 +-- Number of queries: 32 -- !query 0 @@ -
spark git commit: [SPARK-25505][SQL] The output order of grouping columns in Pivot is different from the input order
Repository: spark Updated Branches: refs/heads/master 3b7395fe0 -> e120a38c0 [SPARK-25505][SQL] The output order of grouping columns in Pivot is different from the input order ## What changes were proposed in this pull request? The grouping columns from a Pivot query are inferred as "input columns - pivot columns - pivot aggregate columns", where input columns are the output of the child relation of Pivot. The grouping columns will be the leading columns in the pivot output and they should preserve the same order as specified by the input. For example, ``` SELECT * FROM ( SELECT course, earnings, "a" as a, "z" as z, "b" as b, "y" as y, "c" as c, "x" as x, "d" as d, "w" as w FROM courseSales ) PIVOT ( sum(earnings) FOR course IN ('dotNET', 'Java') ) ``` The output columns should be "a, z, b, y, c, x, d, w, ..." but now it is "a, b, c, d, w, x, y, z, ..." The fix is to use the child plan's `output` instead of `outputSet` so that the order can be preserved. ## How was this patch tested? Added UT. Closes #22519 from maryannxue/spark-25505. Authored-by: maryannxue Signed-off-by: gatorsmile Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e120a38c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e120a38c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e120a38c Branch: refs/heads/master Commit: e120a38c0cdfb569c9151bef4d53e98175da2b25 Parents: 3b7395f Author: maryannxue Authored: Fri Sep 28 00:09:06 2018 -0700 Committer: gatorsmile Committed: Fri Sep 28 00:09:06 2018 -0700 -- .../spark/sql/catalyst/analysis/Analyzer.scala | 7 +-- .../src/test/resources/sql-tests/inputs/pivot.sql | 10 ++ .../test/resources/sql-tests/results/pivot.sql.out | 17 - 3 files changed, 31 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e120a38c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 7034dfd..c0a7308 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -554,8 +554,11 @@ class Analyzer( Cast(value, pivotColumn.dataType, Some(conf.sessionLocalTimeZone)).eval(EmptyRow) } // Group-by expressions coming from SQL are implicit and need to be deduced. 
-val groupByExprs = groupByExprsOpt.getOrElse( - (child.outputSet -- aggregates.flatMap(_.references) -- pivotColumn.references).toSeq) +val groupByExprs = groupByExprsOpt.getOrElse { + val pivotColAndAggRefs = +(pivotColumn.references ++ aggregates.flatMap(_.references)).toSet + child.output.filterNot(pivotColAndAggRefs.contains) +} val singleAgg = aggregates.size == 1 def outputName(value: Expression, aggregate: Expression): String = { val stringValue = value match { http://git-wip-us.apache.org/repos/asf/spark/blob/e120a38c/sql/core/src/test/resources/sql-tests/inputs/pivot.sql -- diff --git a/sql/core/src/test/resources/sql-tests/inputs/pivot.sql b/sql/core/src/test/resources/sql-tests/inputs/pivot.sql index 1f607b3..81547ab 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pivot.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/pivot.sql @@ -287,3 +287,13 @@ PIVOT ( sum(earnings) FOR (course, m) IN (('dotNET', map('1', 1)), ('Java', map('2', 2))) ); + +-- grouping columns output in the same order as input +SELECT * FROM ( + SELECT course, earnings, "a" as a, "z" as z, "b" as b, "y" as y, "c" as c, "x" as x, "d" as d, "w" as w + FROM courseSales +) +PIVOT ( + sum(earnings) + FOR course IN ('dotNET', 'Java') +); http://git-wip-us.apache.org/repos/asf/spark/blob/e120a38c/sql/core/src/test/resources/sql-tests/results/pivot.sql.out -- diff --git a/sql/core/src/test/resources/sql-tests/results/pivot.sql.out b/sql/core/src/test/resources/sql-tests/results/pivot.sql.out index 2dd9293..487883a 100644 --- a/sql/core/src/test/resources/sql-tests/results/pivot.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/pivot.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 31 +-- Number of queries: 32 -- !query 0 @@ -476,3 +476,18 @@ struct<> -- !query 30 output org.apache.spark.sql.AnalysisException Invalid pivot co
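The ordering point in this fix can be reproduced outside Spark with a tiny sketch: subtracting a set of references forgets the input order, while filtering the ordered output sequence preserves it. The column names below come from the example query in the description; everything else is illustrative.

```scala
object PivotGroupingOrderSketch {
  def main(args: Array[String]): Unit = {
    // Output columns of the child relation, in the order the query produced them.
    val childOutput = Seq("course", "earnings", "a", "z", "b", "y", "c", "x", "d", "w")
    // Columns consumed by the pivot column and the aggregate expressions.
    val pivotAndAggRefs = Set("course", "earnings")

    // Old behaviour (conceptually): set subtraction, which loses the input order.
    val oldGroupBy = (childOutput.toSet -- pivotAndAggRefs).toSeq
    // New behaviour: filter the ordered output, preserving "a, z, b, y, c, x, d, w".
    val newGroupBy = childOutput.filterNot(pivotAndAggRefs.contains)

    println(s"set-based (order lost):   $oldGroupBy")
    println(s"filter-based (preserved): $newGroupBy")
  }
}
```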