spark git commit: [SPARK][EXAMPLE] Added missing semicolon in quick-start-guide example
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 1b1c849bf -> 5ecd3c23a

[SPARK][EXAMPLE] Added missing semicolon in quick-start-guide example

## What changes were proposed in this pull request?

Added the missing semicolon in the quick-start guide Java example code, which did not compile before.

## How was this patch tested?

Locally, by building and generating the docs site. You can see that the last line contains ";" in the snapshot below.

![image](https://cloud.githubusercontent.com/assets/10628224/20751760/9a7e0402-b723-11e6-9aa8-3b6ca2d92ebf.png)

Author: manishAtGit

Closes #16081 from manishatGit/fixed-quick-start-guide.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5ecd3c23
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5ecd3c23
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5ecd3c23

Branch: refs/heads/branch-2.0
Commit: 5ecd3c23ab63de68addf2f7405377dbb2a7e08e9
Parents: 1b1c849
Author: manishAtGit
Authored: Wed Nov 30 14:46:50 2016 -0500
Committer: Andrew Or
Committed: Wed Nov 30 14:49:18 2016 -0500

--
 docs/quick-start.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/5ecd3c23/docs/quick-start.md
--
diff --git a/docs/quick-start.md b/docs/quick-start.md
index 04b0f0a..c67b010 100644
--- a/docs/quick-start.md
+++ b/docs/quick-start.md
@@ -330,7 +330,7 @@ public class SimpleApp {
     System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);
-    sc.stop()
+    sc.stop();
   }
 }
 {% endhighlight %}
spark git commit: [SPARK-18640] Add synchronization to TaskScheduler.runningTasksByExecutors
Repository: spark Updated Branches: refs/heads/branch-2.0 8b33aa089 -> 1b1c849bf [SPARK-18640] Add synchronization to TaskScheduler.runningTasksByExecutors ## What changes were proposed in this pull request? The method `TaskSchedulerImpl.runningTasksByExecutors()` accesses the mutable `executorIdToRunningTaskIds` map without proper synchronization. In addition, as markhamstra pointed out in #15986, the signature's use of parentheses is a little odd given that this is a pure getter method. This patch fixes both issues. ## How was this patch tested? Covered by existing tests. Author: Josh Rosen Closes #16073 from JoshRosen/runningTasksByExecutors-thread-safety. (cherry picked from commit c51c7725944d60738e2bac3e11f6aea74812905c) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1b1c849b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1b1c849b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1b1c849b Branch: refs/heads/branch-2.0 Commit: 1b1c849bfc3802c02bbf4585adba85907c82ff3b Parents: 8b33aa0 Author: Josh Rosen Authored: Wed Nov 30 14:47:41 2016 -0500 Committer: Andrew Or Committed: Wed Nov 30 14:48:03 2016 -0500 -- core/src/main/scala/org/apache/spark/SparkStatusTracker.scala| 2 +- .../scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala | 2 +- .../org/apache/spark/scheduler/TaskSchedulerImplSuite.scala | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1b1c849b/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala b/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala index 52c4656..22a553e 100644 --- a/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala +++ b/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala @@ -112,7 +112,7 @@ class SparkStatusTracker private[spark] (sc: SparkContext) { */ def getExecutorInfos: Array[SparkExecutorInfo] = { val executorIdToRunningTasks: Map[String, Int] = - sc.taskScheduler.asInstanceOf[TaskSchedulerImpl].runningTasksByExecutors() + sc.taskScheduler.asInstanceOf[TaskSchedulerImpl].runningTasksByExecutors sc.getExecutorStorageStatus.map { status => val bmId = status.blockManagerId http://git-wip-us.apache.org/repos/asf/spark/blob/1b1c849b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index b2ef41e..feab4be 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -91,7 +91,7 @@ private[spark] class TaskSchedulerImpl( // IDs of the tasks running on each executor private val executorIdToRunningTaskIds = new HashMap[String, HashSet[Long]] - def runningTasksByExecutors(): Map[String, Int] = { + def runningTasksByExecutors: Map[String, Int] = synchronized { executorIdToRunningTaskIds.toMap.mapValues(_.size) } http://git-wip-us.apache.org/repos/asf/spark/blob/1b1c849b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala index 19b6fec..46c6a93 100644 --- 
a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala
@@ -304,7 +304,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with L
     // Check that state associated with the lost task attempt is cleaned up:
     assert(taskScheduler.taskIdToExecutorId.isEmpty)
     assert(taskScheduler.taskIdToTaskSetManager.isEmpty)
-    assert(taskScheduler.runningTasksByExecutors().get("executor0").isEmpty)
+    assert(taskScheduler.runningTasksByExecutors.get("executor0").isEmpty)
   }

   test("if a task finishes with TaskState.LOST its executor is marked as dead") {
@@ -335,7 +335,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with L
     // Check that state associated with the lost task attempt is cleaned up:
     assert(taskScheduler.taskIdToExecutorId.isEmpty)
     assert(taskScheduler.taskIdToTaskSetManager.isEmpty)
-    assert(taskScheduler.runningTasksByExecutors().get("executor0").isEmpty)
+    assert(taskScheduler.runningTasksByExecutors.get("executor0").isEmpty)
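
The change above boils down to two things: the getter drops its empty parentheses (it is a pure accessor), and it takes the scheduler's lock before snapshotting the mutable map. A minimal, self-contained sketch of that pattern — the class and method names below are illustrative stand-ins, not Spark's actual code:

```scala
import scala.collection.mutable

// Illustrative stand-in for TaskSchedulerImpl's bookkeeping; not the real class.
class RunningTaskTracker {
  // Mutable state, always read and written while holding `this` as the lock.
  private val executorIdToRunningTaskIds = new mutable.HashMap[String, mutable.HashSet[Long]]

  def taskStarted(executorId: String, taskId: Long): Unit = synchronized {
    executorIdToRunningTaskIds.getOrElseUpdate(executorId, new mutable.HashSet[Long]) += taskId
  }

  // Parameterless getter (no "()") that takes the same lock and returns an
  // immutable snapshot, so callers never observe the map mid-update.
  def runningTasksByExecutors: Map[String, Int] = synchronized {
    executorIdToRunningTaskIds.map { case (execId, taskIds) => execId -> taskIds.size }.toMap
  }
}
```

Because the returned snapshot is immutable, callers (such as `SparkStatusTracker.getExecutorInfos` in the diff above) can read it without any additional locking.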
spark git commit: [SPARK-18640] Add synchronization to TaskScheduler.runningTasksByExecutors
Repository: spark Updated Branches: refs/heads/branch-2.1 eae85da38 -> 7c0e2962d [SPARK-18640] Add synchronization to TaskScheduler.runningTasksByExecutors ## What changes were proposed in this pull request? The method `TaskSchedulerImpl.runningTasksByExecutors()` accesses the mutable `executorIdToRunningTaskIds` map without proper synchronization. In addition, as markhamstra pointed out in #15986, the signature's use of parentheses is a little odd given that this is a pure getter method. This patch fixes both issues. ## How was this patch tested? Covered by existing tests. Author: Josh Rosen Closes #16073 from JoshRosen/runningTasksByExecutors-thread-safety. (cherry picked from commit c51c7725944d60738e2bac3e11f6aea74812905c) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7c0e2962 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7c0e2962 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7c0e2962 Branch: refs/heads/branch-2.1 Commit: 7c0e2962d5e0fb80e4472d29dd467477f1cbcf8a Parents: eae85da Author: Josh Rosen Authored: Wed Nov 30 14:47:41 2016 -0500 Committer: Andrew Or Committed: Wed Nov 30 14:47:50 2016 -0500 -- core/src/main/scala/org/apache/spark/SparkStatusTracker.scala| 2 +- .../scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala | 2 +- .../org/apache/spark/scheduler/TaskSchedulerImplSuite.scala | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7c0e2962/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala b/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala index 52c4656..22a553e 100644 --- a/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala +++ b/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala @@ -112,7 +112,7 @@ class SparkStatusTracker private[spark] (sc: SparkContext) { */ def getExecutorInfos: Array[SparkExecutorInfo] = { val executorIdToRunningTasks: Map[String, Int] = - sc.taskScheduler.asInstanceOf[TaskSchedulerImpl].runningTasksByExecutors() + sc.taskScheduler.asInstanceOf[TaskSchedulerImpl].runningTasksByExecutors sc.getExecutorStorageStatus.map { status => val bmId = status.blockManagerId http://git-wip-us.apache.org/repos/asf/spark/blob/7c0e2962/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 67446da..b03cfe4 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -96,7 +96,7 @@ private[spark] class TaskSchedulerImpl( // IDs of the tasks running on each executor private val executorIdToRunningTaskIds = new HashMap[String, HashSet[Long]] - def runningTasksByExecutors(): Map[String, Int] = { + def runningTasksByExecutors: Map[String, Int] = synchronized { executorIdToRunningTaskIds.toMap.mapValues(_.size) } http://git-wip-us.apache.org/repos/asf/spark/blob/7c0e2962/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala index 48ec04b..e736c6c 100644 --- 
a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala @@ -442,7 +442,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B // Check that state associated with the lost task attempt is cleaned up: assert(taskScheduler.taskIdToExecutorId.isEmpty) assert(taskScheduler.taskIdToTaskSetManager.isEmpty) -assert(taskScheduler.runningTasksByExecutors().get("executor0").isEmpty) +assert(taskScheduler.runningTasksByExecutors.get("executor0").isEmpty) } test("if a task finishes with TaskState.LOST its executor is marked as dead") { @@ -473,7 +473,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B // Check that state associated with the lost task attempt is cleaned up: assert(taskScheduler.taskIdToExecutorId.isEmpty) assert(taskScheduler.taskIdToTaskSetManager.isEmpty) -assert(taskScheduler.runningTasksByExecutors().get("
spark git commit: [SPARK-18640] Add synchronization to TaskScheduler.runningTasksByExecutors
Repository: spark Updated Branches: refs/heads/master bc95ea0be -> c51c77259 [SPARK-18640] Add synchronization to TaskScheduler.runningTasksByExecutors ## What changes were proposed in this pull request? The method `TaskSchedulerImpl.runningTasksByExecutors()` accesses the mutable `executorIdToRunningTaskIds` map without proper synchronization. In addition, as markhamstra pointed out in #15986, the signature's use of parentheses is a little odd given that this is a pure getter method. This patch fixes both issues. ## How was this patch tested? Covered by existing tests. Author: Josh Rosen Closes #16073 from JoshRosen/runningTasksByExecutors-thread-safety. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c51c7725 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c51c7725 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c51c7725 Branch: refs/heads/master Commit: c51c7725944d60738e2bac3e11f6aea74812905c Parents: bc95ea0 Author: Josh Rosen Authored: Wed Nov 30 14:47:41 2016 -0500 Committer: Andrew Or Committed: Wed Nov 30 14:47:41 2016 -0500 -- core/src/main/scala/org/apache/spark/SparkStatusTracker.scala| 2 +- .../scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala | 2 +- .../org/apache/spark/scheduler/TaskSchedulerImplSuite.scala | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c51c7725/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala b/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala index 52c4656..22a553e 100644 --- a/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala +++ b/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala @@ -112,7 +112,7 @@ class SparkStatusTracker private[spark] (sc: SparkContext) { */ def getExecutorInfos: Array[SparkExecutorInfo] = { val executorIdToRunningTasks: Map[String, Int] = - sc.taskScheduler.asInstanceOf[TaskSchedulerImpl].runningTasksByExecutors() + sc.taskScheduler.asInstanceOf[TaskSchedulerImpl].runningTasksByExecutors sc.getExecutorStorageStatus.map { status => val bmId = status.blockManagerId http://git-wip-us.apache.org/repos/asf/spark/blob/c51c7725/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 67446da..b03cfe4 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -96,7 +96,7 @@ private[spark] class TaskSchedulerImpl( // IDs of the tasks running on each executor private val executorIdToRunningTaskIds = new HashMap[String, HashSet[Long]] - def runningTasksByExecutors(): Map[String, Int] = { + def runningTasksByExecutors: Map[String, Int] = synchronized { executorIdToRunningTaskIds.toMap.mapValues(_.size) } http://git-wip-us.apache.org/repos/asf/spark/blob/c51c7725/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala index 59bea27..a0b6268 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala +++ 
b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala @@ -678,7 +678,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B // Check that state associated with the lost task attempt is cleaned up: assert(taskScheduler.taskIdToExecutorId.isEmpty) assert(taskScheduler.taskIdToTaskSetManager.isEmpty) -assert(taskScheduler.runningTasksByExecutors().get("executor0").isEmpty) +assert(taskScheduler.runningTasksByExecutors.get("executor0").isEmpty) } test("if a task finishes with TaskState.LOST its executor is marked as dead") { @@ -709,7 +709,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B // Check that state associated with the lost task attempt is cleaned up: assert(taskScheduler.taskIdToExecutorId.isEmpty) assert(taskScheduler.taskIdToTaskSetManager.isEmpty) -assert(taskScheduler.runningTasksByExecutors().get("executor0").isEmpty) +assert(taskScheduler.runningTasksByExecutors.get("executor0").isEmpty)
spark git commit: [SPARK][EXAMPLE] Added missing semicolon in quick-start-guide example
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 3de93fb48 -> eae85da38

[SPARK][EXAMPLE] Added missing semicolon in quick-start-guide example

## What changes were proposed in this pull request?

Added the missing semicolon in the quick-start guide Java example code, which did not compile before.

## How was this patch tested?

Locally, by building and generating the docs site. You can see that the last line contains ";" in the snapshot below.

![image](https://cloud.githubusercontent.com/assets/10628224/20751760/9a7e0402-b723-11e6-9aa8-3b6ca2d92ebf.png)

Author: manishAtGit

Closes #16081 from manishatGit/fixed-quick-start-guide.

(cherry picked from commit bc95ea0be5b880673d452f5eec47fbfd403d94ce)
Signed-off-by: Andrew Or

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eae85da3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eae85da3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eae85da3

Branch: refs/heads/branch-2.1
Commit: eae85da388e27c7eda8be3933f673ad7f1a3c6af
Parents: 3de93fb
Author: manishAtGit
Authored: Wed Nov 30 14:46:50 2016 -0500
Committer: Andrew Or
Committed: Wed Nov 30 14:47:06 2016 -0500

--
 docs/quick-start.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/eae85da3/docs/quick-start.md
--
diff --git a/docs/quick-start.md b/docs/quick-start.md
index cb9a378..0836c60 100644
--- a/docs/quick-start.md
+++ b/docs/quick-start.md
@@ -330,7 +330,7 @@ public class SimpleApp {
     System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);
-    sc.stop()
+    sc.stop();
   }
 }
 {% endhighlight %}
spark git commit: [SPARK][EXAMPLE] Added missing semicolon in quick-start-guide example
Repository: spark
Updated Branches:
  refs/heads/master 3f03c90a8 -> bc95ea0be

[SPARK][EXAMPLE] Added missing semicolon in quick-start-guide example

## What changes were proposed in this pull request?

Added the missing semicolon in the quick-start guide Java example code, which did not compile before.

## How was this patch tested?

Locally, by building and generating the docs site. You can see that the last line contains ";" in the snapshot below.

![image](https://cloud.githubusercontent.com/assets/10628224/20751760/9a7e0402-b723-11e6-9aa8-3b6ca2d92ebf.png)

Author: manishAtGit

Closes #16081 from manishatGit/fixed-quick-start-guide.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bc95ea0b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bc95ea0b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bc95ea0b

Branch: refs/heads/master
Commit: bc95ea0be5b880673d452f5eec47fbfd403d94ce
Parents: 3f03c90
Author: manishAtGit
Authored: Wed Nov 30 14:46:50 2016 -0500
Committer: Andrew Or
Committed: Wed Nov 30 14:46:50 2016 -0500

--
 docs/quick-start.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/bc95ea0b/docs/quick-start.md
--
diff --git a/docs/quick-start.md b/docs/quick-start.md
index cb9a378..0836c60 100644
--- a/docs/quick-start.md
+++ b/docs/quick-start.md
@@ -330,7 +330,7 @@ public class SimpleApp {
     System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);
-    sc.stop()
+    sc.stop();
   }
 }
 {% endhighlight %}
spark git commit: [SPARK-17680][SQL][TEST] Added test cases for InMemoryRelation
Repository: spark Updated Branches: refs/heads/branch-2.1 81e3f9711 -> b386943b2 [SPARK-17680][SQL][TEST] Added test cases for InMemoryRelation ## What changes were proposed in this pull request? This pull request adds test cases for the following cases: - keep all data types with null or without null - access `CachedBatch` disabling whole stage codegen - access only some columns in `CachedBatch` This PR is a part of https://github.com/apache/spark/pull/15219. Here are motivations to add these tests. When https://github.com/apache/spark/pull/15219 is enabled, the first two cases are handled by specialized (generated) code. The third one is a pitfall. In general, even for now, it would be helpful to increase test coverage. ## How was this patch tested? added test suites itself Author: Kazuaki Ishizaki Closes #15462 from kiszk/columnartestsuites. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b386943b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b386943b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b386943b Branch: refs/heads/branch-2.1 Commit: b386943b2fe6af5237270bfa520295c1711bb341 Parents: 81e3f97 Author: Kazuaki Ishizaki Authored: Mon Nov 28 14:06:37 2016 -0500 Committer: Andrew Or Committed: Mon Nov 28 14:07:34 2016 -0500 -- .../columnar/InMemoryColumnarQuerySuite.scala | 148 ++- 1 file changed, 146 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b386943b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index b272c8e..afeb478 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -20,18 +20,96 @@ package org.apache.spark.sql.execution.columnar import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} -import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.{DataFrame, QueryTest, Row} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.SQLTestData._ import org.apache.spark.sql.types._ -import org.apache.spark.storage.StorageLevel.MEMORY_ONLY +import org.apache.spark.storage.StorageLevel._ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { import testImplicits._ setupTestData() + private def cachePrimitiveTest(data: DataFrame, dataType: String) { +data.createOrReplaceTempView(s"testData$dataType") +val storageLevel = MEMORY_ONLY +val plan = spark.sessionState.executePlan(data.logicalPlan).sparkPlan +val inMemoryRelation = InMemoryRelation(useCompression = true, 5, storageLevel, plan, None) + +assert(inMemoryRelation.cachedColumnBuffers.getStorageLevel == storageLevel) +inMemoryRelation.cachedColumnBuffers.collect().head match { + case _: CachedBatch => + case other => fail(s"Unexpected cached batch type: ${other.getClass.getName}") +} +checkAnswer(inMemoryRelation, data.collect().toSeq) + } + + private def testPrimitiveType(nullability: Boolean): Unit = { +val dataTypes = Seq(BooleanType, ByteType, ShortType, IntegerType, LongType, + FloatType, DoubleType, DateType, TimestampType, DecimalType(25, 5), DecimalType(6, 
5)) +val schema = StructType(dataTypes.zipWithIndex.map { case (dataType, index) => + StructField(s"col$index", dataType, nullability) +}) +val rdd = spark.sparkContext.parallelize((1 to 10).map(i => Row( + if (nullability && i % 3 == 0) null else if (i % 2 == 0) true else false, + if (nullability && i % 3 == 0) null else i.toByte, + if (nullability && i % 3 == 0) null else i.toShort, + if (nullability && i % 3 == 0) null else i.toInt, + if (nullability && i % 3 == 0) null else i.toLong, + if (nullability && i % 3 == 0) null else (i + 0.25).toFloat, + if (nullability && i % 3 == 0) null else (i + 0.75).toDouble, + if (nullability && i % 3 == 0) null else new Date(i), + if (nullability && i % 3 == 0) null else new Timestamp(i * 100L), + if (nullability && i % 3 == 0) null else BigDecimal(Long.MaxValue.toString + ".12345"), + if (nullability && i % 3 == 0) null + else new java.math.BigDecimal(s"${i % 9 + 1}" + ".23456") +))) +cachePrimitiveTest(spark.createDataFrame(rdd, schema), "primitivesDateTimeStamp") +
spark git commit: [SPARK-17680][SQL][TEST] Added test cases for InMemoryRelation
Repository: spark Updated Branches: refs/heads/master 0f5f52a3d -> ad67993b7 [SPARK-17680][SQL][TEST] Added test cases for InMemoryRelation ## What changes were proposed in this pull request? This pull request adds test cases for the following cases: - keep all data types with null or without null - access `CachedBatch` disabling whole stage codegen - access only some columns in `CachedBatch` This PR is a part of https://github.com/apache/spark/pull/15219. Here are motivations to add these tests. When https://github.com/apache/spark/pull/15219 is enabled, the first two cases are handled by specialized (generated) code. The third one is a pitfall. In general, even for now, it would be helpful to increase test coverage. ## How was this patch tested? added test suites itself Author: Kazuaki Ishizaki Closes #15462 from kiszk/columnartestsuites. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ad67993b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ad67993b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ad67993b Branch: refs/heads/master Commit: ad67993b73490a24e7012df23810dab1712e7689 Parents: 0f5f52a Author: Kazuaki Ishizaki Authored: Mon Nov 28 14:06:37 2016 -0500 Committer: Andrew Or Committed: Mon Nov 28 14:06:37 2016 -0500 -- .../columnar/InMemoryColumnarQuerySuite.scala | 148 ++- 1 file changed, 146 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ad67993b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index b272c8e..afeb478 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -20,18 +20,96 @@ package org.apache.spark.sql.execution.columnar import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} -import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.{DataFrame, QueryTest, Row} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.SQLTestData._ import org.apache.spark.sql.types._ -import org.apache.spark.storage.StorageLevel.MEMORY_ONLY +import org.apache.spark.storage.StorageLevel._ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { import testImplicits._ setupTestData() + private def cachePrimitiveTest(data: DataFrame, dataType: String) { +data.createOrReplaceTempView(s"testData$dataType") +val storageLevel = MEMORY_ONLY +val plan = spark.sessionState.executePlan(data.logicalPlan).sparkPlan +val inMemoryRelation = InMemoryRelation(useCompression = true, 5, storageLevel, plan, None) + +assert(inMemoryRelation.cachedColumnBuffers.getStorageLevel == storageLevel) +inMemoryRelation.cachedColumnBuffers.collect().head match { + case _: CachedBatch => + case other => fail(s"Unexpected cached batch type: ${other.getClass.getName}") +} +checkAnswer(inMemoryRelation, data.collect().toSeq) + } + + private def testPrimitiveType(nullability: Boolean): Unit = { +val dataTypes = Seq(BooleanType, ByteType, ShortType, IntegerType, LongType, + FloatType, DoubleType, DateType, TimestampType, DecimalType(25, 5), DecimalType(6, 5)) +val 
schema = StructType(dataTypes.zipWithIndex.map { case (dataType, index) => + StructField(s"col$index", dataType, nullability) +}) +val rdd = spark.sparkContext.parallelize((1 to 10).map(i => Row( + if (nullability && i % 3 == 0) null else if (i % 2 == 0) true else false, + if (nullability && i % 3 == 0) null else i.toByte, + if (nullability && i % 3 == 0) null else i.toShort, + if (nullability && i % 3 == 0) null else i.toInt, + if (nullability && i % 3 == 0) null else i.toLong, + if (nullability && i % 3 == 0) null else (i + 0.25).toFloat, + if (nullability && i % 3 == 0) null else (i + 0.75).toDouble, + if (nullability && i % 3 == 0) null else new Date(i), + if (nullability && i % 3 == 0) null else new Timestamp(i * 100L), + if (nullability && i % 3 == 0) null else BigDecimal(Long.MaxValue.toString + ".12345"), + if (nullability && i % 3 == 0) null + else new java.math.BigDecimal(s"${i % 9 + 1}" + ".23456") +))) +cachePrimitiveTest(spark.createDataFrame(rdd, schema), "primitivesDateTimeStamp") + } + +
spark git commit: [SPARK-18050][SQL] do not create default database if it already exists
Repository: spark
Updated Branches:
  refs/heads/master 70ad07a9d -> f129ebcd3

[SPARK-18050][SQL] do not create default database if it already exists

## What changes were proposed in this pull request?

When we try to create the default database, we ask Hive to do nothing if it already exists. However, Hive logs an error message instead of doing nothing, and the message is quite annoying and confusing. In this PR, we only create the default database if it doesn't already exist.

## How was this patch tested?

N/A

Author: Wenchen Fan

Closes #15993 from cloud-fan/default-db.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f129ebcd
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f129ebcd
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f129ebcd

Branch: refs/heads/master
Commit: f129ebcd302168b628f47705f4a7d6b7e7b057b0
Parents: 70ad07a
Author: Wenchen Fan
Authored: Wed Nov 23 12:54:18 2016 -0500
Committer: Andrew Or
Committed: Wed Nov 23 12:54:18 2016 -0500

--
 .../scala/org/apache/spark/sql/internal/SharedState.scala | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/f129ebcd/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala
index 6232c18..8de95fe 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala
@@ -92,8 +92,12 @@ private[sql] class SharedState(val sparkContext: SparkContext) extends Logging {
   {
     val defaultDbDefinition = CatalogDatabase(
       SessionCatalog.DEFAULT_DATABASE, "default database", warehousePath, Map())
-    // Initialize default database if it doesn't already exist
-    externalCatalog.createDatabase(defaultDbDefinition, ignoreIfExists = true)
+    // Initialize default database if it doesn't exist
+    if (!externalCatalog.databaseExists(SessionCatalog.DEFAULT_DATABASE)) {
+      // There may be another Spark application creating default database at the same time, here we
+      // set `ignoreIfExists = true` to avoid `DatabaseAlreadyExists` exception.
+      externalCatalog.createDatabase(defaultDbDefinition, ignoreIfExists = true)
+    }
   }

   /**
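
Applications that manage their own databases can use the same check-then-create idiom through the public Catalog and SQL APIs. A rough sketch — the session setup and the database name are placeholders, not part of this commit:

```scala
import org.apache.spark.sql.SparkSession

object EnsureDatabase {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ensure-db-example").getOrCreate()

    // Check first so nothing is attempted (or logged) when the database already exists.
    val exists = spark.catalog.listDatabases().collect().exists(_.name == "scratch")
    if (!exists) {
      // IF NOT EXISTS keeps this safe against the benign race where another
      // application creates the database between the check and the create.
      spark.sql("CREATE DATABASE IF NOT EXISTS scratch")
    }
    spark.stop()
  }
}
```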
spark git commit: [SPARK-18050][SQL] do not create default database if it already exists
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 599dac159 -> 835f03f34

[SPARK-18050][SQL] do not create default database if it already exists

## What changes were proposed in this pull request?

When we try to create the default database, we ask Hive to do nothing if it already exists. However, Hive logs an error message instead of doing nothing, and the message is quite annoying and confusing. In this PR, we only create the default database if it doesn't already exist.

## How was this patch tested?

N/A

Author: Wenchen Fan

Closes #15993 from cloud-fan/default-db.

(cherry picked from commit f129ebcd302168b628f47705f4a7d6b7e7b057b0)
Signed-off-by: Andrew Or

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/835f03f3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/835f03f3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/835f03f3

Branch: refs/heads/branch-2.1
Commit: 835f03f344f2dea2134409d09e06b34feaae09f9
Parents: 599dac1
Author: Wenchen Fan
Authored: Wed Nov 23 12:54:18 2016 -0500
Committer: Andrew Or
Committed: Wed Nov 23 12:54:32 2016 -0500

--
 .../scala/org/apache/spark/sql/internal/SharedState.scala | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/835f03f3/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala
index 6232c18..8de95fe 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala
@@ -92,8 +92,12 @@ private[sql] class SharedState(val sparkContext: SparkContext) extends Logging {
   {
     val defaultDbDefinition = CatalogDatabase(
       SessionCatalog.DEFAULT_DATABASE, "default database", warehousePath, Map())
-    // Initialize default database if it doesn't already exist
-    externalCatalog.createDatabase(defaultDbDefinition, ignoreIfExists = true)
+    // Initialize default database if it doesn't exist
+    if (!externalCatalog.databaseExists(SessionCatalog.DEFAULT_DATABASE)) {
+      // There may be another Spark application creating default database at the same time, here we
+      // set `ignoreIfExists = true` to avoid `DatabaseAlreadyExists` exception.
+      externalCatalog.createDatabase(defaultDbDefinition, ignoreIfExists = true)
+    }
   }

   /**
spark git commit: [SPARK-18507][SQL] HiveExternalCatalog.listPartitions should only call getTable once
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 0e624e990 -> fa360134d

[SPARK-18507][SQL] HiveExternalCatalog.listPartitions should only call getTable once

## What changes were proposed in this pull request?

HiveExternalCatalog.listPartitions should only call `getTable` once, instead of calling it once per partition.

## How was this patch tested?

N/A

Author: Wenchen Fan

Closes #15978 from cloud-fan/perf.

(cherry picked from commit 702cd403fc8e5ce8281fe8828197ead46bdb8832)
Signed-off-by: Andrew Or

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fa360134
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fa360134
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fa360134

Branch: refs/heads/branch-2.1
Commit: fa360134d06e5bfb423f0bd769edb47dbda1d9af
Parents: 0e624e9
Author: Wenchen Fan
Authored: Tue Nov 22 15:25:22 2016 -0500
Committer: Andrew Or
Committed: Tue Nov 22 15:25:33 2016 -0500

--
 .../scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/fa360134/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
--
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index 5dbb402..ff0923f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -907,8 +907,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       db: String,
       table: String,
       partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition] = withClient {
+    val actualPartColNames = getTable(db, table).partitionColumnNames
     client.getPartitions(db, table, partialSpec.map(lowerCasePartitionSpec)).map { part =>
-      part.copy(spec = restorePartitionSpec(part.spec, getTable(db, table).partitionColumnNames))
+      part.copy(spec = restorePartitionSpec(part.spec, actualPartColNames))
     }
   }
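
The fix is the usual hoisting of a loop-invariant lookup out of a per-element transformation: `getTable` involves a metastore round trip and its result does not change between partitions. A generic sketch of the before/after shape, with illustrative names rather than Spark's internals:

```scala
// Pretend metastore call; one round trip per invocation.
def fetchPartitionColumns(table: String): Seq[String] = Seq("year", "month")

val partitionSpecs: Seq[Map[String, String]] =
  Seq(Map("year" -> "2016", "month" -> "11"), Map("year" -> "2016", "month" -> "12"))

// Before: the lookup runs once per partition.
val slow = partitionSpecs.map { spec =>
  spec.filter { case (key, _) => fetchPartitionColumns("t").contains(key) }
}

// After: a single lookup, reused for every partition.
val partCols = fetchPartitionColumns("t")
val fast = partitionSpecs.map { spec =>
  spec.filter { case (key, _) => partCols.contains(key) }
}
```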
spark git commit: [SPARK-18507][SQL] HiveExternalCatalog.listPartitions should only call getTable once
Repository: spark
Updated Branches:
  refs/heads/master 45ea46b7b -> 702cd403f

[SPARK-18507][SQL] HiveExternalCatalog.listPartitions should only call getTable once

## What changes were proposed in this pull request?

HiveExternalCatalog.listPartitions should only call `getTable` once, instead of calling it once per partition.

## How was this patch tested?

N/A

Author: Wenchen Fan

Closes #15978 from cloud-fan/perf.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/702cd403
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/702cd403
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/702cd403

Branch: refs/heads/master
Commit: 702cd403fc8e5ce8281fe8828197ead46bdb8832
Parents: 45ea46b
Author: Wenchen Fan
Authored: Tue Nov 22 15:25:22 2016 -0500
Committer: Andrew Or
Committed: Tue Nov 22 15:25:22 2016 -0500

--
 .../scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/702cd403/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
--
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index 5dbb402..ff0923f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -907,8 +907,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       db: String,
       table: String,
       partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition] = withClient {
+    val actualPartColNames = getTable(db, table).partitionColumnNames
     client.getPartitions(db, table, partialSpec.map(lowerCasePartitionSpec)).map { part =>
-      part.copy(spec = restorePartitionSpec(part.spec, getTable(db, table).partitionColumnNames))
+      part.copy(spec = restorePartitionSpec(part.spec, actualPartColNames))
     }
   }
spark git commit: [SPARK-18361][PYSPARK] Expose RDD localCheckpoint in PySpark
Repository: spark Updated Branches: refs/heads/branch-2.1 b0a73c9be -> 406f33987 [SPARK-18361][PYSPARK] Expose RDD localCheckpoint in PySpark ## What changes were proposed in this pull request? Expose RDD's localCheckpoint() and associated functions in PySpark. ## How was this patch tested? I added a UnitTest in python/pyspark/tests.py which passes. I certify that this is my original work, and I license it to the project under the project's open source license. Gabriel HUANG Developer at Cardabel (http://cardabel.com/) Author: Gabriel Huang Closes #15811 from gabrielhuang/pyspark-localcheckpoint. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/406f3398 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/406f3398 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/406f3398 Branch: refs/heads/branch-2.1 Commit: 406f33987ac078fb20d2f5e81b7e1f646ea53fed Parents: b0a73c9 Author: Gabriel Huang Authored: Mon Nov 21 16:08:34 2016 -0500 Committer: Andrew Or Committed: Mon Nov 21 16:16:59 2016 -0500 -- python/pyspark/rdd.py | 33 - python/pyspark/tests.py | 17 + 2 files changed, 49 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/406f3398/python/pyspark/rdd.py -- diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 641787e..f21a364 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -263,13 +263,44 @@ class RDD(object): def isCheckpointed(self): """ -Return whether this RDD has been checkpointed or not +Return whether this RDD is checkpointed and materialized, either reliably or locally. """ return self._jrdd.rdd().isCheckpointed() +def localCheckpoint(self): +""" +Mark this RDD for local checkpointing using Spark's existing caching layer. + +This method is for users who wish to truncate RDD lineages while skipping the expensive +step of replicating the materialized data in a reliable distributed file system. This is +useful for RDDs with long lineages that need to be truncated periodically (e.g. GraphX). + +Local checkpointing sacrifices fault-tolerance for performance. In particular, checkpointed +data is written to ephemeral local storage in the executors instead of to a reliable, +fault-tolerant storage. The effect is that if an executor fails during the computation, +the checkpointed data may no longer be accessible, causing an irrecoverable job failure. + +This is NOT safe to use with dynamic allocation, which removes executors along +with their cached blocks. If you must use both features, you are advised to set +L{spark.dynamicAllocation.cachedExecutorIdleTimeout} to a high value. + +The checkpoint directory set through L{SparkContext.setCheckpointDir()} is not used. +""" +self._jrdd.rdd().localCheckpoint() + +def isLocallyCheckpointed(self): +""" +Return whether this RDD is marked for local checkpointing. + +Exposed for testing. +""" +return self._jrdd.rdd().isLocallyCheckpointed() + def getCheckpointFile(self): """ Gets the name of the file to which this RDD was checkpointed + +Not defined if RDD is checkpointed locally. 
""" checkpointFile = self._jrdd.rdd().getCheckpointFile() if checkpointFile.isDefined(): http://git-wip-us.apache.org/repos/asf/spark/blob/406f3398/python/pyspark/tests.py -- diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 3e0bd16..ab4bef8 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -390,6 +390,23 @@ class CheckpointTests(ReusedPySparkTestCase): self.assertEqual([1, 2, 3, 4], recovered.collect()) +class LocalCheckpointTests(ReusedPySparkTestCase): + +def test_basic_localcheckpointing(self): +parCollection = self.sc.parallelize([1, 2, 3, 4]) +flatMappedRDD = parCollection.flatMap(lambda x: range(1, x + 1)) + +self.assertFalse(flatMappedRDD.isCheckpointed()) +self.assertFalse(flatMappedRDD.isLocallyCheckpointed()) + +flatMappedRDD.localCheckpoint() +result = flatMappedRDD.collect() +time.sleep(1) # 1 second +self.assertTrue(flatMappedRDD.isCheckpointed()) +self.assertTrue(flatMappedRDD.isLocallyCheckpointed()) +self.assertEqual(flatMappedRDD.collect(), result) + + class AddFileTests(PySparkTestCase): def test_add_py_file(self):
spark git commit: [SPARK-18517][SQL] DROP TABLE IF EXISTS should not warn for non-existing tables
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 251a99276 -> b0a73c9be

[SPARK-18517][SQL] DROP TABLE IF EXISTS should not warn for non-existing tables

## What changes were proposed in this pull request?

Currently, `DROP TABLE IF EXISTS` shows a warning for non-existing tables. However, by the definition of the command it should be quiet in this case.

**BEFORE**
```scala
scala> sql("DROP TABLE IF EXISTS nonexist")
16/11/20 20:48:26 WARN DropTableCommand: org.apache.spark.sql.catalyst.analysis.NoSuchTableException: Table or view 'nonexist' not found in database 'default';
```

**AFTER**
```scala
scala> sql("DROP TABLE IF EXISTS nonexist")
res0: org.apache.spark.sql.DataFrame = []
```

## How was this patch tested?

Manual, because this is about warning messages rather than exceptions.

Author: Dongjoon Hyun

Closes #15953 from dongjoon-hyun/SPARK-18517.

(cherry picked from commit ddd02f50bb7458410d65427321efc75da5e65224)
Signed-off-by: Andrew Or

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b0a73c9b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b0a73c9b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b0a73c9b

Branch: refs/heads/branch-2.1
Commit: b0a73c9be3b691f95d2f6ace3d6304db7f69705f
Parents: 251a992
Author: Dongjoon Hyun
Authored: Mon Nov 21 16:14:59 2016 -0500
Committer: Andrew Or
Committed: Mon Nov 21 16:15:41 2016 -0500

--
 .../main/scala/org/apache/spark/sql/execution/command/ddl.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/b0a73c9b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index 570a996..0f126d0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -28,7 +28,7 @@ import org.apache.hadoop.mapred.{FileInputFormat, JobConf}

 import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
 import org.apache.spark.sql.catalyst.TableIdentifier
-import org.apache.spark.sql.catalyst.analysis.Resolver
+import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, Resolver}
 import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
@@ -202,6 +202,7 @@ case class DropTableCommand(
         sparkSession.sharedState.cacheManager.uncacheQuery(
           sparkSession.table(tableName.quotedString))
       } catch {
+        case _: NoSuchTableException if ifExists =>
         case NonFatal(e) => log.warn(e.toString, e)
       }
       catalog.refreshTable(tableName)
spark git commit: [SPARK-18517][SQL] DROP TABLE IF EXISTS should not warn for non-existing tables
Repository: spark
Updated Branches:
  refs/heads/master 70176871a -> ddd02f50b

[SPARK-18517][SQL] DROP TABLE IF EXISTS should not warn for non-existing tables

## What changes were proposed in this pull request?

Currently, `DROP TABLE IF EXISTS` shows a warning for non-existing tables. However, by the definition of the command it should be quiet in this case.

**BEFORE**
```scala
scala> sql("DROP TABLE IF EXISTS nonexist")
16/11/20 20:48:26 WARN DropTableCommand: org.apache.spark.sql.catalyst.analysis.NoSuchTableException: Table or view 'nonexist' not found in database 'default';
```

**AFTER**
```scala
scala> sql("DROP TABLE IF EXISTS nonexist")
res0: org.apache.spark.sql.DataFrame = []
```

## How was this patch tested?

Manual, because this is about warning messages rather than exceptions.

Author: Dongjoon Hyun

Closes #15953 from dongjoon-hyun/SPARK-18517.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ddd02f50
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ddd02f50
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ddd02f50

Branch: refs/heads/master
Commit: ddd02f50bb7458410d65427321efc75da5e65224
Parents: 7017687
Author: Dongjoon Hyun
Authored: Mon Nov 21 16:14:59 2016 -0500
Committer: Andrew Or
Committed: Mon Nov 21 16:14:59 2016 -0500

--
 .../main/scala/org/apache/spark/sql/execution/command/ddl.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/ddd02f50/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index 588aa05..d80b000 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -28,7 +28,7 @@ import org.apache.hadoop.mapred.{FileInputFormat, JobConf}

 import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
 import org.apache.spark.sql.catalyst.TableIdentifier
-import org.apache.spark.sql.catalyst.analysis.Resolver
+import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, Resolver}
 import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BinaryComparison}
@@ -203,6 +203,7 @@ case class DropTableCommand(
         sparkSession.sharedState.cacheManager.uncacheQuery(
           sparkSession.table(tableName.quotedString))
       } catch {
+        case _: NoSuchTableException if ifExists =>
         case NonFatal(e) => log.warn(e.toString, e)
       }
       catalog.refreshTable(tableName)
spark git commit: [SPARK-18361][PYSPARK] Expose RDD localCheckpoint in PySpark
Repository: spark Updated Branches: refs/heads/master 07beb5d21 -> 70176871a [SPARK-18361][PYSPARK] Expose RDD localCheckpoint in PySpark ## What changes were proposed in this pull request? Expose RDD's localCheckpoint() and associated functions in PySpark. ## How was this patch tested? I added a UnitTest in python/pyspark/tests.py which passes. I certify that this is my original work, and I license it to the project under the project's open source license. Gabriel HUANG Developer at Cardabel (http://cardabel.com/) Author: Gabriel Huang Closes #15811 from gabrielhuang/pyspark-localcheckpoint. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/70176871 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/70176871 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/70176871 Branch: refs/heads/master Commit: 70176871ae10509f1a727a96e96b3da7762605b1 Parents: 07beb5d Author: Gabriel Huang Authored: Mon Nov 21 16:08:34 2016 -0500 Committer: Andrew Or Committed: Mon Nov 21 16:08:34 2016 -0500 -- python/pyspark/rdd.py | 33 - python/pyspark/tests.py | 17 + 2 files changed, 49 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/70176871/python/pyspark/rdd.py -- diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 641787e..f21a364 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -263,13 +263,44 @@ class RDD(object): def isCheckpointed(self): """ -Return whether this RDD has been checkpointed or not +Return whether this RDD is checkpointed and materialized, either reliably or locally. """ return self._jrdd.rdd().isCheckpointed() +def localCheckpoint(self): +""" +Mark this RDD for local checkpointing using Spark's existing caching layer. + +This method is for users who wish to truncate RDD lineages while skipping the expensive +step of replicating the materialized data in a reliable distributed file system. This is +useful for RDDs with long lineages that need to be truncated periodically (e.g. GraphX). + +Local checkpointing sacrifices fault-tolerance for performance. In particular, checkpointed +data is written to ephemeral local storage in the executors instead of to a reliable, +fault-tolerant storage. The effect is that if an executor fails during the computation, +the checkpointed data may no longer be accessible, causing an irrecoverable job failure. + +This is NOT safe to use with dynamic allocation, which removes executors along +with their cached blocks. If you must use both features, you are advised to set +L{spark.dynamicAllocation.cachedExecutorIdleTimeout} to a high value. + +The checkpoint directory set through L{SparkContext.setCheckpointDir()} is not used. +""" +self._jrdd.rdd().localCheckpoint() + +def isLocallyCheckpointed(self): +""" +Return whether this RDD is marked for local checkpointing. + +Exposed for testing. +""" +return self._jrdd.rdd().isLocallyCheckpointed() + def getCheckpointFile(self): """ Gets the name of the file to which this RDD was checkpointed + +Not defined if RDD is checkpointed locally. 
""" checkpointFile = self._jrdd.rdd().getCheckpointFile() if checkpointFile.isDefined(): http://git-wip-us.apache.org/repos/asf/spark/blob/70176871/python/pyspark/tests.py -- diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 3e0bd16..ab4bef8 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -390,6 +390,23 @@ class CheckpointTests(ReusedPySparkTestCase): self.assertEqual([1, 2, 3, 4], recovered.collect()) +class LocalCheckpointTests(ReusedPySparkTestCase): + +def test_basic_localcheckpointing(self): +parCollection = self.sc.parallelize([1, 2, 3, 4]) +flatMappedRDD = parCollection.flatMap(lambda x: range(1, x + 1)) + +self.assertFalse(flatMappedRDD.isCheckpointed()) +self.assertFalse(flatMappedRDD.isLocallyCheckpointed()) + +flatMappedRDD.localCheckpoint() +result = flatMappedRDD.collect() +time.sleep(1) # 1 second +self.assertTrue(flatMappedRDD.isCheckpointed()) +self.assertTrue(flatMappedRDD.isLocallyCheckpointed()) +self.assertEqual(flatMappedRDD.collect(), result) + + class AddFileTests(PySparkTestCase): def test_add_py_file(self):
spark git commit: [SPARK-17686][CORE] Support printing out scala and java version with spark-submit --version command
Repository: spark
Updated Branches:
  refs/heads/master db8784fea -> 7bf8a4049

[SPARK-17686][CORE] Support printing out scala and java version with spark-submit --version command

## What changes were proposed in this pull request?

In our universal gateway service we need to pass different jars to Spark depending on the Scala version. Currently we can only find out which Scala version a Spark build depends on after launching an application, which makes it hard to support different Scala + Spark combinations and pick the right jars. This PR proposes printing the Scala version in the output of `spark-submit --version`, so that users can make the choice without having to launch an application.

## How was this patch tested?

Manually verified in a local environment.

Author: jerryshao

Closes #15456 from jerryshao/SPARK-17686.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7bf8a404
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7bf8a404
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7bf8a404

Branch: refs/heads/master
Commit: 7bf8a4049866b2ec7fdf0406b1ad0c3a12488645
Parents: db8784f
Author: jerryshao
Authored: Thu Oct 13 03:29:14 2016 -0400
Committer: Andrew Or
Committed: Thu Oct 13 03:29:14 2016 -0400

--
 core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/7bf8a404/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
--
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 8061165..5c05228 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -24,6 +24,7 @@ import java.security.PrivilegedExceptionAction

 import scala.annotation.tailrec
 import scala.collection.mutable.{ArrayBuffer, HashMap, Map}
+import scala.util.Properties

 import org.apache.commons.lang3.StringUtils
 import org.apache.hadoop.fs.Path
@@ -47,7 +48,6 @@ import org.apache.spark.deploy.rest._
 import org.apache.spark.launcher.SparkLauncher
 import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader, Utils}

-
 /**
  * Whether to submit, kill, or request the status of an application.
  * The latter two operations are currently supported only for standalone and Mesos cluster modes.
@@ -104,6 +104,8 @@ object SparkSubmit {
   /___/ .__/\_,_/_/ /_/\_\   version %s
       /_/
 """.format(SPARK_VERSION))
+    printStream.println("Using Scala %s, %s, %s".format(
+      Properties.versionString, Properties.javaVmName, Properties.javaVersion))
     printStream.println("Branch %s".format(SPARK_BRANCH))
     printStream.println("Compiled by user %s on %s".format(SPARK_BUILD_USER, SPARK_BUILD_DATE))
     printStream.println("Revision %s".format(SPARK_REVISION))
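
The added output line comes straight from `scala.util.Properties`. As a quick sanity check, the same line can be produced outside of spark-submit with a few lines of Scala (the wrapper object below is just for illustration, and the sample output values depend on the local JVM and Scala build):

```scala
import scala.util.Properties

object VersionInfo {
  def main(args: Array[String]): Unit = {
    // The same three fields the patch prints under the Spark version banner.
    println("Using Scala %s, %s, %s".format(
      Properties.versionString, Properties.javaVmName, Properties.javaVersion))
    // Example output:
    // Using Scala version 2.11.8, Java HotSpot(TM) 64-Bit Server VM, 1.8.0_112
  }
}
```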
spark git commit: [SPARK-17899][SQL] add a debug mode to keep raw table properties in HiveExternalCatalog
Repository: spark Updated Branches: refs/heads/master 6f2fa6c54 -> db8784fea [SPARK-17899][SQL] add a debug mode to keep raw table properties in HiveExternalCatalog ## What changes were proposed in this pull request? Currently `HiveExternalCatalog` will filter out the Spark SQL internal table properties, e.g. `spark.sql.sources.provider`, `spark.sql.sources.schema`, etc. This is reasonable for external users as they don't want to see these internal properties in `DESC TABLE`. However, as a Spark developer, sometimes we do wanna see the raw table properties. This PR adds a new internal SQL conf, `spark.sql.debug`, to enable debug mode and keep these raw table properties. This config can also be used in similar places where we wanna retain debug information in the future. ## How was this patch tested? new test in MetastoreDataSourcesSuite Author: Wenchen Fan Closes #15458 from cloud-fan/debug. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/db8784fe Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/db8784fe Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/db8784fe Branch: refs/heads/master Commit: db8784feaa605adcbd37af4bc8b7146479b631f8 Parents: 6f2fa6c Author: Wenchen Fan Authored: Thu Oct 13 03:26:29 2016 -0400 Committer: Andrew Or Committed: Thu Oct 13 03:26:29 2016 -0400 -- .../org/apache/spark/sql/internal/SQLConf.scala | 5 .../spark/sql/internal/SQLConfSuite.scala | 24 .../spark/sql/hive/HiveExternalCatalog.scala| 9 ++-- .../sql/hive/MetastoreDataSourcesSuite.scala| 17 +- 4 files changed, 42 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/db8784fe/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 9e7c1ec..192083e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -915,4 +915,9 @@ object StaticSQLConf { .internal() .intConf .createWithDefault(4000) + + val DEBUG_MODE = buildConf("spark.sql.debug") +.internal() +.booleanConf +.createWithDefault(false) } http://git-wip-us.apache.org/repos/asf/spark/blob/db8784fe/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala index f545de0..df640ff 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.internal import org.apache.hadoop.fs.Path +import org.apache.spark.SparkContext import org.apache.spark.sql._ import org.apache.spark.sql.execution.WholeStageCodegenExec import org.apache.spark.sql.internal.StaticSQLConf._ @@ -254,18 +255,21 @@ class SQLConfSuite extends QueryTest with SharedSQLContext { } } - test("global SQL conf comes from SparkConf") { -val newSession = SparkSession.builder() - .config(SCHEMA_STRING_LENGTH_THRESHOLD.key, "2000") - .getOrCreate() - -assert(newSession.conf.get(SCHEMA_STRING_LENGTH_THRESHOLD.key) == "2000") -checkAnswer( - newSession.sql(s"SET ${SCHEMA_STRING_LENGTH_THRESHOLD.key}"), - Row(SCHEMA_STRING_LENGTH_THRESHOLD.key, "2000")) + test("static SQL conf comes from SparkConf") { 
+val previousValue = sparkContext.conf.get(SCHEMA_STRING_LENGTH_THRESHOLD) +try { + sparkContext.conf.set(SCHEMA_STRING_LENGTH_THRESHOLD, 2000) + val newSession = new SparkSession(sparkContext) + assert(newSession.conf.get(SCHEMA_STRING_LENGTH_THRESHOLD) == 2000) + checkAnswer( +newSession.sql(s"SET ${SCHEMA_STRING_LENGTH_THRESHOLD.key}"), +Row(SCHEMA_STRING_LENGTH_THRESHOLD.key, "2000")) +} finally { + sparkContext.conf.set(SCHEMA_STRING_LENGTH_THRESHOLD, previousValue) +} } - test("cannot set/unset global SQL conf") { + test("cannot set/unset static SQL conf") { val e1 = intercept[AnalysisException](sql(s"SET ${SCHEMA_STRING_LENGTH_THRESHOLD.key}=10")) assert(e1.message.contains("Cannot modify the value of a static config")) val e2 = intercept[AnalysisException](spark.conf.unset(SCHEMA_STRING_LENGTH_THRESHOLD.key)) http://git-wip-us.apache.org/repos/asf/spark/blob/db8784fe/sql/hive/src/main/scala/org/
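The new flag is a static SQL conf, so it only takes effect when set before the first SparkSession is created. A minimal sketch of how a developer might use it, assuming a Hive-enabled build; the app and table names are made up, and exactly where the raw properties surface (DESCRIBE output vs. the catalog API) can vary by version:

```scala
import org.apache.spark.sql.SparkSession

// spark.sql.debug must be set before the session starts (or passed via --conf),
// because it is a static conf.
val spark = SparkSession.builder()
  .appName("raw-table-properties")
  .config("spark.sql.debug", "true")
  .enableHiveSupport()
  .getOrCreate()

spark.sql("CREATE TABLE t1 (i INT) USING parquet")
// With debug mode on, HiveExternalCatalog keeps internal properties such as
// spark.sql.sources.provider instead of filtering them out of the table metadata.
spark.sql("DESCRIBE EXTENDED t1").show(100, truncate = false)
```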
spark git commit: [SPARK-11272][WEB UI] Add support for downloading event logs from HistoryServer UI
Repository: spark Updated Branches: refs/heads/master 7222a25a1 -> 6f2fa6c54 [SPARK-11272][WEB UI] Add support for downloading event logs from HistoryServer UI ## What changes were proposed in this pull request? This is a reworked PR based on feedback in #9238 after it was closed and not reopened. As suggested in that PR I've only added the download feature. This functionality already exists in the api and this allows easier access to download event logs to share with others. I've attached a screenshot of the committed version, but I will also include alternate options with screen shots in the comments below. I'm personally not sure which option is best. ## How was this patch tested? Manual testing ![screen shot 2016-10-07 at 6 11 12 pm](https://cloud.githubusercontent.com/assets/13952758/19209213/832fe48e-8cba-11e6-9840-749b1be4d399.png) Author: Alex Bozarth Closes #15400 from ajbozarth/spark11272. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f2fa6c5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f2fa6c5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f2fa6c5 Branch: refs/heads/master Commit: 6f2fa6c54a11caccd446d5560d2014c645fcf7cc Parents: 7222a25 Author: Alex Bozarth Authored: Thu Oct 13 03:24:37 2016 -0400 Committer: Andrew Or Committed: Thu Oct 13 03:24:37 2016 -0400 -- .../org/apache/spark/ui/static/historypage-template.html | 7 ++- .../main/resources/org/apache/spark/ui/static/historypage.js | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6f2fa6c5/core/src/main/resources/org/apache/spark/ui/static/historypage-template.html -- diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage-template.html b/core/src/main/resources/org/apache/spark/ui/static/historypage-template.html index a2b3826..1fd6ef4 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/historypage-template.html +++ b/core/src/main/resources/org/apache/spark/ui/static/historypage-template.html @@ -59,7 +59,11 @@ Last Updated - + + + Event Log + + {{#applications}} @@ -73,6 +77,7 @@ {{duration}} {{sparkUser}} {{lastUpdated}} + Download {{/attempts}} {{/applications}} http://git-wip-us.apache.org/repos/asf/spark/blob/6f2fa6c5/core/src/main/resources/org/apache/spark/ui/static/historypage.js -- diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage.js b/core/src/main/resources/org/apache/spark/ui/static/historypage.js index c809400..2a32e18 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/historypage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/historypage.js @@ -133,6 +133,7 @@ $(document).ready(function() { {name: 'sixth', type: "title-numeric"}, {name: 'seventh'}, {name: 'eighth'}, +{name: 'ninth'}, ], "autoWidth": false, "order": [[ 4, "desc" ]] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
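The new Download column simply links to the event-log endpoint the REST API already exposes. A minimal sketch of fetching the same zip programmatically, assuming a reachable history server; the host, port, and application id below are placeholders:

```scala
import java.net.URL
import java.nio.file.{Files, Paths, StandardCopyOption}

val historyServer = "http://historyserver:18080"      // placeholder host:port
val appId = "app-20161007181112-0001"                 // placeholder application id

// Same endpoint the Download link points at: a zip of the application's event logs.
val in = new URL(s"$historyServer/api/v1/applications/$appId/logs").openStream()
try {
  Files.copy(in, Paths.get(s"$appId-eventLogs.zip"), StandardCopyOption.REPLACE_EXISTING)
} finally {
  in.close()
}
```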
spark git commit: [SPARK-16827] Stop reporting spill metrics as shuffle metrics
Repository: spark Updated Branches: refs/heads/master 2b01d3c70 -> e56614cba [SPARK-16827] Stop reporting spill metrics as shuffle metrics ## What changes were proposed in this pull request? Fix a bug where spill metrics were being reported as shuffle metrics. Eventually these spill metrics should be reported (SPARK-3577), but separate from shuffle metrics. The fix itself basically reverts the line to what it was in 1.6. ## How was this patch tested? Tested on a job that was reporting shuffle writes even for the final stage, when no shuffle writes should take place. After the change the job no longer shows these writes. Before: ![screen shot 2016-10-03 at 6 39 59 pm](https://cloud.githubusercontent.com/assets/1514239/19085897/dbf59a92-8a20-11e6-9f68-a978860c0d74.png) After: https://cloud.githubusercontent.com/assets/1514239/19085903/e173a860-8a20-11e6-85e3-d47f9835f494.png";> Author: Brian Cho Closes #15347 from dafrista/shuffle-metrics. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e56614cb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e56614cb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e56614cb Branch: refs/heads/master Commit: e56614cba99bfdf5fa8a6c617fdd56eca2b34694 Parents: 2b01d3c Author: Brian Cho Authored: Fri Oct 7 11:37:18 2016 -0400 Committer: Andrew Or Committed: Fri Oct 7 11:37:18 2016 -0400 -- .../spark/util/collection/unsafe/sort/UnsafeExternalSorter.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e56614cb/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java -- diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java index 428ff72..7835017 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java @@ -145,7 +145,9 @@ public final class UnsafeExternalSorter extends MemoryConsumer { // Use getSizeAsKb (not bytes) to maintain backwards compatibility for units // this.fileBufferSizeBytes = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024; this.fileBufferSizeBytes = 32 * 1024; -this.writeMetrics = taskContext.taskMetrics().shuffleWriteMetrics(); +// The spill metrics are stored in a new ShuffleWriteMetrics, and then discarded (this fixes SPARK-16827). +// TODO: Instead, separate spill metrics should be stored and reported (tracked in SPARK-3577). +this.writeMetrics = new ShuffleWriteMetrics(); if (existingInMemorySorter == null) { this.inMemSorter = new UnsafeInMemorySorter( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
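Not part of the patch, but one way to eyeball the effect is a listener that prints shuffle write bytes per task; after the fix, tasks that only spill (and write no shuffle output) should report zero here. A rough sketch against the 2.x listener API:

```scala
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}

class ShuffleWriteAudit extends SparkListener {
  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    val metrics = taskEnd.taskMetrics
    if (metrics != null && metrics.shuffleWriteMetrics.bytesWritten > 0) {
      println(s"stage ${taskEnd.stageId}, task ${taskEnd.taskInfo.taskId}: " +
        s"${metrics.shuffleWriteMetrics.bytesWritten} shuffle bytes written")
    }
  }
}

// sc.addSparkListener(new ShuffleWriteAudit())   // sc: an existing SparkContext
```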
spark git commit: [SPARK-17715][SCHEDULER] Make task launch logs DEBUG
Repository: spark Updated Branches: refs/heads/master cb87b3ced -> 027dea8f2 [SPARK-17715][SCHEDULER] Make task launch logs DEBUG ## What changes were proposed in this pull request? Ramp down the task launch logs from INFO to DEBUG. Task launches can happen orders of magnitude more than executor registration so it makes the logs easier to handle if they are different log levels. For larger jobs, there can be 100,000s of task launches which makes the driver log huge. ## How was this patch tested? No tests, as this is a trivial change. Author: Brian Cho Closes #15290 from dafrista/ramp-down-task-logging. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/027dea8f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/027dea8f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/027dea8f Branch: refs/heads/master Commit: 027dea8f294504bc5cd8bfedde546d171cb78657 Parents: cb87b3c Author: Brian Cho Authored: Thu Sep 29 15:59:17 2016 -0400 Committer: Andrew Or Committed: Thu Sep 29 15:59:17 2016 -0400 -- .../spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/027dea8f/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 2d09863..0dae0e6 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -265,7 +265,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp val executorData = executorDataMap(task.executorId) executorData.freeCores -= scheduler.CPUS_PER_TASK - logInfo(s"Launching task ${task.taskId} on executor id: ${task.executorId} hostname: " + + logDebug(s"Launching task ${task.taskId} on executor id: ${task.executorId} hostname: " + s"${executorData.executorHost}.") executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask))) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
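If the per-task launch lines are still wanted on a particular driver, they can be turned back on for just this class. A small sketch, assuming the log4j 1.x backend Spark shipped with at the time:

```scala
import org.apache.log4j.{Level, Logger}

// Programmatic form; the equivalent log4j.properties entry would be
// log4j.logger.org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend=DEBUG
Logger.getLogger("org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend")
  .setLevel(Level.DEBUG)
```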
spark git commit: [SPARK-17672] Spark 2.0 history server web UI takes too long for a single application
Repository: spark Updated Branches: refs/heads/branch-2.0 f7839e47c -> 7c9450b00 [SPARK-17672] Spark 2.0 history server web Ui takes too long for a single application Added a new API getApplicationInfo(appId: String) in class ApplicationHistoryProvider and class SparkUI to get app info. In this change, FsHistoryProvider can directly fetch one app info in O(1) time complexity compared to O(n) before the change which used an Iterator.find() interface. Both ApplicationCache and OneApplicationResource classes adopt this new api. manual tests Author: Gang Wu Closes #15247 from wgtmac/SPARK-17671. (cherry picked from commit cb87b3ced9453b5717fa8e8637b97a2f3f25fdd7) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7c9450b0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7c9450b0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7c9450b0 Branch: refs/heads/branch-2.0 Commit: 7c9450b007205958984f39a881415cdbe75e0c34 Parents: f7839e4 Author: Gang Wu Authored: Thu Sep 29 15:51:05 2016 -0400 Committer: Andrew Or Committed: Thu Sep 29 15:51:57 2016 -0400 -- .../spark/deploy/history/ApplicationHistoryProvider.scala | 5 + .../org/apache/spark/deploy/history/FsHistoryProvider.scala | 4 .../scala/org/apache/spark/deploy/history/HistoryServer.scala | 4 .../scala/org/apache/spark/status/api/v1/ApiRootResource.scala | 1 + .../org/apache/spark/status/api/v1/OneApplicationResource.scala | 2 +- core/src/main/scala/org/apache/spark/ui/SparkUI.scala | 4 6 files changed, 19 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7c9450b0/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala index 44661ed..ba42b48 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala @@ -109,4 +109,9 @@ private[history] abstract class ApplicationHistoryProvider { @throws(classOf[SparkException]) def writeEventLogs(appId: String, attemptId: Option[String], zipStream: ZipOutputStream): Unit + /** + * @return the [[ApplicationHistoryInfo]] for the appId if it exists. 
+ */ + def getApplicationInfo(appId: String): Option[ApplicationHistoryInfo] + } http://git-wip-us.apache.org/repos/asf/spark/blob/7c9450b0/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 110d882..cf4a401 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -222,6 +222,10 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) override def getListing(): Iterable[FsApplicationHistoryInfo] = applications.values + override def getApplicationInfo(appId: String): Option[FsApplicationHistoryInfo] = { +applications.get(appId) + } + override def getAppUI(appId: String, attemptId: Option[String]): Option[LoadedAppUI] = { try { applications.get(appId).flatMap { appInfo => http://git-wip-us.apache.org/repos/asf/spark/blob/7c9450b0/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index c178917..735aa43 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -182,6 +182,10 @@ class HistoryServer( getApplicationList().iterator.map(ApplicationsListResource.appHistoryInfoToPublicAppInfo) } + def getApplicationInfo(appId: String): Option[ApplicationInfo] = { + provider.getApplicationInfo(appId).map(ApplicationsListResource.appHistoryInfoToPublicAppInfo) + } + override def writeEventLogs( appId: String, attemptId: Option[String], http://git-wip-us.apache.org/repos/asf/spark/blob/7c9450b0/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala --
spark git commit: [SPARK-17672] Spark 2.0 history server web UI takes too long for a single application
Repository: spark Updated Branches: refs/heads/master 7f779e743 -> cb87b3ced [SPARK-17672] Spark 2.0 history server web Ui takes too long for a single application Added a new API getApplicationInfo(appId: String) in class ApplicationHistoryProvider and class SparkUI to get app info. In this change, FsHistoryProvider can directly fetch one app info in O(1) time complexity compared to O(n) before the change which used an Iterator.find() interface. Both ApplicationCache and OneApplicationResource classes adopt this new api. manual tests Author: Gang Wu Closes #15247 from wgtmac/SPARK-17671. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cb87b3ce Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cb87b3ce Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cb87b3ce Branch: refs/heads/master Commit: cb87b3ced9453b5717fa8e8637b97a2f3f25fdd7 Parents: 7f779e7 Author: Gang Wu Authored: Thu Sep 29 15:51:05 2016 -0400 Committer: Andrew Or Committed: Thu Sep 29 15:51:38 2016 -0400 -- .../spark/deploy/history/ApplicationHistoryProvider.scala | 5 + .../org/apache/spark/deploy/history/FsHistoryProvider.scala | 4 .../scala/org/apache/spark/deploy/history/HistoryServer.scala | 4 .../scala/org/apache/spark/status/api/v1/ApiRootResource.scala | 1 + .../org/apache/spark/status/api/v1/OneApplicationResource.scala | 2 +- core/src/main/scala/org/apache/spark/ui/SparkUI.scala | 4 6 files changed, 19 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cb87b3ce/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala index 44661ed..ba42b48 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala @@ -109,4 +109,9 @@ private[history] abstract class ApplicationHistoryProvider { @throws(classOf[SparkException]) def writeEventLogs(appId: String, attemptId: Option[String], zipStream: ZipOutputStream): Unit + /** + * @return the [[ApplicationHistoryInfo]] for the appId if it exists. 
+ */ + def getApplicationInfo(appId: String): Option[ApplicationHistoryInfo] + } http://git-wip-us.apache.org/repos/asf/spark/blob/cb87b3ce/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 6874aa5..d494ff0 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -224,6 +224,10 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) override def getListing(): Iterable[FsApplicationHistoryInfo] = applications.values + override def getApplicationInfo(appId: String): Option[FsApplicationHistoryInfo] = { +applications.get(appId) + } + override def getAppUI(appId: String, attemptId: Option[String]): Option[LoadedAppUI] = { try { applications.get(appId).flatMap { appInfo => http://git-wip-us.apache.org/repos/asf/spark/blob/cb87b3ce/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index c178917..735aa43 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -182,6 +182,10 @@ class HistoryServer( getApplicationList().iterator.map(ApplicationsListResource.appHistoryInfoToPublicAppInfo) } + def getApplicationInfo(appId: String): Option[ApplicationInfo] = { + provider.getApplicationInfo(appId).map(ApplicationsListResource.appHistoryInfoToPublicAppInfo) + } + override def writeEventLogs( appId: String, attemptId: Option[String], http://git-wip-us.apache.org/repos/asf/spark/blob/cb87b3ce/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala -- diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootRes
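The gain is simply keyed access into the provider's existing `applications` map instead of scanning the whole listing. An illustrative, self-contained sketch; the type below is a stand-in, not the real FsApplicationHistoryInfo:

```scala
// Stand-in for the history info held by the provider.
case class AppInfo(id: String, name: String)

val applications: Map[String, AppInfo] = Map(
  "app-1" -> AppInfo("app-1", "etl"),
  "app-2" -> AppInfo("app-2", "reporting"))

// Before: scan the whole listing for a single id, O(n).
def lookupByScan(appId: String): Option[AppInfo] =
  applications.values.iterator.find(_.id == appId)

// After: keyed access, as getApplicationInfo now does, O(1).
def lookupByKey(appId: String): Option[AppInfo] =
  applications.get(appId)
```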
spark git commit: [SPARK-17648][CORE] TaskScheduler really needs offers to be an IndexedSeq
Repository: spark Updated Branches: refs/heads/master 958200497 -> 7f779e743 [SPARK-17648][CORE] TaskScheduler really needs offers to be an IndexedSeq ## What changes were proposed in this pull request? The Seq[WorkerOffer] is accessed by index, so it really should be an IndexedSeq, otherwise an O(n) operation becomes O(n^2). In practice this hasn't been an issue b/c where these offers are generated, the call to `.toSeq` just happens to create an IndexedSeq anyway.I got bitten by this in performance tests I was doing, and its better for the types to be more precise so eg. a change in Scala doesn't destroy performance. ## How was this patch tested? Unit tests via jenkins. Author: Imran Rashid Closes #15221 from squito/SPARK-17648. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7f779e74 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7f779e74 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7f779e74 Branch: refs/heads/master Commit: 7f779e7439127efa0e3611f7745e1c8423845198 Parents: 9582004 Author: Imran Rashid Authored: Thu Sep 29 15:36:40 2016 -0400 Committer: Andrew Or Committed: Thu Sep 29 15:36:40 2016 -0400 -- .../spark/scheduler/TaskSchedulerImpl.scala | 4 +-- .../cluster/CoarseGrainedSchedulerBackend.scala | 4 +-- .../scheduler/local/LocalSchedulerBackend.scala | 2 +- .../scheduler/SchedulerIntegrationSuite.scala | 7 ++--- .../scheduler/TaskSchedulerImplSuite.scala | 32 ++-- .../MesosFineGrainedSchedulerBackend.scala | 2 +- .../MesosFineGrainedSchedulerBackendSuite.scala | 2 +- 7 files changed, 26 insertions(+), 27 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7f779e74/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 52a7186..0ad4730 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -252,7 +252,7 @@ private[spark] class TaskSchedulerImpl( maxLocality: TaskLocality, shuffledOffers: Seq[WorkerOffer], availableCpus: Array[Int], - tasks: Seq[ArrayBuffer[TaskDescription]]) : Boolean = { + tasks: IndexedSeq[ArrayBuffer[TaskDescription]]) : Boolean = { var launchedTask = false for (i <- 0 until shuffledOffers.size) { val execId = shuffledOffers(i).executorId @@ -286,7 +286,7 @@ private[spark] class TaskSchedulerImpl( * sets for tasks in order of priority. We fill each node with tasks in a round-robin manner so * that tasks are balanced across the cluster. 
*/ - def resourceOffers(offers: Seq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized { + def resourceOffers(offers: IndexedSeq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized { // Mark each slave as alive and remember its hostname // Also track if new executor is added var newExecAvail = false http://git-wip-us.apache.org/repos/asf/spark/blob/7f779e74/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index edc3c19..2d09863 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -216,7 +216,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp val activeExecutors = executorDataMap.filterKeys(executorIsAlive) val workOffers = activeExecutors.map { case (id, executorData) => new WorkerOffer(id, executorData.executorHost, executorData.freeCores) - }.toSeq + }.toIndexedSeq launchTasks(scheduler.resourceOffers(workOffers)) } @@ -233,7 +233,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // Filter out executors under killing if (executorIsAlive(executorId)) { val executorData = executorDataMap(executorId) -val workOffers = Seq( +val workOffers = IndexedSeq( new WorkerOffer(executorId, executorData.executorHost, executorData.freeCores)) launchTasks(scheduler.resourceOffers(workOffers)) } http://git-wip-us.apache.org/repos/asf/spark/blob/7f779e74/core/src/main/scala/o
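For context, a self-contained sketch of the cost difference; WorkerOffer below is a stand-in case class, not Spark's. Positional access in a loop is linear per lookup on a `List`, so the whole loop goes quadratic, while an `IndexedSeq` keeps it linear overall.

```scala
case class WorkerOffer(executorId: String, host: String, cores: Int)

val offersAsList: Seq[WorkerOffer] =
  List.tabulate(10000)(i => WorkerOffer(s"exec-$i", s"host-$i", 4))
val offersIndexed: IndexedSeq[WorkerOffer] = offersAsList.toIndexedSeq

// Mirrors the shape of the scheduler's offer loop: index-based access over the Seq.
def totalCores(offers: Seq[WorkerOffer]): Int =
  (0 until offers.size).map(i => offers(i).cores).sum

totalCores(offersAsList)    // each offers(i) walks the list from the head
totalCores(offersIndexed)   // same result, constant-time access per element
```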
spark git commit: [Docs] Update spark-standalone.md to fix link
Repository: spark Updated Branches: refs/heads/branch-2.0 8a58f2e8e -> f4594900d [Docs] Update spark-standalone.md to fix link Corrected a link to the configuration.html page, it was pointing to a page that does not exist (configurations.html). Documentation change, verified in preview. Author: Andrew Mills Closes #15244 from ammills01/master. (cherry picked from commit 00be16df642317137f17d2d7d2887c41edac3680) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f4594900 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f4594900 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f4594900 Branch: refs/heads/branch-2.0 Commit: f4594900d86bb39358ff19047dfa8c1e4b78aa6b Parents: 8a58f2e Author: Andrew Mills Authored: Mon Sep 26 16:41:10 2016 -0400 Committer: Andrew Or Committed: Mon Sep 26 16:41:33 2016 -0400 -- docs/spark-standalone.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f4594900/docs/spark-standalone.md -- diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index 5ae63fe..6f0f665 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -348,7 +348,7 @@ Learn more about getting started with ZooKeeper [here](http://zookeeper.apache.o **Configuration** In order to enable this recovery mode, you can set SPARK_DAEMON_JAVA_OPTS in spark-env by configuring `spark.deploy.recoveryMode` and related spark.deploy.zookeeper.* configurations. -For more information about these configurations please refer to the configurations (doc)[configurations.html#deploy] +For more information about these configurations please refer to the [configuration doc](configuration.html#deploy) Possible gotcha: If you have multiple Masters in your cluster but fail to correctly configure the Masters to use ZooKeeper, the Masters will fail to discover each other and think they're all leaders. This will not lead to a healthy cluster state (as all Masters will schedule independently). - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [Docs] Update spark-standalone.md to fix link
Repository: spark Updated Branches: refs/heads/master 7c7586aef -> 00be16df6 [Docs] Update spark-standalone.md to fix link Corrected a link to the configuration.html page, it was pointing to a page that does not exist (configurations.html). Documentation change, verified in preview. Author: Andrew Mills Closes #15244 from ammills01/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/00be16df Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/00be16df Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/00be16df Branch: refs/heads/master Commit: 00be16df642317137f17d2d7d2887c41edac3680 Parents: 7c7586a Author: Andrew Mills Authored: Mon Sep 26 16:41:10 2016 -0400 Committer: Andrew Or Committed: Mon Sep 26 16:41:14 2016 -0400 -- docs/spark-standalone.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/00be16df/docs/spark-standalone.md -- diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index 1097f1f..7b82b95 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -348,7 +348,7 @@ Learn more about getting started with ZooKeeper [here](http://zookeeper.apache.o **Configuration** In order to enable this recovery mode, you can set SPARK_DAEMON_JAVA_OPTS in spark-env by configuring `spark.deploy.recoveryMode` and related spark.deploy.zookeeper.* configurations. -For more information about these configurations please refer to the configurations (doc)[configurations.html#deploy] +For more information about these configurations please refer to the [configuration doc](configuration.html#deploy) Possible gotcha: If you have multiple Masters in your cluster but fail to correctly configure the Masters to use ZooKeeper, the Masters will fail to discover each other and think they're all leaders. This will not lead to a healthy cluster state (as all Masters will schedule independently). - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17512][CORE] Avoid formatting to python path for yarn and mesos cluster mode
Repository: spark Updated Branches: refs/heads/branch-2.0 cd0bd89d7 -> 59e6ab11a [SPARK-17512][CORE] Avoid formatting to python path for yarn and mesos cluster mode ## What changes were proposed in this pull request? Yarn and mesos cluster mode support remote python path (HDFS/S3 scheme) by their own mechanism, it is not necessary to check and format the python when running on these modes. This is a potential regression compared to 1.6, so here propose to fix it. ## How was this patch tested? Unit test to verify SparkSubmit arguments, also with local cluster verification. Because of lack of `MiniDFSCluster` support in Spark unit test, there's no integration test added. Author: jerryshao Closes #15137 from jerryshao/SPARK-17512. (cherry picked from commit 8c3ee2bc42e6320b9341cebdba51a00162c897ea) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/59e6ab11 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/59e6ab11 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/59e6ab11 Branch: refs/heads/branch-2.0 Commit: 59e6ab11a9e27d30ae3477fdc03337ff5f8ab4ec Parents: cd0bd89 Author: jerryshao Authored: Wed Sep 21 17:57:21 2016 -0400 Committer: Andrew Or Committed: Wed Sep 21 17:57:33 2016 -0400 -- .../org/apache/spark/deploy/SparkSubmit.scala| 13 ++--- .../apache/spark/deploy/SparkSubmitSuite.scala | 19 +++ 2 files changed, 29 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/59e6ab11/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 7b6d5a3..8061165 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -311,7 +311,7 @@ object SparkSubmit { // In Mesos cluster mode, non-local python files are automatically downloaded by Mesos. if (args.isPython && !isYarnCluster && !isMesosCluster) { if (Utils.nonLocalPaths(args.primaryResource).nonEmpty) { -printErrorAndExit(s"Only local python files are supported: $args.primaryResource") +printErrorAndExit(s"Only local python files are supported: ${args.primaryResource}") } val nonLocalPyFiles = Utils.nonLocalPaths(args.pyFiles).mkString(",") if (nonLocalPyFiles.nonEmpty) { @@ -322,7 +322,7 @@ object SparkSubmit { // Require all R files to be local if (args.isR && !isYarnCluster) { if (Utils.nonLocalPaths(args.primaryResource).nonEmpty) { -printErrorAndExit(s"Only local R files are supported: $args.primaryResource") +printErrorAndExit(s"Only local R files are supported: ${args.primaryResource}") } } @@ -633,7 +633,14 @@ object SparkSubmit { // explicitly sets `spark.submit.pyFiles` in his/her default properties file. sysProps.get("spark.submit.pyFiles").foreach { pyFiles => val resolvedPyFiles = Utils.resolveURIs(pyFiles) - val formattedPyFiles = PythonRunner.formatPaths(resolvedPyFiles).mkString(",") + val formattedPyFiles = if (!isYarnCluster && !isMesosCluster) { +PythonRunner.formatPaths(resolvedPyFiles).mkString(",") + } else { +// Ignoring formatting python path in yarn and mesos cluster mode, these two modes +// support dealing with remote python files, they could distribute and add python files +// locally. 
+resolvedPyFiles + } sysProps("spark.submit.pyFiles") = formattedPyFiles } http://git-wip-us.apache.org/repos/asf/spark/blob/59e6ab11/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index b2bc886..54693c1 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -577,6 +577,25 @@ class SparkSubmitSuite val sysProps3 = SparkSubmit.prepareSubmitEnvironment(appArgs3)._3 sysProps3("spark.submit.pyFiles") should be( PythonRunner.formatPaths(Utils.resolveURIs(pyFiles)).mkString(",")) + +// Test remote python files +val f4 = File.createTempFile("test-submit-remote-python-files", "", tmpDir) +val writer4 = new PrintWriter(f4) +val remotePyFiles = "hdfs:///tmp/file1.py,hdfs:///tmp/file2.py" +writer4.println("spark.submit.pyFiles " + remotePyFiles) +writer4.close() +
spark git commit: [SPARK-17512][CORE] Avoid formatting to python path for yarn and mesos cluster mode
Repository: spark Updated Branches: refs/heads/master 9fcf1c51d -> 8c3ee2bc4 [SPARK-17512][CORE] Avoid formatting to python path for yarn and mesos cluster mode ## What changes were proposed in this pull request? Yarn and mesos cluster mode support remote python path (HDFS/S3 scheme) by their own mechanism, it is not necessary to check and format the python when running on these modes. This is a potential regression compared to 1.6, so here propose to fix it. ## How was this patch tested? Unit test to verify SparkSubmit arguments, also with local cluster verification. Because of lack of `MiniDFSCluster` support in Spark unit test, there's no integration test added. Author: jerryshao Closes #15137 from jerryshao/SPARK-17512. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8c3ee2bc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8c3ee2bc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8c3ee2bc Branch: refs/heads/master Commit: 8c3ee2bc42e6320b9341cebdba51a00162c897ea Parents: 9fcf1c5 Author: jerryshao Authored: Wed Sep 21 17:57:21 2016 -0400 Committer: Andrew Or Committed: Wed Sep 21 17:57:21 2016 -0400 -- .../org/apache/spark/deploy/SparkSubmit.scala| 13 ++--- .../apache/spark/deploy/SparkSubmitSuite.scala | 19 +++ 2 files changed, 29 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8c3ee2bc/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 7b6d5a3..8061165 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -311,7 +311,7 @@ object SparkSubmit { // In Mesos cluster mode, non-local python files are automatically downloaded by Mesos. if (args.isPython && !isYarnCluster && !isMesosCluster) { if (Utils.nonLocalPaths(args.primaryResource).nonEmpty) { -printErrorAndExit(s"Only local python files are supported: $args.primaryResource") +printErrorAndExit(s"Only local python files are supported: ${args.primaryResource}") } val nonLocalPyFiles = Utils.nonLocalPaths(args.pyFiles).mkString(",") if (nonLocalPyFiles.nonEmpty) { @@ -322,7 +322,7 @@ object SparkSubmit { // Require all R files to be local if (args.isR && !isYarnCluster) { if (Utils.nonLocalPaths(args.primaryResource).nonEmpty) { -printErrorAndExit(s"Only local R files are supported: $args.primaryResource") +printErrorAndExit(s"Only local R files are supported: ${args.primaryResource}") } } @@ -633,7 +633,14 @@ object SparkSubmit { // explicitly sets `spark.submit.pyFiles` in his/her default properties file. sysProps.get("spark.submit.pyFiles").foreach { pyFiles => val resolvedPyFiles = Utils.resolveURIs(pyFiles) - val formattedPyFiles = PythonRunner.formatPaths(resolvedPyFiles).mkString(",") + val formattedPyFiles = if (!isYarnCluster && !isMesosCluster) { +PythonRunner.formatPaths(resolvedPyFiles).mkString(",") + } else { +// Ignoring formatting python path in yarn and mesos cluster mode, these two modes +// support dealing with remote python files, they could distribute and add python files +// locally. 
+resolvedPyFiles + } sysProps("spark.submit.pyFiles") = formattedPyFiles } http://git-wip-us.apache.org/repos/asf/spark/blob/8c3ee2bc/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 961ece3..31c8fb2 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -582,6 +582,25 @@ class SparkSubmitSuite val sysProps3 = SparkSubmit.prepareSubmitEnvironment(appArgs3)._3 sysProps3("spark.submit.pyFiles") should be( PythonRunner.formatPaths(Utils.resolveURIs(pyFiles)).mkString(",")) + +// Test remote python files +val f4 = File.createTempFile("test-submit-remote-python-files", "", tmpDir) +val writer4 = new PrintWriter(f4) +val remotePyFiles = "hdfs:///tmp/file1.py,hdfs:///tmp/file2.py" +writer4.println("spark.submit.pyFiles " + remotePyFiles) +writer4.close() +val clArgs4 = Seq( + "--master", "yarn", + "--deploy-mode", "cluster", + "--properties-
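A rough, illustrative sketch of the branch this patch adds; the helper below is hypothetical and only mirrors the shape of the real logic, which lives in SparkSubmit and uses the private Utils.resolveURIs / PythonRunner.formatPaths helpers:

```scala
// Hypothetical stand-in: remote-capable cluster modes keep remote python paths as-is,
// everything else is reduced to plain local paths.
def preparePyFiles(pyFiles: Seq[String], remotePathsSupported: Boolean): Seq[String] =
  if (remotePathsSupported) {
    pyFiles                                          // e.g. hdfs:///tmp/file1.py passes through
  } else {
    pyFiles.map(p => new java.net.URI(p).getPath)    // local-only modes want plain local paths
  }

preparePyFiles(Seq("hdfs:///tmp/file1.py", "hdfs:///tmp/file2.py"), remotePathsSupported = true)
```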
spark git commit: [SPARK-17623][CORE] Clarify type of TaskEndReason with a failed task.
Repository: spark Updated Branches: refs/heads/master 2cd1bfa4f -> 9fcf1c51d [SPARK-17623][CORE] Clarify type of TaskEndReason with a failed task. ## What changes were proposed in this pull request? In TaskResultGetter, enqueueFailedTask currently deserializes the result as a TaskEndReason. But the type is actually more specific, its a TaskFailedReason. This just leads to more blind casting later on – it would be more clear if the msg was cast to the right type immediately, so method parameter types could be tightened. ## How was this patch tested? Existing unit tests via jenkins. Note that the code was already performing a blind-cast to a TaskFailedReason before in any case, just in a different spot, so there shouldn't be any behavior change. Author: Imran Rashid Closes #15181 from squito/SPARK-17623. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9fcf1c51 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9fcf1c51 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9fcf1c51 Branch: refs/heads/master Commit: 9fcf1c51d518847eda7f5ea71337cfa7def3c45c Parents: 2cd1bfa Author: Imran Rashid Authored: Wed Sep 21 17:49:36 2016 -0400 Committer: Andrew Or Committed: Wed Sep 21 17:49:36 2016 -0400 -- .../apache/spark/executor/CommitDeniedException.scala | 4 ++-- .../main/scala/org/apache/spark/executor/Executor.scala | 4 ++-- .../org/apache/spark/scheduler/TaskResultGetter.scala | 4 ++-- .../org/apache/spark/scheduler/TaskSchedulerImpl.scala | 2 +- .../org/apache/spark/scheduler/TaskSetManager.scala | 12 +++- .../org/apache/spark/shuffle/FetchFailedException.scala | 4 ++-- .../scala/org/apache/spark/util/JsonProtocolSuite.scala | 2 +- 7 files changed, 13 insertions(+), 19 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9fcf1c51/core/src/main/scala/org/apache/spark/executor/CommitDeniedException.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/CommitDeniedException.scala b/core/src/main/scala/org/apache/spark/executor/CommitDeniedException.scala index 7d84889..326e042 100644 --- a/core/src/main/scala/org/apache/spark/executor/CommitDeniedException.scala +++ b/core/src/main/scala/org/apache/spark/executor/CommitDeniedException.scala @@ -17,7 +17,7 @@ package org.apache.spark.executor -import org.apache.spark.{TaskCommitDenied, TaskEndReason} +import org.apache.spark.{TaskCommitDenied, TaskFailedReason} /** * Exception thrown when a task attempts to commit output to HDFS but is denied by the driver. 
@@ -29,5 +29,5 @@ private[spark] class CommitDeniedException( attemptNumber: Int) extends Exception(msg) { - def toTaskEndReason: TaskEndReason = TaskCommitDenied(jobID, splitID, attemptNumber) + def toTaskFailedReason: TaskFailedReason = TaskCommitDenied(jobID, splitID, attemptNumber) } http://git-wip-us.apache.org/repos/asf/spark/blob/9fcf1c51/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index fbf2b86..668ec41 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -355,7 +355,7 @@ private[spark] class Executor( } catch { case ffe: FetchFailedException => - val reason = ffe.toTaskEndReason + val reason = ffe.toTaskFailedReason setTaskFinishedAndClearInterruptStatus() execBackend.statusUpdate(taskId, TaskState.FAILED, ser.serialize(reason)) @@ -370,7 +370,7 @@ private[spark] class Executor( execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(TaskKilled)) case CausedBy(cDE: CommitDeniedException) => - val reason = cDE.toTaskEndReason + val reason = cDE.toTaskFailedReason setTaskFinishedAndClearInterruptStatus() execBackend.statusUpdate(taskId, TaskState.FAILED, ser.serialize(reason)) http://git-wip-us.apache.org/repos/asf/spark/blob/9fcf1c51/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala index 685ef55..1c3fcbd 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala @@ -118,14 +118,14 @@ private[spark] class TaskResultGetter(sparkEnv
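The pattern itself is general: narrow a deserialized value to the most specific type at the boundary so later signatures stay tight and blind casts disappear. A self-contained sketch with stand-in types, not Spark's:

```scala
sealed trait EndReason
trait FailedReason extends EndReason { def message: String }
case class FetchFailed(message: String) extends FailedReason
case object Succeeded extends EndReason

// With the tight parameter type, callers cannot hand this a non-failure reason and
// no asInstanceOf is needed downstream.
def recordFailure(reason: FailedReason): Unit =
  println(s"task failed: ${reason.message}")

recordFailure(FetchFailed("missing shuffle block"))
```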
spark git commit: [SPARK-17438][WEBUI] Show Application.executorLimit in the application page
Repository: spark Updated Branches: refs/heads/branch-2.0 f56035ba6 -> d6191a067 [SPARK-17438][WEBUI] Show Application.executorLimit in the application page ## What changes were proposed in this pull request? This PR adds `Application.executorLimit` to the applicatino page ## How was this patch tested? Checked the UI manually. Screenshots: 1. Dynamic allocation is disabled https://cloud.githubusercontent.com/assets/1000778/18332029/210056ea-7518-11e6-9f52-76d96046c1c0.png";> 2. Dynamic allocation is enabled. https://cloud.githubusercontent.com/assets/1000778/18332034/2c07700a-7518-11e6-8fce-aebe25014902.png";> Author: Shixiong Zhu Closes #15001 from zsxwing/fix-core-info. (cherry picked from commit 80d6655921bea9b1bb27c1d95c2b46654e7a8cca) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d6191a06 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d6191a06 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d6191a06 Branch: refs/heads/branch-2.0 Commit: d6191a0671effe32f5c07397679c17a62e1cdaff Parents: f56035b Author: Shixiong Zhu Authored: Mon Sep 19 14:00:42 2016 -0400 Committer: Andrew Or Committed: Mon Sep 19 14:01:02 2016 -0400 -- .../apache/spark/deploy/master/ui/ApplicationPage.scala | 12 +++- core/src/main/scala/org/apache/spark/ui/ToolTips.scala | 6 ++ 2 files changed, 17 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d6191a06/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala index 8875fc2..18c5d0b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala @@ -24,7 +24,7 @@ import scala.xml.Node import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, RequestMasterState} import org.apache.spark.deploy.ExecutorState import org.apache.spark.deploy.master.ExecutorDesc -import org.apache.spark.ui.{UIUtils, WebUIPage} +import org.apache.spark.ui.{ToolTips, UIUtils, WebUIPage} import org.apache.spark.util.Utils private[ui] class ApplicationPage(parent: MasterWebUI) extends WebUIPage("app") { @@ -70,6 +70,16 @@ private[ui] class ApplicationPage(parent: MasterWebUI) extends WebUIPage("app") } + +Executor Limit: +{ + if (app.executorLimit == Int.MaxValue) "Unlimited" else app.executorLimit +} +({app.executors.size} granted) + + + Executor Memory: {Utils.megabytesToString(app.desc.memoryPerExecutorMB)} http://git-wip-us.apache.org/repos/asf/spark/blob/d6191a06/core/src/main/scala/org/apache/spark/ui/ToolTips.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/ToolTips.scala b/core/src/main/scala/org/apache/spark/ui/ToolTips.scala index 2d2d80b..3cc5353 100644 --- a/core/src/main/scala/org/apache/spark/ui/ToolTips.scala +++ b/core/src/main/scala/org/apache/spark/ui/ToolTips.scala @@ -90,4 +90,10 @@ private[spark] object ToolTips { val TASK_TIME = "Shaded red when garbage collection (GC) time is over 10% of task time" + + val APPLICATION_EXECUTOR_LIMIT = +"""Maximum number of executors that this application will use. This limit is finite only when + dynamic allocation is enabled. The number of granted executors may exceed the limit + ephemerally when executors are being killed. 
+""" } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17438][WEBUI] Show Application.executorLimit in the application page
Repository: spark Updated Branches: refs/heads/master cdea1d134 -> 80d665592 [SPARK-17438][WEBUI] Show Application.executorLimit in the application page ## What changes were proposed in this pull request? This PR adds `Application.executorLimit` to the applicatino page ## How was this patch tested? Checked the UI manually. Screenshots: 1. Dynamic allocation is disabled https://cloud.githubusercontent.com/assets/1000778/18332029/210056ea-7518-11e6-9f52-76d96046c1c0.png";> 2. Dynamic allocation is enabled. https://cloud.githubusercontent.com/assets/1000778/18332034/2c07700a-7518-11e6-8fce-aebe25014902.png";> Author: Shixiong Zhu Closes #15001 from zsxwing/fix-core-info. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/80d66559 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/80d66559 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/80d66559 Branch: refs/heads/master Commit: 80d6655921bea9b1bb27c1d95c2b46654e7a8cca Parents: cdea1d1 Author: Shixiong Zhu Authored: Mon Sep 19 14:00:42 2016 -0400 Committer: Andrew Or Committed: Mon Sep 19 14:00:42 2016 -0400 -- .../apache/spark/deploy/master/ui/ApplicationPage.scala | 12 +++- core/src/main/scala/org/apache/spark/ui/ToolTips.scala | 6 ++ 2 files changed, 17 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/80d66559/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala index 17c521c..18cff31 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala @@ -24,7 +24,7 @@ import scala.xml.Node import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, RequestMasterState} import org.apache.spark.deploy.ExecutorState import org.apache.spark.deploy.master.ExecutorDesc -import org.apache.spark.ui.{UIUtils, WebUIPage} +import org.apache.spark.ui.{ToolTips, UIUtils, WebUIPage} import org.apache.spark.util.Utils private[ui] class ApplicationPage(parent: MasterWebUI) extends WebUIPage("app") { @@ -70,6 +70,16 @@ private[ui] class ApplicationPage(parent: MasterWebUI) extends WebUIPage("app") } + +Executor Limit: +{ + if (app.executorLimit == Int.MaxValue) "Unlimited" else app.executorLimit +} +({app.executors.size} granted) + + + Executor Memory: {Utils.megabytesToString(app.desc.memoryPerExecutorMB)} http://git-wip-us.apache.org/repos/asf/spark/blob/80d66559/core/src/main/scala/org/apache/spark/ui/ToolTips.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/ToolTips.scala b/core/src/main/scala/org/apache/spark/ui/ToolTips.scala index 2d2d80b..3cc5353 100644 --- a/core/src/main/scala/org/apache/spark/ui/ToolTips.scala +++ b/core/src/main/scala/org/apache/spark/ui/ToolTips.scala @@ -90,4 +90,10 @@ private[spark] object ToolTips { val TASK_TIME = "Shaded red when garbage collection (GC) time is over 10% of task time" + + val APPLICATION_EXECUTOR_LIMIT = +"""Maximum number of executors that this application will use. This limit is finite only when + dynamic allocation is enabled. The number of granted executors may exceed the limit + ephemerally when executors are being killed. +""" } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
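For readers wondering when the new field is finite: the limit tracks how many executors the application is currently asking for, which only stays bounded under dynamic allocation. A minimal conf sketch; the app name and values are placeholders and a standalone cluster is assumed:

```scala
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setAppName("executor-limit-demo")                 // placeholder
  .set("spark.dynamicAllocation.enabled", "true")
  .set("spark.shuffle.service.enabled", "true")      // required by dynamic allocation
  .set("spark.dynamicAllocation.maxExecutors", "8")
// With this conf the application page shows a finite Executor Limit;
// with dynamic allocation disabled it shows "Unlimited".
```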
spark git commit: [SPARK-15966][DOC] Add closing tag to fix rendering issue for Spark monitoring
Repository: spark Updated Branches: refs/heads/master 9040d83bc -> f1bf0d2f3 [SPARK-15966][DOC] Add closing tag to fix rendering issue for Spark monitoring ## What changes were proposed in this pull request? Adds the missing closing tag for spark.ui.view.acls.groups ## How was this patch tested? I built the docs locally and verified the changed in browser. (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) **Before:** ![image](https://cloud.githubusercontent.com/assets/7732317/16135005/49fc0724-33e6-11e6-9390-98711593fa5b.png) **After:** ![image](https://cloud.githubusercontent.com/assets/7732317/16135021/62b5c4a8-33e6-11e6-8118-b22fda5c66eb.png) Author: Dhruve Ashar Closes #13719 from dhruve/doc/SPARK-15966. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f1bf0d2f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f1bf0d2f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f1bf0d2f Branch: refs/heads/master Commit: f1bf0d2f3a61d81686f36763e83d3be89c98435f Parents: 9040d83 Author: Dhruve Ashar Authored: Thu Jun 16 16:44:54 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 17:46:19 2016 -0700 -- docs/monitoring.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f1bf0d2f/docs/monitoring.md -- diff --git a/docs/monitoring.md b/docs/monitoring.md index 78a3470..fa6c899 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -157,7 +157,7 @@ The history server can be configured as follows: If enabled, access control checks are made regardless of what the individual application had set for spark.ui.acls.enable when the application was run. The application owner will always have authorization to view their own application and any users specified via - spark.ui.view.acls and groups specified via spark.ui.view.acls.groups + spark.ui.view.acls and groups specified via spark.ui.view.acls.groups when the application was run will also have authorization to view that application. If disabled, no access control checks are made. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15749][SQL] make the error message more meaningful
Repository: spark Updated Branches: refs/heads/master e849285df -> 62d2fa5e9 [SPARK-15749][SQL] make the error message more meaningful ## What changes were proposed in this pull request? For table test1 (C1 varchar (10), C2 varchar (10)), when I insert a row using ``` sqlContext.sql("insert into test1 values ('abc', 'def', 1)") ``` I got error message ``` Exception in thread "main" java.lang.RuntimeException: RelationC1#0,C2#1 JDBCRelation(test1) requires that the query in the SELECT clause of the INSERT INTO/OVERWRITE statement generates the same number of columns as its schema. ``` The error message is a little confusing. In my simple insert statement, it doesn't have a SELECT clause. I will change the error message to a more general one ``` Exception in thread "main" java.lang.RuntimeException: RelationC1#0,C2#1 JDBCRelation(test1) requires that the data to be inserted have the same number of columns as the target table. ``` ## How was this patch tested? I tested the patch using my simple unit test, but it's a very trivial change and I don't think I need to check in any test. Author: Huaxin Gao Closes #13492 from huaxingao/spark-15749. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/62d2fa5e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/62d2fa5e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/62d2fa5e Branch: refs/heads/master Commit: 62d2fa5e996d428caaea005041b17ec115473762 Parents: e849285 Author: Huaxin Gao Authored: Thu Jun 16 14:37:10 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:37:10 2016 -0700 -- .../org/apache/spark/sql/execution/datasources/rules.scala | 5 +++-- .../test/scala/org/apache/spark/sql/sources/InsertSuite.scala | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/62d2fa5e/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 7ac62fb..543389e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -78,8 +78,9 @@ private[sql] object PreInsertCastAndRename extends Rule[LogicalPlan] { // schema of the relation. 
if (l.output.size != child.output.size) { sys.error( -s"$l requires that the query in the SELECT clause of the INSERT INTO/OVERWRITE " + - s"statement generates the same number of columns as its schema.") +s"$l requires that the data to be inserted have the same number of columns as the " + + s"target table: target table has ${l.output.size} column(s) but " + + s"the inserted data has ${child.output.size} column(s).") } castAndRenameChildOutput(i, l.output, child) } http://git-wip-us.apache.org/repos/asf/spark/blob/62d2fa5e/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index bade41b..d717955 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -95,7 +95,7 @@ class InsertSuite extends DataSourceTest with SharedSQLContext { """.stripMargin) }.getMessage assert( - message.contains("generates the same number of columns as its schema"), + message.contains("requires that the data to be inserted have the same number of columns"), "SELECT clause generating a different number of columns should not be not allowed." ) } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
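A minimal way to hit the new message, assuming an existing SparkSession named `spark`; the table is hypothetical, and the original report used a JDBC relation, so the relation named in the error will differ:

```scala
spark.sql("CREATE TABLE test1 (c1 STRING, c2 STRING) USING parquet")

// Three values into a two-column table. With this patch the failure reads along the lines of:
// "... requires that the data to be inserted have the same number of columns as the target
//  table: target table has 2 column(s) but the inserted data has 3 column(s)."
spark.sql("INSERT INTO test1 VALUES ('abc', 'def', 1)")
```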
spark git commit: [SPARK-15749][SQL] make the error message more meaningful
Repository: spark Updated Branches: refs/heads/branch-2.0 27e274c3e -> 2280ad8a3 [SPARK-15749][SQL] make the error message more meaningful ## What changes were proposed in this pull request? For table test1 (C1 varchar (10), C2 varchar (10)), when I insert a row using ``` sqlContext.sql("insert into test1 values ('abc', 'def', 1)") ``` I got error message ``` Exception in thread "main" java.lang.RuntimeException: RelationC1#0,C2#1 JDBCRelation(test1) requires that the query in the SELECT clause of the INSERT INTO/OVERWRITE statement generates the same number of columns as its schema. ``` The error message is a little confusing. In my simple insert statement, it doesn't have a SELECT clause. I will change the error message to a more general one ``` Exception in thread "main" java.lang.RuntimeException: RelationC1#0,C2#1 JDBCRelation(test1) requires that the data to be inserted have the same number of columns as the target table. ``` ## How was this patch tested? I tested the patch using my simple unit test, but it's a very trivial change and I don't think I need to check in any test. Author: Huaxin Gao Closes #13492 from huaxingao/spark-15749. (cherry picked from commit 62d2fa5e996d428caaea005041b17ec115473762) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2280ad8a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2280ad8a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2280ad8a Branch: refs/heads/branch-2.0 Commit: 2280ad8a3ddfff0b7cc10de6eadb2cc93423bbcf Parents: 27e274c Author: Huaxin Gao Authored: Thu Jun 16 14:37:10 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:37:19 2016 -0700 -- .../org/apache/spark/sql/execution/datasources/rules.scala | 5 +++-- .../test/scala/org/apache/spark/sql/sources/InsertSuite.scala | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2280ad8a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 7ac62fb..543389e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -78,8 +78,9 @@ private[sql] object PreInsertCastAndRename extends Rule[LogicalPlan] { // schema of the relation. 
if (l.output.size != child.output.size) { sys.error( -s"$l requires that the query in the SELECT clause of the INSERT INTO/OVERWRITE " + - s"statement generates the same number of columns as its schema.") +s"$l requires that the data to be inserted have the same number of columns as the " + + s"target table: target table has ${l.output.size} column(s) but " + + s"the inserted data has ${child.output.size} column(s).") } castAndRenameChildOutput(i, l.output, child) } http://git-wip-us.apache.org/repos/asf/spark/blob/2280ad8a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index bade41b..d717955 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -95,7 +95,7 @@ class InsertSuite extends DataSourceTest with SharedSQLContext { """.stripMargin) }.getMessage assert( - message.contains("generates the same number of columns as its schema"), + message.contains("requires that the data to be inserted have the same number of columns"), "SELECT clause generating a different number of columns should not be not allowed." ) } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
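For context on the new wording, here is a minimal stand-alone sketch of the arity check that produces it; `targetColumns` and `insertedColumns` are invented names for this illustration, not the analyzer's actual fields:

```scala
// Stand-alone sketch of the arity check behind the new message; the names
// targetColumns and insertedColumns are illustrative, not Spark's fields.
object InsertArityCheck {
  def validate(targetColumns: Seq[String], insertedColumns: Seq[String]): Unit = {
    if (targetColumns.size != insertedColumns.size) {
      sys.error(
        s"target table has ${targetColumns.size} column(s) but " +
          s"the inserted data has ${insertedColumns.size} column(s).")
    }
  }

  def main(args: Array[String]): Unit = {
    // Mirrors the JDBC example above: a two-column table given a three-value row.
    try validate(Seq("C1", "C2"), Seq("'abc'", "'def'", "1")) catch {
      case e: RuntimeException => println(e.getMessage)
    }
  }
}
```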
spark git commit: [SPARK-15868][WEB UI] Executors table in Executors tab should sort Executor IDs in numerical order
Repository: spark Updated Branches: refs/heads/branch-2.0 fb0fab63c -> 27e274c3e [SPARK-15868][WEB UI] Executors table in Executors tab should sort Executor IDs in numerical order ## What changes were proposed in this pull request? Currently the Executors table sorts by id using a string sort (since that's what it is stored as). Since the id is a number (other than the driver) we should be sorting numerically. I have changed both the initial sort on page load as well as the table sort to sort on id numerically, treating non-numeric strings (like the driver) as "-1" ## How was this patch tested? Manually tested and dev/run-tests ![pageload](https://cloud.githubusercontent.com/assets/13952758/16027882/d32edd0a-318e-11e6-9faf-fc972b7c36ab.png) ![sorted](https://cloud.githubusercontent.com/assets/13952758/16027883/d34541c6-318e-11e6-9ed7-6bfc0cd4152e.png) Author: Alex Bozarth Closes #13654 from ajbozarth/spark15868. (cherry picked from commit e849285df03b1233d5f647f1b6c5a6dad0665855) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/27e274c3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/27e274c3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/27e274c3 Branch: refs/heads/branch-2.0 Commit: 27e274c3e8cad29fc684a1611cef19d60acdfbc0 Parents: fb0fab6 Author: Alex Bozarth Authored: Thu Jun 16 14:29:11 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:29:21 2016 -0700 -- .../main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/27e274c3/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala index 791dbe5..67deb7b 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala @@ -20,6 +20,7 @@ package org.apache.spark.ui.exec import java.net.URLEncoder import javax.servlet.http.HttpServletRequest +import scala.util.Try import scala.xml.Node import org.apache.spark.status.api.v1.ExecutorSummary @@ -53,6 +54,9 @@ private[ui] class ExecutorsPage( // When GCTimePercent is edited change ToolTips.TASK_TIME to match private val GCTimePercent = 0.1 + // a safe String to Int for sorting ids (converts non-numeric Strings to -1) + private def idStrToInt(str: String) : Int = Try(str.toInt).getOrElse(-1) + def render(request: HttpServletRequest): Seq[Node] = { val (activeExecutorInfo, deadExecutorInfo) = listener.synchronized { // The follow codes should be protected by `listener` to make sure no executors will be @@ -69,13 +73,14 @@ private[ui] class ExecutorsPage( } val execInfo = activeExecutorInfo ++ deadExecutorInfo +implicit val idOrder = Ordering[Int].on((s: String) => idStrToInt(s)).reverse val execInfoSorted = execInfo.sortBy(_.id) val logsExist = execInfo.filter(_.executorLogs.nonEmpty).nonEmpty val execTable = { - Executor ID + Executor ID Address Status RDD Blocks @@ -136,7 +141,7 @@ private[ui] class ExecutorsPage( } - {info.id} + {info.id} {info.hostPort} {executorStatus} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
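The heart of the change is a safe string-to-int conversion used for sorting. A minimal, runnable sketch of that idea (the sample IDs are made up):

```scala
import scala.util.Try

object ExecutorIdSort {
  // Same idea as the idStrToInt helper in the diff: non-numeric IDs such as
  // "driver" convert to -1, so they sort together with the numeric executors.
  def idStrToInt(str: String): Int = Try(str.toInt).getOrElse(-1)

  def main(args: Array[String]): Unit = {
    val ids = Seq("10", "2", "driver", "1")
    // A plain string sort gives: 1, 10, 2, driver
    println(ids.sorted.mkString(", "))
    // Sorting on the parsed value gives: driver, 1, 2, 10
    println(ids.sortBy(idStrToInt).mkString(", "))
  }
}
```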
spark git commit: [SPARK-15868][WEB UI] Executors table in Executors tab should sort Executor IDs in numerical order
Repository: spark Updated Branches: refs/heads/master 2d27eb1e7 -> e849285df [SPARK-15868][WEB UI] Executors table in Executors tab should sort Executor IDs in numerical order ## What changes were proposed in this pull request? Currently the Executors table sorts by id using a string sort (since that's what it is stored as). Since the id is a number (other than the driver) we should be sorting numerically. I have changed both the initial sort on page load as well as the table sort to sort on id numerically, treating non-numeric strings (like the driver) as "-1" ## How was this patch tested? Manually tested and dev/run-tests ![pageload](https://cloud.githubusercontent.com/assets/13952758/16027882/d32edd0a-318e-11e6-9faf-fc972b7c36ab.png) ![sorted](https://cloud.githubusercontent.com/assets/13952758/16027883/d34541c6-318e-11e6-9ed7-6bfc0cd4152e.png) Author: Alex Bozarth Closes #13654 from ajbozarth/spark15868. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e849285d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e849285d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e849285d Branch: refs/heads/master Commit: e849285df03b1233d5f647f1b6c5a6dad0665855 Parents: 2d27eb1 Author: Alex Bozarth Authored: Thu Jun 16 14:29:11 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:29:11 2016 -0700 -- .../main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e849285d/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala index 791dbe5..67deb7b 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala @@ -20,6 +20,7 @@ package org.apache.spark.ui.exec import java.net.URLEncoder import javax.servlet.http.HttpServletRequest +import scala.util.Try import scala.xml.Node import org.apache.spark.status.api.v1.ExecutorSummary @@ -53,6 +54,9 @@ private[ui] class ExecutorsPage( // When GCTimePercent is edited change ToolTips.TASK_TIME to match private val GCTimePercent = 0.1 + // a safe String to Int for sorting ids (converts non-numeric Strings to -1) + private def idStrToInt(str: String) : Int = Try(str.toInt).getOrElse(-1) + def render(request: HttpServletRequest): Seq[Node] = { val (activeExecutorInfo, deadExecutorInfo) = listener.synchronized { // The follow codes should be protected by `listener` to make sure no executors will be @@ -69,13 +73,14 @@ private[ui] class ExecutorsPage( } val execInfo = activeExecutorInfo ++ deadExecutorInfo +implicit val idOrder = Ordering[Int].on((s: String) => idStrToInt(s)).reverse val execInfoSorted = execInfo.sortBy(_.id) val logsExist = execInfo.filter(_.executorLogs.nonEmpty).nonEmpty val execTable = { - Executor ID + Executor ID Address Status RDD Blocks @@ -136,7 +141,7 @@ private[ui] class ExecutorsPage( } - {info.id} + {info.id} {info.hostPort} {executorStatus} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
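On the rendering side, the patch wires that conversion into `sortBy` through an implicit `Ordering`. A self-contained sketch of the same pattern, with a made-up `ExecutorInfo` case class standing in for the real summary objects:

```scala
import scala.util.Try

object ExecutorOrderingSketch {
  // Simplified stand-in for the executor summary; only the id field matters here.
  case class ExecutorInfo(id: String)

  def idStrToInt(str: String): Int = Try(str.toInt).getOrElse(-1)

  def main(args: Array[String]): Unit = {
    val execInfo = Seq(ExecutorInfo("2"), ExecutorInfo("driver"), ExecutorInfo("10"), ExecutorInfo("1"))
    // As in the diff: an Ordering on the numeric value of the id string,
    // reversed so the highest executor IDs appear first on page load.
    implicit val idOrder: Ordering[String] = Ordering[Int].on((s: String) => idStrToInt(s)).reverse
    val execInfoSorted = execInfo.sortBy(_.id)
    println(execInfoSorted.map(_.id).mkString(", "))  // 10, 2, 1, driver
  }
}
```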
spark git commit: [MINOR][DOCS][SQL] Fix some comments about types(TypeCoercion, Partition) and exceptions.
Repository: spark Updated Branches: refs/heads/branch-2.0 7d8cddfb4 -> fb0fab63c [MINOR][DOCS][SQL] Fix some comments about types(TypeCoercion,Partition) and exceptions. ## What changes were proposed in this pull request? This PR contains a few changes on code comments. - `HiveTypeCoercion` is renamed into `TypeCoercion`. - `NoSuchDatabaseException` is only used for the absence of database. - For partition type inference, only `DoubleType` is considered. ## How was this patch tested? N/A Author: Dongjoon Hyun Closes #13674 from dongjoon-hyun/minor_doc_types. (cherry picked from commit 2d27eb1e753daefbd311136fc7de1a3e8fb9dc63) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fb0fab63 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fb0fab63 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fb0fab63 Branch: refs/heads/branch-2.0 Commit: fb0fab63cb005d9efc624aeb0ac85476a9ddc4f4 Parents: 7d8cddf Author: Dongjoon Hyun Authored: Thu Jun 16 14:27:09 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:27:17 2016 -0700 -- .../org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala| 4 ++-- .../org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala | 2 +- .../src/main/scala/org/apache/spark/sql/types/Decimal.scala | 2 +- .../spark/sql/execution/datasources/PartitioningUtils.scala | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fb0fab63/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 16df628..baec6d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -73,7 +73,7 @@ object TypeCoercion { DoubleType) /** - * Case 1 type widening (see the classdoc comment above for HiveTypeCoercion). + * Case 1 type widening (see the classdoc comment above for TypeCoercion). * * Find the tightest common type of two types that might be used in a binary expression. * This handles all numeric types except fixed-precision decimals interacting with each other or @@ -132,7 +132,7 @@ object TypeCoercion { } /** - * Case 2 type widening (see the classdoc comment above for HiveTypeCoercion). + * Case 2 type widening (see the classdoc comment above for TypeCoercion). * * i.e. the main difference with [[findTightestCommonTypeOfTwo]] is that here we allow some * loss of precision when widening decimal and double. http://git-wip-us.apache.org/repos/asf/spark/blob/fb0fab63/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala index 81974b2..6714846 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException * can be accessed in multiple threads. This is an external catalog because it is expected to * interact with external systems. 
* - * Implementations should throw [[NoSuchDatabaseException]] when table or database don't exist. + * Implementations should throw [[NoSuchDatabaseException]] when databases don't exist. */ abstract class ExternalCatalog { import CatalogTypes.TablePartitionSpec http://git-wip-us.apache.org/repos/asf/spark/blob/fb0fab63/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala index 52e0210..cc8175c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala @@ -322,7 +322,7 @@ final class Decimal extends Ordered[Decimal] with Serializable { } } - // HiveTypeCoercion will take care of the precision, scale of result + // TypeCoercion will take care of the precision, scale of result def * (that: Decima
spark git commit: [MINOR][DOCS][SQL] Fix some comments about types(TypeCoercion, Partition) and exceptions.
Repository: spark Updated Branches: refs/heads/master 796429d71 -> 2d27eb1e7 [MINOR][DOCS][SQL] Fix some comments about types(TypeCoercion,Partition) and exceptions. ## What changes were proposed in this pull request? This PR contains a few changes on code comments. - `HiveTypeCoercion` is renamed into `TypeCoercion`. - `NoSuchDatabaseException` is only used for the absence of database. - For partition type inference, only `DoubleType` is considered. ## How was this patch tested? N/A Author: Dongjoon Hyun Closes #13674 from dongjoon-hyun/minor_doc_types. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2d27eb1e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2d27eb1e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2d27eb1e Branch: refs/heads/master Commit: 2d27eb1e753daefbd311136fc7de1a3e8fb9dc63 Parents: 796429d Author: Dongjoon Hyun Authored: Thu Jun 16 14:27:09 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:27:09 2016 -0700 -- .../org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala| 4 ++-- .../org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala | 2 +- .../src/main/scala/org/apache/spark/sql/types/Decimal.scala | 2 +- .../spark/sql/execution/datasources/PartitioningUtils.scala | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2d27eb1e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 16df628..baec6d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -73,7 +73,7 @@ object TypeCoercion { DoubleType) /** - * Case 1 type widening (see the classdoc comment above for HiveTypeCoercion). + * Case 1 type widening (see the classdoc comment above for TypeCoercion). * * Find the tightest common type of two types that might be used in a binary expression. * This handles all numeric types except fixed-precision decimals interacting with each other or @@ -132,7 +132,7 @@ object TypeCoercion { } /** - * Case 2 type widening (see the classdoc comment above for HiveTypeCoercion). + * Case 2 type widening (see the classdoc comment above for TypeCoercion). * * i.e. the main difference with [[findTightestCommonTypeOfTwo]] is that here we allow some * loss of precision when widening decimal and double. http://git-wip-us.apache.org/repos/asf/spark/blob/2d27eb1e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala index 81974b2..6714846 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException * can be accessed in multiple threads. This is an external catalog because it is expected to * interact with external systems. * - * Implementations should throw [[NoSuchDatabaseException]] when table or database don't exist. 
+ * Implementations should throw [[NoSuchDatabaseException]] when databases don't exist. */ abstract class ExternalCatalog { import CatalogTypes.TablePartitionSpec http://git-wip-us.apache.org/repos/asf/spark/blob/2d27eb1e/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala index 52e0210..cc8175c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala @@ -322,7 +322,7 @@ final class Decimal extends Ordered[Decimal] with Serializable { } } - // HiveTypeCoercion will take care of the precision, scale of result + // TypeCoercion will take care of the precision, scale of result def * (that: Decimal): Decimal = Decimal(toJavaBigDecimal.multiply(that.toJavaBigDecimal, MATH_CONTEXT)) http://git
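For readers unfamiliar with the "case 1 widening" these comments refer to, here is a toy illustration of the idea using a simple numeric precedence list. It is not Spark's actual `TypeCoercion` code; the type names are plain strings and the rule is deliberately simplified:

```scala
// Illustrative only: tightest common type of two numeric types is the one
// that appears later in a precedence list, as the classdoc describes.
object WideningSketch {
  val numericPrecedence =
    Seq("ByteType", "ShortType", "IntegerType", "LongType", "FloatType", "DoubleType")

  def findTightestCommonType(t1: String, t2: String): Option[String] = {
    if (numericPrecedence.contains(t1) && numericPrecedence.contains(t2)) {
      Some(numericPrecedence.filter(t => t == t1 || t == t2).last)
    } else {
      None  // non-numeric combinations are handled by other rules in the real code
    }
  }

  def main(args: Array[String]): Unit = {
    println(findTightestCommonType("IntegerType", "LongType"))   // Some(LongType)
    println(findTightestCommonType("DoubleType", "ShortType"))   // Some(DoubleType)
    println(findTightestCommonType("IntegerType", "StringType")) // None
  }
}
```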
spark git commit: [SPARK-15998][SQL] Verification of SQLConf HIVE_METASTORE_PARTITION_PRUNING
Repository: spark Updated Branches: refs/heads/branch-2.0 1230516d9 -> 7d8cddfb4 [SPARK-15998][SQL] Verification of SQLConf HIVE_METASTORE_PARTITION_PRUNING What changes were proposed in this pull request? `HIVE_METASTORE_PARTITION_PRUNING` is a public `SQLConf`. When `true`, some predicates will be pushed down into the Hive metastore so that unmatching partitions can be eliminated earlier. The current default value is `false`. For performance improvement, users might turn this parameter on. So far, the code base does not have such a test case to verify whether this `SQLConf` properly works. This PR is to improve the test case coverage for avoiding future regression. How was this patch tested? N/A Author: gatorsmile Closes #13716 from gatorsmile/addTestMetastorePartitionPruning. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7d8cddfb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7d8cddfb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7d8cddfb Branch: refs/heads/branch-2.0 Commit: 7d8cddfb495d406b9f2fb5216edd14dea442ec73 Parents: 1230516 Author: gatorsmile Authored: Thu Jun 16 14:23:17 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:26:46 2016 -0700 -- .../sql/hive/execution/HiveTableScanSuite.scala | 60 +++- 1 file changed, 57 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7d8cddfb/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala index 60f8be5..76d3f3d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala @@ -18,13 +18,14 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.sql.Row -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.hive.test.{TestHive, TestHiveSingleton} import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.util.Utils -class HiveTableScanSuite extends HiveComparisonTest { +class HiveTableScanSuite extends HiveComparisonTest with SQLTestUtils with TestHiveSingleton { createQueryTest("partition_based_table_scan_with_different_serde", """ @@ -89,4 +90,57 @@ class HiveTableScanSuite extends HiveComparisonTest { assert(sql("select CaseSensitiveColName from spark_4959_2").head() === Row("hi")) assert(sql("select casesensitivecolname from spark_4959_2").head() === Row("hi")) } + + private def checkNumScannedPartitions(stmt: String, expectedNumParts: Int): Unit = { +val plan = sql(stmt).queryExecution.sparkPlan +val numPartitions = plan.collectFirst { + case p: HiveTableScanExec => +p.relation.getHiveQlPartitions(p.partitionPruningPred).length +}.getOrElse(0) +assert(numPartitions == expectedNumParts) + } + + test("Verify SQLConf HIVE_METASTORE_PARTITION_PRUNING") { +val view = "src" +withTempTable(view) { + spark.range(1, 5).createOrReplaceTempView(view) + val table = "table_with_partition" + withTable(table) { +sql( + s""" + |CREATE TABLE $table(id string) + |PARTITIONED BY (p1 string,p2 string,p3 string,p4 string,p5 string) + 
""".stripMargin) +sql( + s""" + |FROM $view v + |INSERT INTO TABLE $table + |PARTITION (p1='a',p2='b',p3='c',p4='d',p5='e') + |SELECT v.id + |INSERT INTO TABLE $table + |PARTITION (p1='a',p2='c',p3='c',p4='d',p5='e') + |SELECT v.id + """.stripMargin) + +Seq("true", "false").foreach { hivePruning => + withSQLConf(SQLConf.HIVE_METASTORE_PARTITION_PRUNING.key -> hivePruning) { +// If the pruning predicate is used, getHiveQlPartitions should only return the +// qualified partition; Otherwise, it return all the partitions. +val expectedNumPartitions = if (hivePruning == "true") 1 else 2 +checkNumScannedPartitions( + stmt = s"SELECT id, p2 FROM $table WHERE p2 <= 'b'", expectedNumPartitions) + } +} + +Seq("true", "false").foreach { hivePruning => + withSQLConf(SQLConf.HIVE_METASTORE_PARTITION_PRUNING.key -> hivePruning) { +
spark git commit: [SPARK-15998][SQL] Verification of SQLConf HIVE_METASTORE_PARTITION_PRUNING
Repository: spark Updated Branches: refs/heads/master 7a89f2adb -> 796429d71 [SPARK-15998][SQL] Verification of SQLConf HIVE_METASTORE_PARTITION_PRUNING What changes were proposed in this pull request? `HIVE_METASTORE_PARTITION_PRUNING` is a public `SQLConf`. When `true`, some predicates will be pushed down into the Hive metastore so that unmatching partitions can be eliminated earlier. The current default value is `false`. For performance improvement, users might turn this parameter on. So far, the code base does not have such a test case to verify whether this `SQLConf` properly works. This PR is to improve the test case coverage for avoiding future regression. How was this patch tested? N/A Author: gatorsmile Closes #13716 from gatorsmile/addTestMetastorePartitionPruning. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/796429d7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/796429d7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/796429d7 Branch: refs/heads/master Commit: 796429d7117e2544207bd9d67bda8b603cb1a535 Parents: 7a89f2a Author: gatorsmile Authored: Thu Jun 16 14:23:17 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:23:17 2016 -0700 -- .../sql/hive/execution/HiveTableScanSuite.scala | 60 +++- 1 file changed, 57 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/796429d7/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala index 60f8be5..76d3f3d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala @@ -18,13 +18,14 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.sql.Row -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.hive.test.{TestHive, TestHiveSingleton} import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.util.Utils -class HiveTableScanSuite extends HiveComparisonTest { +class HiveTableScanSuite extends HiveComparisonTest with SQLTestUtils with TestHiveSingleton { createQueryTest("partition_based_table_scan_with_different_serde", """ @@ -89,4 +90,57 @@ class HiveTableScanSuite extends HiveComparisonTest { assert(sql("select CaseSensitiveColName from spark_4959_2").head() === Row("hi")) assert(sql("select casesensitivecolname from spark_4959_2").head() === Row("hi")) } + + private def checkNumScannedPartitions(stmt: String, expectedNumParts: Int): Unit = { +val plan = sql(stmt).queryExecution.sparkPlan +val numPartitions = plan.collectFirst { + case p: HiveTableScanExec => +p.relation.getHiveQlPartitions(p.partitionPruningPred).length +}.getOrElse(0) +assert(numPartitions == expectedNumParts) + } + + test("Verify SQLConf HIVE_METASTORE_PARTITION_PRUNING") { +val view = "src" +withTempTable(view) { + spark.range(1, 5).createOrReplaceTempView(view) + val table = "table_with_partition" + withTable(table) { +sql( + s""" + |CREATE TABLE $table(id string) + |PARTITIONED BY (p1 string,p2 string,p3 string,p4 string,p5 string) + """.stripMargin) +sql( 
+ s""" + |FROM $view v + |INSERT INTO TABLE $table + |PARTITION (p1='a',p2='b',p3='c',p4='d',p5='e') + |SELECT v.id + |INSERT INTO TABLE $table + |PARTITION (p1='a',p2='c',p3='c',p4='d',p5='e') + |SELECT v.id + """.stripMargin) + +Seq("true", "false").foreach { hivePruning => + withSQLConf(SQLConf.HIVE_METASTORE_PARTITION_PRUNING.key -> hivePruning) { +// If the pruning predicate is used, getHiveQlPartitions should only return the +// qualified partition; Otherwise, it return all the partitions. +val expectedNumPartitions = if (hivePruning == "true") 1 else 2 +checkNumScannedPartitions( + stmt = s"SELECT id, p2 FROM $table WHERE p2 <= 'b'", expectedNumPartitions) + } +} + +Seq("true", "false").foreach { hivePruning => + withSQLConf(SQLConf.HIVE_METASTORE_PARTITION_PRUNING.key -> hivePruning) { +
spark git commit: [SQL] Minor HashAggregateExec string output fixes
Repository: spark Updated Branches: refs/heads/branch-2.0 938988757 -> 1230516d9 [SQL] Minor HashAggregateExec string output fixes ## What changes were proposed in this pull request? This PR fixes some minor `.toString` format issues for `HashAggregateExec`. Before: ``` *HashAggregate(key=[a#234L,b#235L], functions=[count(1),max(c#236L)], output=[a#234L,b#235L,count(c)#247L,max(c)#248L]) ``` After: ``` *HashAggregate(keys=[a#234L, b#235L], functions=[count(1), max(c#236L)], output=[a#234L, b#235L, count(c)#247L, max(c)#248L]) ``` ## How was this patch tested? Manually tested. Author: Cheng Lian Closes #13710 from liancheng/minor-agg-string-fix. (cherry picked from commit 7a89f2adbbc82a23f06638806ffc8596a7efe7f3) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1230516d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1230516d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1230516d Branch: refs/heads/branch-2.0 Commit: 1230516d9314f55183bfa542eb7cdfac9d8dfec5 Parents: 9389887 Author: Cheng Lian Authored: Thu Jun 16 14:20:44 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:20:52 2016 -0700 -- .../spark/sql/execution/aggregate/HashAggregateExec.scala | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1230516d/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index caeeba1..54d7340 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -774,13 +774,13 @@ case class HashAggregateExec( testFallbackStartsAt match { case None => -val keyString = Utils.truncatedString(groupingExpressions, "[", ",", "]") -val functionString = Utils.truncatedString(allAggregateExpressions, "[", ",", "]") -val outputString = Utils.truncatedString(output, "[", ",", "]") +val keyString = Utils.truncatedString(groupingExpressions, "[", ", ", "]") +val functionString = Utils.truncatedString(allAggregateExpressions, "[", ", ", "]") +val outputString = Utils.truncatedString(output, "[", ", ", "]") if (verbose) { - s"HashAggregate(key=$keyString, functions=$functionString, output=$outputString)" + s"HashAggregate(keys=$keyString, functions=$functionString, output=$outputString)" } else { - s"HashAggregate(key=$keyString, functions=$functionString)" + s"HashAggregate(keys=$keyString, functions=$functionString)" } case Some(fallbackStartsAt) => s"HashAggregateWithControlledFallback $groupingExpressions " + - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SQL] Minor HashAggregateExec string output fixes
Repository: spark Updated Branches: refs/heads/master acef843f6 -> 7a89f2adb [SQL] Minor HashAggregateExec string output fixes ## What changes were proposed in this pull request? This PR fixes some minor `.toString` format issues for `HashAggregateExec`. Before: ``` *HashAggregate(key=[a#234L,b#235L], functions=[count(1),max(c#236L)], output=[a#234L,b#235L,count(c)#247L,max(c)#248L]) ``` After: ``` *HashAggregate(keys=[a#234L, b#235L], functions=[count(1), max(c#236L)], output=[a#234L, b#235L, count(c)#247L, max(c)#248L]) ``` ## How was this patch tested? Manually tested. Author: Cheng Lian Closes #13710 from liancheng/minor-agg-string-fix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7a89f2ad Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7a89f2ad Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7a89f2ad Branch: refs/heads/master Commit: 7a89f2adbbc82a23f06638806ffc8596a7efe7f3 Parents: acef843 Author: Cheng Lian Authored: Thu Jun 16 14:20:44 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:20:44 2016 -0700 -- .../spark/sql/execution/aggregate/HashAggregateExec.scala | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7a89f2ad/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index caeeba1..54d7340 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -774,13 +774,13 @@ case class HashAggregateExec( testFallbackStartsAt match { case None => -val keyString = Utils.truncatedString(groupingExpressions, "[", ",", "]") -val functionString = Utils.truncatedString(allAggregateExpressions, "[", ",", "]") -val outputString = Utils.truncatedString(output, "[", ",", "]") +val keyString = Utils.truncatedString(groupingExpressions, "[", ", ", "]") +val functionString = Utils.truncatedString(allAggregateExpressions, "[", ", ", "]") +val outputString = Utils.truncatedString(output, "[", ", ", "]") if (verbose) { - s"HashAggregate(key=$keyString, functions=$functionString, output=$outputString)" + s"HashAggregate(keys=$keyString, functions=$functionString, output=$outputString)" } else { - s"HashAggregate(key=$keyString, functions=$functionString)" + s"HashAggregate(keys=$keyString, functions=$functionString)" } case Some(fallbackStartsAt) => s"HashAggregateWithControlledFallback $groupingExpressions " + - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
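The whole patch boils down to the separator passed to the string-building helper, plus the `key=` to `keys=` label. An illustrative stand-in (a simplified `truncatedString` that does not truncate) reproduces the before/after output from the commit message:

```scala
// Simplified stand-in for the Utils.truncatedString calls in the diff; the
// real helper also truncates very long sequences, which is omitted here.
object AggStringSketch {
  def truncatedString(seq: Seq[String], start: String, sep: String, end: String): String =
    seq.mkString(start, sep, end)

  def main(args: Array[String]): Unit = {
    val keys  = Seq("a#234L", "b#235L")
    val funcs = Seq("count(1)", "max(c#236L)")
    val before = s"HashAggregate(key=${truncatedString(keys, "[", ",", "]")}, " +
      s"functions=${truncatedString(funcs, "[", ",", "]")})"
    val after = s"HashAggregate(keys=${truncatedString(keys, "[", ", ", "]")}, " +
      s"functions=${truncatedString(funcs, "[", ", ", "]")})"
    println(before)  // the cramped output shown in "Before"
    println(after)   // the spaced output shown in "After"
  }
}
```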
spark git commit: [SPARK-15975] Fix improper Popen retcode code handling in dev/run-tests
Repository: spark Updated Branches: refs/heads/branch-1.6 cffc0800b -> 0a8ada506 [SPARK-15975] Fix improper Popen retcode code handling in dev/run-tests In the `dev/run-tests.py` script we check a `Popen.retcode` for success using `retcode > 0`, but this is subtlety wrong because Popen's return code will be negative if the child process was terminated by a signal: https://docs.python.org/2/library/subprocess.html#subprocess.Popen.returncode In order to properly handle signals, we should change this to check `retcode != 0` instead. Author: Josh Rosen Closes #13692 from JoshRosen/dev-run-tests-return-code-handling. (cherry picked from commit acef843f67e770f0a2709fb3fbd1a53c200b2bc5) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0a8ada50 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0a8ada50 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0a8ada50 Branch: refs/heads/branch-1.6 Commit: 0a8ada5064bec22116363f93ed476352776b49e4 Parents: cffc080 Author: Josh Rosen Authored: Thu Jun 16 14:18:58 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:19:19 2016 -0700 -- dev/run-tests.py | 2 +- dev/sparktestsupport/shellutils.py | 5 - 2 files changed, 5 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0a8ada50/dev/run-tests.py -- diff --git a/dev/run-tests.py b/dev/run-tests.py index 4a18d1a..e42e073 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -284,7 +284,7 @@ def exec_sbt(sbt_args=()): print(line, end='') retcode = sbt_proc.wait() -if retcode > 0: +if retcode != 0: exit_from_command_with_retcode(sbt_cmd, retcode) http://git-wip-us.apache.org/repos/asf/spark/blob/0a8ada50/dev/sparktestsupport/shellutils.py -- diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py index d280e79..05af871 100644 --- a/dev/sparktestsupport/shellutils.py +++ b/dev/sparktestsupport/shellutils.py @@ -53,7 +53,10 @@ else: def exit_from_command_with_retcode(cmd, retcode): -print("[error] running", ' '.join(cmd), "; received return code", retcode) +if retcode < 0: +print("[error] running", ' '.join(cmd), "; process was terminated by signal", -retcode) +else: +print("[error] running", ' '.join(cmd), "; received return code", retcode) sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15975] Fix improper Popen retcode code handling in dev/run-tests
Repository: spark Updated Branches: refs/heads/branch-2.0 d9dd46edd -> 938988757 [SPARK-15975] Fix improper Popen retcode code handling in dev/run-tests In the `dev/run-tests.py` script we check a `Popen.retcode` for success using `retcode > 0`, but this is subtlety wrong because Popen's return code will be negative if the child process was terminated by a signal: https://docs.python.org/2/library/subprocess.html#subprocess.Popen.returncode In order to properly handle signals, we should change this to check `retcode != 0` instead. Author: Josh Rosen Closes #13692 from JoshRosen/dev-run-tests-return-code-handling. (cherry picked from commit acef843f67e770f0a2709fb3fbd1a53c200b2bc5) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/93898875 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/93898875 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/93898875 Branch: refs/heads/branch-2.0 Commit: 9389887571705e03d18e695301f0cb0aa5bd9e21 Parents: d9dd46e Author: Josh Rosen Authored: Thu Jun 16 14:18:58 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:19:08 2016 -0700 -- dev/run-tests.py | 2 +- dev/sparktestsupport/shellutils.py | 5 - 2 files changed, 5 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/93898875/dev/run-tests.py -- diff --git a/dev/run-tests.py b/dev/run-tests.py index dcf1be9..930d7f8 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -294,7 +294,7 @@ def exec_sbt(sbt_args=()): print(line, end='') retcode = sbt_proc.wait() -if retcode > 0: +if retcode != 0: exit_from_command_with_retcode(sbt_cmd, retcode) http://git-wip-us.apache.org/repos/asf/spark/blob/93898875/dev/sparktestsupport/shellutils.py -- diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py index d280e79..05af871 100644 --- a/dev/sparktestsupport/shellutils.py +++ b/dev/sparktestsupport/shellutils.py @@ -53,7 +53,10 @@ else: def exit_from_command_with_retcode(cmd, retcode): -print("[error] running", ' '.join(cmd), "; received return code", retcode) +if retcode < 0: +print("[error] running", ' '.join(cmd), "; process was terminated by signal", -retcode) +else: +print("[error] running", ' '.join(cmd), "; received return code", retcode) sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15975] Fix improper Popen retcode code handling in dev/run-tests
Repository: spark Updated Branches: refs/heads/branch-1.5 6043fa8df -> 1891e04a6 [SPARK-15975] Fix improper Popen retcode code handling in dev/run-tests In the `dev/run-tests.py` script we check a `Popen.retcode` for success using `retcode > 0`, but this is subtlety wrong because Popen's return code will be negative if the child process was terminated by a signal: https://docs.python.org/2/library/subprocess.html#subprocess.Popen.returncode In order to properly handle signals, we should change this to check `retcode != 0` instead. Author: Josh Rosen Closes #13692 from JoshRosen/dev-run-tests-return-code-handling. (cherry picked from commit acef843f67e770f0a2709fb3fbd1a53c200b2bc5) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1891e04a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1891e04a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1891e04a Branch: refs/heads/branch-1.5 Commit: 1891e04a6441606f9bb14cf39f06a7d39cce456b Parents: 6043fa8 Author: Josh Rosen Authored: Thu Jun 16 14:18:58 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:19:32 2016 -0700 -- dev/run-tests.py | 2 +- dev/sparktestsupport/shellutils.py | 5 - 2 files changed, 5 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1891e04a/dev/run-tests.py -- diff --git a/dev/run-tests.py b/dev/run-tests.py index 623b93c..bc54968 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -255,7 +255,7 @@ def exec_sbt(sbt_args=()): print(line, end='') retcode = sbt_proc.wait() -if retcode > 0: +if retcode != 0: exit_from_command_with_retcode(sbt_cmd, retcode) http://git-wip-us.apache.org/repos/asf/spark/blob/1891e04a/dev/sparktestsupport/shellutils.py -- diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py index 12bd0bf..af483a9 100644 --- a/dev/sparktestsupport/shellutils.py +++ b/dev/sparktestsupport/shellutils.py @@ -23,7 +23,10 @@ import sys def exit_from_command_with_retcode(cmd, retcode): -print("[error] running", ' '.join(cmd), "; received return code", retcode) +if retcode < 0: +print("[error] running", ' '.join(cmd), "; process was terminated by signal", -retcode) +else: +print("[error] running", ' '.join(cmd), "; received return code", retcode) sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15975] Fix improper Popen retcode code handling in dev/run-tests
Repository: spark Updated Branches: refs/heads/master bbad4cb48 -> acef843f6 [SPARK-15975] Fix improper Popen retcode code handling in dev/run-tests In the `dev/run-tests.py` script we check a `Popen.retcode` for success using `retcode > 0`, but this is subtlety wrong because Popen's return code will be negative if the child process was terminated by a signal: https://docs.python.org/2/library/subprocess.html#subprocess.Popen.returncode In order to properly handle signals, we should change this to check `retcode != 0` instead. Author: Josh Rosen Closes #13692 from JoshRosen/dev-run-tests-return-code-handling. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/acef843f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/acef843f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/acef843f Branch: refs/heads/master Commit: acef843f67e770f0a2709fb3fbd1a53c200b2bc5 Parents: bbad4cb Author: Josh Rosen Authored: Thu Jun 16 14:18:58 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:18:58 2016 -0700 -- dev/run-tests.py | 2 +- dev/sparktestsupport/shellutils.py | 5 - 2 files changed, 5 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/acef843f/dev/run-tests.py -- diff --git a/dev/run-tests.py b/dev/run-tests.py index dcf1be9..930d7f8 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -294,7 +294,7 @@ def exec_sbt(sbt_args=()): print(line, end='') retcode = sbt_proc.wait() -if retcode > 0: +if retcode != 0: exit_from_command_with_retcode(sbt_cmd, retcode) http://git-wip-us.apache.org/repos/asf/spark/blob/acef843f/dev/sparktestsupport/shellutils.py -- diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py index d280e79..05af871 100644 --- a/dev/sparktestsupport/shellutils.py +++ b/dev/sparktestsupport/shellutils.py @@ -53,7 +53,10 @@ else: def exit_from_command_with_retcode(cmd, retcode): -print("[error] running", ' '.join(cmd), "; received return code", retcode) +if retcode < 0: +print("[error] running", ' '.join(cmd), "; process was terminated by signal", -retcode) +else: +print("[error] running", ' '.join(cmd), "; received return code", retcode) sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15978][SQL] improve 'show tables' command related codes
Repository: spark Updated Branches: refs/heads/branch-2.0 095ddb4c9 -> d9dd46edd [SPARK-15978][SQL] improve 'show tables' command related codes ## What changes were proposed in this pull request? I've found some minor issues in "show tables" command: 1. In the `SessionCatalog.scala`, `listTables(db: String)` method will call `listTables(formatDatabaseName(db), "*")` to list all the tables for certain db, but in the method `listTables(db: String, pattern: String)`, this db name is formatted once more. So I think we should remove `formatDatabaseName()` in the caller. 2. I suggest to add sort to listTables(db: String) in InMemoryCatalog.scala, just like listDatabases(). ## How was this patch tested? The existing test cases should cover it. Author: bomeng Closes #13695 from bomeng/SPARK-15978. (cherry picked from commit bbad4cb48df2ac3ed7edb4c02db79540bd4085d8) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d9dd46ed Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d9dd46ed Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d9dd46ed Branch: refs/heads/branch-2.0 Commit: d9dd46edd3635ed79134a1521403c4478a34d3b3 Parents: 095ddb4 Author: bomeng Authored: Thu Jun 16 14:18:02 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:18:12 2016 -0700 -- .../org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala| 2 +- .../org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d9dd46ed/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index 14da30a..fb3e1b3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -286,7 +286,7 @@ class InMemoryCatalog(hadoopConfig: Configuration = new Configuration) extends E override def listTables(db: String): Seq[String] = synchronized { requireDbExists(db) -catalog(db).tables.keySet.toSeq +catalog(db).tables.keySet.toSeq.sorted } override def listTables(db: String, pattern: String): Seq[String] = synchronized { http://git-wip-us.apache.org/repos/asf/spark/blob/d9dd46ed/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 1ec1bb1..7ab10d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -445,7 +445,7 @@ class SessionCatalog( /** * List all tables in the specified database, including temporary tables. */ - def listTables(db: String): Seq[TableIdentifier] = listTables(formatDatabaseName(db), "*") + def listTables(db: String): Seq[TableIdentifier] = listTables(db, "*") /** * List all matching tables in the specified database, including temporary tables. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15978][SQL] improve 'show tables' command related codes
Repository: spark Updated Branches: refs/heads/master 457126e42 -> bbad4cb48 [SPARK-15978][SQL] improve 'show tables' command related codes ## What changes were proposed in this pull request? I've found some minor issues in "show tables" command: 1. In the `SessionCatalog.scala`, `listTables(db: String)` method will call `listTables(formatDatabaseName(db), "*")` to list all the tables for certain db, but in the method `listTables(db: String, pattern: String)`, this db name is formatted once more. So I think we should remove `formatDatabaseName()` in the caller. 2. I suggest to add sort to listTables(db: String) in InMemoryCatalog.scala, just like listDatabases(). ## How was this patch tested? The existing test cases should cover it. Author: bomeng Closes #13695 from bomeng/SPARK-15978. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bbad4cb4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bbad4cb4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bbad4cb4 Branch: refs/heads/master Commit: bbad4cb48df2ac3ed7edb4c02db79540bd4085d8 Parents: 457126e Author: bomeng Authored: Thu Jun 16 14:18:02 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:18:02 2016 -0700 -- .../org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala| 2 +- .../org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bbad4cb4/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index 14da30a..fb3e1b3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -286,7 +286,7 @@ class InMemoryCatalog(hadoopConfig: Configuration = new Configuration) extends E override def listTables(db: String): Seq[String] = synchronized { requireDbExists(db) -catalog(db).tables.keySet.toSeq +catalog(db).tables.keySet.toSeq.sorted } override def listTables(db: String, pattern: String): Seq[String] = synchronized { http://git-wip-us.apache.org/repos/asf/spark/blob/bbad4cb4/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 1ec1bb1..7ab10d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -445,7 +445,7 @@ class SessionCatalog( /** * List all tables in the specified database, including temporary tables. */ - def listTables(db: String): Seq[TableIdentifier] = listTables(formatDatabaseName(db), "*") + def listTables(db: String): Seq[TableIdentifier] = listTables(db, "*") /** * List all matching tables in the specified database, including temporary tables. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
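A toy sketch of the two tweaks together: normalize the database name once in the two-argument method, and return table names sorted the way `listDatabases()` already does. The names below are simplified stand-ins, not Spark's actual catalog API:

```scala
object CatalogListingSketch {
  // Toy in-memory catalog keyed by normalized database name.
  private val catalog = Map("db2" -> Set("tbl2", "tbl1", "anotherTable"))

  private def formatDatabaseName(db: String): String = db.toLowerCase

  def listTables(db: String, pattern: String): Seq[String] = {
    val name = formatDatabaseName(db)  // normalization happens here, exactly once
    catalog(name).toSeq.sorted         // sorted, like listDatabases()
  }

  // The one-argument overload simply delegates without re-formatting the name.
  def listTables(db: String): Seq[String] = listTables(db, "*")

  def main(args: Array[String]): Unit = {
    println(listTables("DB2").mkString(", "))  // anotherTable, tbl1, tbl2
  }
}
```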
spark git commit: [SPARK-15736][CORE][BRANCH-1.6] Gracefully handle loss of DiskStore files
Repository: spark Updated Branches: refs/heads/branch-1.6 0a13e4c07 -> 4259a2858 [SPARK-15736][CORE][BRANCH-1.6] Gracefully handle loss of DiskStore files If an RDD partition is cached on disk and the DiskStore file is lost, then reads of that cached partition will fail and the missing partition is supposed to be recomputed by a new task attempt. In the current BlockManager implementation, however, the missing file does not trigger any metadata updates / does not invalidate the cache, so subsequent task attempts will be scheduled on the same executor and the doomed read will be repeatedly retried, leading to repeated task failures and eventually a total job failure. In order to fix this problem, the executor with the missing file needs to properly mark the corresponding block as missing so that it stops advertising itself as a cache location for that block. This patch fixes this bug and adds an end-to-end regression test (in `FailureSuite`) and a set of unit tests (`in BlockManagerSuite`). This is a branch-1.6 backport of #13473. Author: Josh Rosen Closes #13479 from JoshRosen/handle-missing-cache-files-branch-1.6. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4259a285 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4259a285 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4259a285 Branch: refs/heads/branch-1.6 Commit: 4259a28588a4dceb55d7bf1bf9327065dd751863 Parents: 0a13e4c Author: Josh Rosen Authored: Thu Jun 2 17:47:31 2016 -0700 Committer: Andrew Or Committed: Thu Jun 2 17:47:31 2016 -0700 -- .../org/apache/spark/storage/BlockManager.scala | 13 --- .../scala/org/apache/spark/FailureSuite.scala | 12 ++ .../spark/storage/BlockManagerSuite.scala | 41 3 files changed, 61 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4259a285/core/src/main/scala/org/apache/spark/storage/BlockManager.scala -- diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 288f756..339ee144 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -507,11 +507,14 @@ private[spark] class BlockManager( // Look for block on disk, potentially storing it back in memory if required if (level.useDisk) { logDebug(s"Getting block $blockId from disk") - val bytes: ByteBuffer = diskStore.getBytes(blockId) match { -case Some(b) => b -case None => - throw new BlockException( -blockId, s"Block $blockId not found on disk, though it should be") + val bytes: ByteBuffer = if (diskStore.contains(blockId)) { +// DiskStore.getBytes() always returns Some, so this .get() is guaranteed to be safe +diskStore.getBytes(blockId).get + } else { +// Remove the missing block so that its unavailability is reported to the driver +removeBlock(blockId) +throw new BlockException( + blockId, s"Block $blockId not found on disk, though it should be") } assert(0 == bytes.position()) http://git-wip-us.apache.org/repos/asf/spark/blob/4259a285/core/src/test/scala/org/apache/spark/FailureSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/FailureSuite.scala b/core/src/test/scala/org/apache/spark/FailureSuite.scala index 203dab9..85983b2 100644 --- a/core/src/test/scala/org/apache/spark/FailureSuite.scala +++ b/core/src/test/scala/org/apache/spark/FailureSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark +import 
org.apache.spark.storage.StorageLevel import org.apache.spark.util.NonSerializable import java.io.{IOException, NotSerializableException, ObjectInputStream} @@ -238,6 +239,17 @@ class FailureSuite extends SparkFunSuite with LocalSparkContext { FailureSuiteState.clear() } + test("failure because cached RDD files are missing") { +sc = new SparkContext("local[1,2]", "test") +val rdd = sc.parallelize(1 to 2, 2).persist(StorageLevel.DISK_ONLY) +rdd.count() +// Directly delete all files from the disk store, triggering failures when reading cached data: + SparkEnv.get.blockManager.diskBlockManager.getAllFiles().foreach(_.delete()) +// Each task should fail once due to missing cached data, but then should succeed on its second +// attempt because the missing cache locations will be purged and the blocks will be recomputed. +rdd.count() + } + // TODO: Ne
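The fix amounts to: if the backing file is gone, drop the block's metadata before raising the error, so the block stops being advertised as cached and the partition gets recomputed elsewhere. A toy, self-contained sketch of that behaviour; `DiskBlockStore` and its methods are invented for the illustration:

```scala
import java.io.File
import java.nio.file.Files

object DiskBlockStore {
  private val blocks = scala.collection.mutable.Map[String, File]()

  def put(blockId: String, data: Array[Byte]): Unit = {
    val f = Files.createTempFile("block-", blockId).toFile
    Files.write(f.toPath, data)
    blocks(blockId) = f
  }

  def getBytes(blockId: String): Array[Byte] = {
    val file = blocks(blockId)
    if (file.exists()) {
      Files.readAllBytes(file.toPath)
    } else {
      // Unregister the block instead of retrying the doomed read forever.
      blocks.remove(blockId)
      sys.error(s"Block $blockId not found on disk, though it should be")
    }
  }

  def main(args: Array[String]): Unit = {
    put("rdd_0_0", "cached partition".getBytes)
    blocks("rdd_0_0").delete()  // simulate the lost DiskStore file
    try getBytes("rdd_0_0") catch {
      case e: RuntimeException => println(e.getMessage)
    }
    println(s"still registered: ${blocks.contains("rdd_0_0")}")  // false -> recompute
  }
}
```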
spark git commit: [SPARK-15715][SQL] Fix alter partition with storage information in Hive
Repository: spark Updated Branches: refs/heads/branch-2.0 0c721eedc -> d02f2926b [SPARK-15715][SQL] Fix alter partition with storage information in Hive ## What changes were proposed in this pull request? This command didn't work for Hive tables. Now it does: ``` ALTER TABLE boxes PARTITION (width=3) SET SERDE 'com.sparkbricks.serde.ColumnarSerDe' WITH SERDEPROPERTIES ('compress'='true') ``` ## How was this patch tested? `HiveExternalCatalogSuite` Author: Andrew Or Closes #13453 from andrewor14/alter-partition-storage. (cherry picked from commit d1c1fbc345a704a2c8210960683f33f945660d5a) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d02f2926 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d02f2926 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d02f2926 Branch: refs/heads/branch-2.0 Commit: d02f2926bfbbeb6593cc1deccfb0360ba5b4f0f0 Parents: 0c721ee Author: Andrew Or Authored: Thu Jun 2 17:44:48 2016 -0700 Committer: Andrew Or Committed: Thu Jun 2 17:44:56 2016 -0700 -- .../catalyst/catalog/ExternalCatalogSuite.scala | 10 +++ .../spark/sql/hive/client/HiveClientImpl.scala | 30 ++-- .../spark/sql/hive/client/VersionsSuite.scala | 5 +++- .../spark/sql/hive/execution/HiveDDLSuite.scala | 22 ++ 4 files changed, 57 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d02f2926/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala index 377e64b..0c4d363 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala @@ -382,6 +382,8 @@ abstract class ExternalCatalogSuite extends SparkFunSuite with BeforeAndAfterEac // See HIVE-2742 for more detail. 
catalog.setCurrentDatabase("db2") val newLocation = newUriForDatabase() + val newSerde = "com.sparkbricks.text.EasySerde" + val newSerdeProps = Map("spark" -> "bricks", "compressed" -> "false") // alter but keep spec the same val oldPart1 = catalog.getPartition("db2", "tbl2", part1.spec) val oldPart2 = catalog.getPartition("db2", "tbl2", part2.spec) @@ -394,6 +396,14 @@ abstract class ExternalCatalogSuite extends SparkFunSuite with BeforeAndAfterEac assert(newPart2.storage.locationUri == Some(newLocation)) assert(oldPart1.storage.locationUri != Some(newLocation)) assert(oldPart2.storage.locationUri != Some(newLocation)) + // alter other storage information + catalog.alterPartitions("db2", "tbl2", Seq( +oldPart1.copy(storage = storageFormat.copy(serde = Some(newSerde))), +oldPart2.copy(storage = storageFormat.copy(serdeProperties = newSerdeProps + val newPart1b = catalog.getPartition("db2", "tbl2", part1.spec) + val newPart2b = catalog.getPartition("db2", "tbl2", part2.spec) + assert(newPart1b.storage.serde == Some(newSerde)) + assert(newPart2b.storage.serdeProperties == newSerdeProps) // alter but change spec, should fail because new partition specs do not exist yet val badPart1 = part1.copy(spec = Map("a" -> "v1", "b" -> "v2")) val badPart2 = part2.copy(spec = Map("a" -> "v3", "b" -> "v4")) http://git-wip-us.apache.org/repos/asf/spark/blob/d02f2926/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 47fa418..1c89d8c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -28,6 +28,7 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.{TableType => HiveTableType} import org.apache.hadoop.hive.metastore.api.{Database => HiveDatabase, FieldSchema} +import org.apach
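What the new `ExternalCatalogSuite` assertions verify can be modelled with plain case classes: copy a partition's storage with a new serde and serde properties, alter it, and read it back. A simplified sketch (the classes below are stand-ins, not Spark's catalog types):

```scala
object AlterPartitionSketch {
  case class StorageFormat(serde: Option[String], serdeProperties: Map[String, String])
  case class Partition(spec: Map[String, String], storage: StorageFormat)

  // Toy partition store keyed by partition spec.
  private var partitions = Map(
    Map("width" -> "3") -> Partition(Map("width" -> "3"), StorageFormat(None, Map.empty)))

  def alterPartitions(parts: Seq[Partition]): Unit =
    parts.foreach(p => partitions += (p.spec -> p))

  def getPartition(spec: Map[String, String]): Partition = partitions(spec)

  def main(args: Array[String]): Unit = {
    val newSerde = "com.sparkbricks.serde.ColumnarSerDe"
    val newProps = Map("compress" -> "true")
    val old = getPartition(Map("width" -> "3"))
    // Rough analogue of: ALTER TABLE boxes PARTITION (width=3) SET SERDE '...'
    // WITH SERDEPROPERTIES ('compress'='true')
    alterPartitions(Seq(old.copy(storage = StorageFormat(Some(newSerde), newProps))))
    val updated = getPartition(Map("width" -> "3"))
    assert(updated.storage.serde.contains(newSerde))
    assert(updated.storage.serdeProperties == newProps)
    println("partition storage updated")
  }
}
```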
spark git commit: [SPARK-15715][SQL] Fix alter partition with storage information in Hive
Repository: spark Updated Branches: refs/heads/master e23370ec6 -> d1c1fbc34 [SPARK-15715][SQL] Fix alter partition with storage information in Hive ## What changes were proposed in this pull request? This command didn't work for Hive tables. Now it does: ``` ALTER TABLE boxes PARTITION (width=3) SET SERDE 'com.sparkbricks.serde.ColumnarSerDe' WITH SERDEPROPERTIES ('compress'='true') ``` ## How was this patch tested? `HiveExternalCatalogSuite` Author: Andrew Or Closes #13453 from andrewor14/alter-partition-storage. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d1c1fbc3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d1c1fbc3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d1c1fbc3 Branch: refs/heads/master Commit: d1c1fbc345a704a2c8210960683f33f945660d5a Parents: e23370e Author: Andrew Or Authored: Thu Jun 2 17:44:48 2016 -0700 Committer: Andrew Or Committed: Thu Jun 2 17:44:48 2016 -0700 -- .../catalyst/catalog/ExternalCatalogSuite.scala | 10 +++ .../spark/sql/hive/client/HiveClientImpl.scala | 30 ++-- .../spark/sql/hive/client/VersionsSuite.scala | 5 +++- .../spark/sql/hive/execution/HiveDDLSuite.scala | 22 ++ 4 files changed, 57 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d1c1fbc3/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala index 377e64b..0c4d363 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala @@ -382,6 +382,8 @@ abstract class ExternalCatalogSuite extends SparkFunSuite with BeforeAndAfterEac // See HIVE-2742 for more detail. 
catalog.setCurrentDatabase("db2") val newLocation = newUriForDatabase() + val newSerde = "com.sparkbricks.text.EasySerde" + val newSerdeProps = Map("spark" -> "bricks", "compressed" -> "false") // alter but keep spec the same val oldPart1 = catalog.getPartition("db2", "tbl2", part1.spec) val oldPart2 = catalog.getPartition("db2", "tbl2", part2.spec) @@ -394,6 +396,14 @@ abstract class ExternalCatalogSuite extends SparkFunSuite with BeforeAndAfterEac assert(newPart2.storage.locationUri == Some(newLocation)) assert(oldPart1.storage.locationUri != Some(newLocation)) assert(oldPart2.storage.locationUri != Some(newLocation)) + // alter other storage information + catalog.alterPartitions("db2", "tbl2", Seq( +oldPart1.copy(storage = storageFormat.copy(serde = Some(newSerde))), +oldPart2.copy(storage = storageFormat.copy(serdeProperties = newSerdeProps + val newPart1b = catalog.getPartition("db2", "tbl2", part1.spec) + val newPart2b = catalog.getPartition("db2", "tbl2", part2.spec) + assert(newPart1b.storage.serde == Some(newSerde)) + assert(newPart2b.storage.serdeProperties == newSerdeProps) // alter but change spec, should fail because new partition specs do not exist yet val badPart1 = part1.copy(spec = Map("a" -> "v1", "b" -> "v2")) val badPart2 = part2.copy(spec = Map("a" -> "v3", "b" -> "v4")) http://git-wip-us.apache.org/repos/asf/spark/blob/d1c1fbc3/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 47fa418..1c89d8c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -28,6 +28,7 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.{TableType => HiveTableType} import org.apache.hadoop.hive.metastore.api.{Database => HiveDatabase, FieldSchema} +import org.apache.hadoop.hive.metastore.api.{SerDeInfo, StorageDescriptor} import org.apache.hadoop.hive.ql.Driver i
spark git commit: [SPARK-15740][MLLIB] ignore big model load / save in Word2VecSuite
Repository: spark Updated Branches: refs/heads/branch-2.0 1bce96db5 -> 0c721eedc [SPARK-15740][MLLIB] ignore big model load / save in Word2VecSuite ## What changes were proposed in this pull request? andrewor14 noticed some OOM errors caused by "test big model load / save" in Word2VecSuite, e.g., https://amplab.cs.berkeley.edu/jenkins/view/Spark%20QA%20Test/job/spark-master-test-maven-hadoop-2.2/1168/consoleFull. It doesn't show up in the test result because it was OOMed. This PR disables the test. I will leave the JIRA open for a proper fix ## How was this patch tested? No new features. Author: Xiangrui Meng Closes #13478 from mengxr/SPARK-15740. (cherry picked from commit e23370ec617c527ffa3a1f7d285ee2c4ffc51b77) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0c721eed Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0c721eed Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0c721eed Branch: refs/heads/branch-2.0 Commit: 0c721eedcc8a20092e3867c4eacb1341c4cb1831 Parents: 1bce96d Author: Xiangrui Meng Authored: Thu Jun 2 17:41:31 2016 -0700 Committer: Andrew Or Committed: Thu Jun 2 17:41:39 2016 -0700 -- .../test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0c721eed/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala index 6d69944..c9fb976 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala @@ -91,7 +91,7 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext { } - test("big model load / save") { + ignore("big model load / save") { // create a model bigger than 32MB since 9000 * 1000 * 4 > 2^25 val word2VecMap = Map((0 to 9000).map(i => s"$i" -> Array.fill(1000)(0.1f)): _*) val model = new Word2VecModel(word2VecMap) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
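The change itself is just ScalaTest's `ignore` swapped in for `test`. As a generic illustration (plain ScalaTest, not Spark code), the pattern and the size arithmetic behind the OOM look roughly like this:

```
import org.scalatest.FunSuite

class ExampleSuite extends FunSuite {
  // `ignore` has the same signature as `test`: the body still compiles but is skipped
  // and reported as ignored instead of being run.
  ignore("big model load / save") {
    // ~9000 words x 1000-dimensional float vectors: 9000 * 1000 * 4 bytes ≈ 36 MB,
    // which is above the 32 MB (2^25 bytes) threshold the test is meant to exceed.
    val approxBytes = 9000L * 1000L * 4L
    assert(approxBytes > (1L << 25))
  }
}
```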
spark git commit: [SPARK-15740][MLLIB] ignore big model load / save in Word2VecSuite
Repository: spark Updated Branches: refs/heads/master f34aadc54 -> e23370ec6 [SPARK-15740][MLLIB] ignore big model load / save in Word2VecSuite ## What changes were proposed in this pull request? andrewor14 noticed some OOM errors caused by "test big model load / save" in Word2VecSuite, e.g., https://amplab.cs.berkeley.edu/jenkins/view/Spark%20QA%20Test/job/spark-master-test-maven-hadoop-2.2/1168/consoleFull. It doesn't show up in the test result because it was OOMed. This PR disables the test. I will leave the JIRA open for a proper fix ## How was this patch tested? No new features. Author: Xiangrui Meng Closes #13478 from mengxr/SPARK-15740. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e23370ec Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e23370ec Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e23370ec Branch: refs/heads/master Commit: e23370ec617c527ffa3a1f7d285ee2c4ffc51b77 Parents: f34aadc Author: Xiangrui Meng Authored: Thu Jun 2 17:41:31 2016 -0700 Committer: Andrew Or Committed: Thu Jun 2 17:41:31 2016 -0700 -- .../test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e23370ec/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala index 6d69944..c9fb976 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala @@ -91,7 +91,7 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext { } - test("big model load / save") { + ignore("big model load / save") { // create a model bigger than 32MB since 9000 * 1000 * 4 > 2^25 val word2VecMap = Map((0 to 9000).map(i => s"$i" -> Array.fill(1000)(0.1f)): _*) val model = new Word2VecModel(word2VecMap) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15718][SQL] better error message for writing bucketed data
Repository: spark Updated Branches: refs/heads/master 229f90225 -> f34aadc54 [SPARK-15718][SQL] better error message for writing bucketed data ## What changes were proposed in this pull request? Currently we don't support bucketing for `save` and `insertInto`. For `save`, we just write the data out into a directory users specified, and it's not a table, we don't keep its metadata. When we read it back, we have no idea if the data is bucketed or not, so it doesn't make sense to use `save` to write bucketed data, as we can't use the bucket information anyway. We can support it in the future, once we have features like bucket discovery, or we save bucket information in the data directory too, so that we don't need to rely on a metastore. For `insertInto`, it inserts data into an existing table, so it doesn't make sense to specify bucket information, as we should get the bucket information from the existing table. This PR improves the error message for the above 2 cases. ## How was this patch tested? new test in `BukctedWriteSuite` Author: Wenchen Fan Closes #13452 from cloud-fan/error-msg. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f34aadc5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f34aadc5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f34aadc5 Branch: refs/heads/master Commit: f34aadc54ca1a9fd4236a928d342324b26fb3a12 Parents: 229f902 Author: Wenchen Fan Authored: Thu Jun 2 17:39:56 2016 -0700 Committer: Andrew Or Committed: Thu Jun 2 17:39:56 2016 -0700 -- .../org/apache/spark/sql/DataFrameWriter.scala | 10 +- .../test/DataFrameReaderWriterSuite.scala| 4 ++-- .../spark/sql/sources/BucketedWriteSuite.scala | 19 +++ 3 files changed, 22 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f34aadc5/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 50ae966..1dd8818 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -281,7 +281,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { * @since 1.4.0 */ def save(): Unit = { -assertNotBucketed() +assertNotBucketed("save") assertNotStreaming("save() can only be called on non-continuous queries") val dataSource = DataSource( df.sparkSession, @@ -330,7 +330,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { */ @Experimental def startStream(): ContinuousQuery = { -assertNotBucketed() +assertNotBucketed("startStream") assertStreaming("startStream() can only be called on continuous queries") if (source == "memory") { @@ -430,7 +430,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { } private def insertInto(tableIdent: TableIdentifier): Unit = { -assertNotBucketed() +assertNotBucketed("insertInto") assertNotStreaming("insertInto() can only be called on non-continuous queries") val partitions = normalizedParCols.map(_.map(col => col -> (None: Option[String])).toMap) val overwrite = mode == SaveMode.Overwrite @@ -500,10 +500,10 @@ final class DataFrameWriter private[sql](df: DataFrame) { s"existing columns (${validColumnNames.mkString(", ")})")) } - private def assertNotBucketed(): Unit = { + private def assertNotBucketed(operation: String): Unit = { if (numBuckets.isDefined || sortColumnNames.isDefined) { throw new 
IllegalArgumentException( -"Currently we don't support writing bucketed data to this data source.") +s"'$operation' does not support bucketing right now.") } } http://git-wip-us.apache.org/repos/asf/spark/blob/f34aadc5/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataFrameReaderWriterSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataFrameReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataFrameReaderWriterSuite.scala index a2aac69..431a943 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataFrameReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataFrameReaderWriterSuite.scala @@ -456,7 +456,7 @@ class DataFrameReaderWriterSuite extends StreamTest with BeforeAndAfter { .stream() val w = df.write val e = intercept[IllegalArgumentException](w.bucketBy(1, "text").s
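A minimal sketch of the behavior described above, assuming a local SparkSession; the table name and output path are illustrative:

```
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("BucketedWriteSketch").getOrCreate()
import spark.implicits._

val df = Seq((1, "a"), (2, "b")).toDF("id", "text")

// Rejected: throws IllegalArgumentException("'save' does not support bucketing right now.")
try {
  df.write.bucketBy(2, "id").parquet("/tmp/bucketed-sketch")
} catch {
  case e: IllegalArgumentException => println(e.getMessage)
}

// Supported: the bucketing spec is recorded in the catalog for a managed table,
// so it can be picked up when the table is read back.
df.write.bucketBy(2, "id").sortBy("text").saveAsTable("bucketed_sketch_tbl")
```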
spark git commit: [SPARK-15718][SQL] better error message for writing bucketed data
Repository: spark Updated Branches: refs/heads/branch-2.0 1551a72cb -> 1bce96db5 [SPARK-15718][SQL] better error message for writing bucketed data ## What changes were proposed in this pull request? Currently we don't support bucketing for `save` and `insertInto`. For `save`, we just write the data out into a directory users specified, and it's not a table, we don't keep its metadata. When we read it back, we have no idea if the data is bucketed or not, so it doesn't make sense to use `save` to write bucketed data, as we can't use the bucket information anyway. We can support it in the future, once we have features like bucket discovery, or we save bucket information in the data directory too, so that we don't need to rely on a metastore. For `insertInto`, it inserts data into an existing table, so it doesn't make sense to specify bucket information, as we should get the bucket information from the existing table. This PR improves the error message for the above 2 cases. ## How was this patch tested? new test in `BukctedWriteSuite` Author: Wenchen Fan Closes #13452 from cloud-fan/error-msg. (cherry picked from commit f34aadc54ca1a9fd4236a928d342324b26fb3a12) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1bce96db Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1bce96db Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1bce96db Branch: refs/heads/branch-2.0 Commit: 1bce96db5f366099d09d9083a21e1b34d15fae19 Parents: 1551a72 Author: Wenchen Fan Authored: Thu Jun 2 17:39:56 2016 -0700 Committer: Andrew Or Committed: Thu Jun 2 17:40:06 2016 -0700 -- .../org/apache/spark/sql/DataFrameWriter.scala | 10 +- .../test/DataFrameReaderWriterSuite.scala| 4 ++-- .../spark/sql/sources/BucketedWriteSuite.scala | 19 +++ 3 files changed, 22 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1bce96db/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 50ae966..1dd8818 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -281,7 +281,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { * @since 1.4.0 */ def save(): Unit = { -assertNotBucketed() +assertNotBucketed("save") assertNotStreaming("save() can only be called on non-continuous queries") val dataSource = DataSource( df.sparkSession, @@ -330,7 +330,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { */ @Experimental def startStream(): ContinuousQuery = { -assertNotBucketed() +assertNotBucketed("startStream") assertStreaming("startStream() can only be called on continuous queries") if (source == "memory") { @@ -430,7 +430,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { } private def insertInto(tableIdent: TableIdentifier): Unit = { -assertNotBucketed() +assertNotBucketed("insertInto") assertNotStreaming("insertInto() can only be called on non-continuous queries") val partitions = normalizedParCols.map(_.map(col => col -> (None: Option[String])).toMap) val overwrite = mode == SaveMode.Overwrite @@ -500,10 +500,10 @@ final class DataFrameWriter private[sql](df: DataFrame) { s"existing columns (${validColumnNames.mkString(", ")})")) } - private def assertNotBucketed(): Unit = { + private def 
assertNotBucketed(operation: String): Unit = { if (numBuckets.isDefined || sortColumnNames.isDefined) { throw new IllegalArgumentException( -"Currently we don't support writing bucketed data to this data source.") +s"'$operation' does not support bucketing right now.") } } http://git-wip-us.apache.org/repos/asf/spark/blob/1bce96db/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataFrameReaderWriterSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataFrameReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataFrameReaderWriterSuite.scala index a2aac69..431a943 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataFrameReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataFrameReaderWriterSuite.scala @@ -456,7 +456,7 @@ class DataFrameReaderWriterSuite extends StreamTest with BeforeAndAfter { .
spark git commit: [SPARK-15736][CORE] Gracefully handle loss of DiskStore files
Repository: spark Updated Branches: refs/heads/master 5855e0057 -> 229f90225 [SPARK-15736][CORE] Gracefully handle loss of DiskStore files If an RDD partition is cached on disk and the DiskStore file is lost, then reads of that cached partition will fail and the missing partition is supposed to be recomputed by a new task attempt. In the current BlockManager implementation, however, the missing file does not trigger any metadata updates / does not invalidate the cache, so subsequent task attempts will be scheduled on the same executor and the doomed read will be repeatedly retried, leading to repeated task failures and eventually a total job failure. In order to fix this problem, the executor with the missing file needs to properly mark the corresponding block as missing so that it stops advertising itself as a cache location for that block. This patch fixes this bug and adds an end-to-end regression test (in `FailureSuite`) and a set of unit tests (`in BlockManagerSuite`). Author: Josh Rosen Closes #13473 from JoshRosen/handle-missing-cache-files. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/229f9022 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/229f9022 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/229f9022 Branch: refs/heads/master Commit: 229f90225748343972d7202c5567b45364cd8497 Parents: 5855e00 Author: Josh Rosen Authored: Thu Jun 2 17:36:31 2016 -0700 Committer: Andrew Or Committed: Thu Jun 2 17:36:31 2016 -0700 -- .../org/apache/spark/storage/BlockManager.scala | 20 +++--- .../scala/org/apache/spark/FailureSuite.scala | 12 ++ .../spark/storage/BlockManagerSuite.scala | 40 3 files changed, 66 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/229f9022/core/src/main/scala/org/apache/spark/storage/BlockManager.scala -- diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 2f9473a..83a9cbd 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -403,6 +403,17 @@ private[spark] class BlockManager( } /** + * Cleanup code run in response to a failed local read. + * Must be called while holding a read lock on the block. + */ + private def handleLocalReadFailure(blockId: BlockId): Nothing = { +releaseLock(blockId) +// Remove the missing block so that its unavailability is reported to the driver +removeBlock(blockId) +throw new SparkException(s"Block $blockId was not found even though it's read-locked") + } + + /** * Get block from local block manager as an iterator of Java objects. 
*/ def getLocalValues(blockId: BlockId): Option[BlockResult] = { @@ -441,8 +452,7 @@ private[spark] class BlockManager( val ci = CompletionIterator[Any, Iterator[Any]](iterToReturn, releaseLock(blockId)) Some(new BlockResult(ci, DataReadMethod.Disk, info.size)) } else { - releaseLock(blockId) - throw new SparkException(s"Block $blockId was not found even though it's read-locked") + handleLocalReadFailure(blockId) } } } @@ -489,8 +499,7 @@ private[spark] class BlockManager( // The block was not found on disk, so serialize an in-memory copy: serializerManager.dataSerialize(blockId, memoryStore.getValues(blockId).get) } else { -releaseLock(blockId) -throw new SparkException(s"Block $blockId was not found even though it's read-locked") +handleLocalReadFailure(blockId) } } else { // storage level is serialized if (level.useMemory && memoryStore.contains(blockId)) { @@ -499,8 +508,7 @@ private[spark] class BlockManager( val diskBytes = diskStore.getBytes(blockId) maybeCacheDiskBytesInMemory(info, blockId, level, diskBytes).getOrElse(diskBytes) } else { -releaseLock(blockId) -throw new SparkException(s"Block $blockId was not found even though it's read-locked") +handleLocalReadFailure(blockId) } } } http://git-wip-us.apache.org/repos/asf/spark/blob/229f9022/core/src/test/scala/org/apache/spark/FailureSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/FailureSuite.scala b/core/src/test/scala/org/apache/spark/FailureSuite.scala index 333c23b..132f636 100644 --- a/core/src/test/scala/org/apache/spark/FailureSuite.scala +++ b/core/src/test/scala/org/apache/spark/FailureSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark imp
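For context, a sketch of the caching pattern the fix protects, assuming an existing SparkContext `sc`; it does not reproduce the file loss itself:

```
import org.apache.spark.storage.StorageLevel

val cached = sc.parallelize(1 to 1000000, 8)
  .map(_ * 2)
  .persist(StorageLevel.DISK_ONLY)   // partitions live only in the DiskStore

cached.count()   // materializes the on-disk blocks

// If a DiskStore file later disappears, the patched BlockManager removes the block's
// metadata (so the executor stops advertising it) and a retried task recomputes the
// partition, instead of the read failing repeatedly on the same executor.
cached.count()
```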
spark git commit: [SPARK-15736][CORE] Gracefully handle loss of DiskStore files
Repository: spark Updated Branches: refs/heads/branch-2.0 0802ff9f6 -> 1551a72cb [SPARK-15736][CORE] Gracefully handle loss of DiskStore files If an RDD partition is cached on disk and the DiskStore file is lost, then reads of that cached partition will fail and the missing partition is supposed to be recomputed by a new task attempt. In the current BlockManager implementation, however, the missing file does not trigger any metadata updates / does not invalidate the cache, so subsequent task attempts will be scheduled on the same executor and the doomed read will be repeatedly retried, leading to repeated task failures and eventually a total job failure. In order to fix this problem, the executor with the missing file needs to properly mark the corresponding block as missing so that it stops advertising itself as a cache location for that block. This patch fixes this bug and adds an end-to-end regression test (in `FailureSuite`) and a set of unit tests (`in BlockManagerSuite`). Author: Josh Rosen Closes #13473 from JoshRosen/handle-missing-cache-files. (cherry picked from commit 229f90225748343972d7202c5567b45364cd8497) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1551a72c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1551a72c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1551a72c Branch: refs/heads/branch-2.0 Commit: 1551a72cb7217f6d8b1c30fddcc865a9df545cff Parents: 0802ff9 Author: Josh Rosen Authored: Thu Jun 2 17:36:31 2016 -0700 Committer: Andrew Or Committed: Thu Jun 2 17:36:39 2016 -0700 -- .../org/apache/spark/storage/BlockManager.scala | 20 +++--- .../scala/org/apache/spark/FailureSuite.scala | 12 ++ .../spark/storage/BlockManagerSuite.scala | 40 3 files changed, 66 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1551a72c/core/src/main/scala/org/apache/spark/storage/BlockManager.scala -- diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 2f9473a..83a9cbd 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -403,6 +403,17 @@ private[spark] class BlockManager( } /** + * Cleanup code run in response to a failed local read. + * Must be called while holding a read lock on the block. + */ + private def handleLocalReadFailure(blockId: BlockId): Nothing = { +releaseLock(blockId) +// Remove the missing block so that its unavailability is reported to the driver +removeBlock(blockId) +throw new SparkException(s"Block $blockId was not found even though it's read-locked") + } + + /** * Get block from local block manager as an iterator of Java objects. 
*/ def getLocalValues(blockId: BlockId): Option[BlockResult] = { @@ -441,8 +452,7 @@ private[spark] class BlockManager( val ci = CompletionIterator[Any, Iterator[Any]](iterToReturn, releaseLock(blockId)) Some(new BlockResult(ci, DataReadMethod.Disk, info.size)) } else { - releaseLock(blockId) - throw new SparkException(s"Block $blockId was not found even though it's read-locked") + handleLocalReadFailure(blockId) } } } @@ -489,8 +499,7 @@ private[spark] class BlockManager( // The block was not found on disk, so serialize an in-memory copy: serializerManager.dataSerialize(blockId, memoryStore.getValues(blockId).get) } else { -releaseLock(blockId) -throw new SparkException(s"Block $blockId was not found even though it's read-locked") +handleLocalReadFailure(blockId) } } else { // storage level is serialized if (level.useMemory && memoryStore.contains(blockId)) { @@ -499,8 +508,7 @@ private[spark] class BlockManager( val diskBytes = diskStore.getBytes(blockId) maybeCacheDiskBytesInMemory(info, blockId, level, diskBytes).getOrElse(diskBytes) } else { -releaseLock(blockId) -throw new SparkException(s"Block $blockId was not found even though it's read-locked") +handleLocalReadFailure(blockId) } } } http://git-wip-us.apache.org/repos/asf/spark/blob/1551a72c/core/src/test/scala/org/apache/spark/FailureSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/FailureSuite.scala b/core/src/test/scala/org/apache/spark/FailureSuite.scala index 333c23b..132f636 100644 --- a/core/src/test/scala/org/apache/spark/FailureSuite.scala +++ b
spark git commit: [SPARK-15711][SQL] Ban CREATE TEMPORARY TABLE USING AS SELECT
Repository: spark Updated Branches: refs/heads/branch-2.0 32b025e94 -> a55454eb6 [SPARK-15711][SQL] Ban CREATE TEMPORARY TABLE USING AS SELECT ## What changes were proposed in this pull request? This PR bans syntax like `CREATE TEMPORARY TABLE USING AS SELECT` `CREATE TEMPORARY TABLE ... USING ... AS ...` is not properly implemented, the temporary data is not cleaned up when the session exits. Before a full fix, we probably should ban this syntax. This PR only impact syntax like `CREATE TEMPORARY TABLE ... USING ... AS ...`. Other syntax like `CREATE TEMPORARY TABLE .. USING ...` and `CREATE TABLE ... USING ...` are not impacted. ## How was this patch tested? Unit test. Author: Sean Zhong Closes #13451 from clockfly/ban_create_temp_table_using_as. (cherry picked from commit d109a1beeef5bca1e683247e0a5db4ec841bf3ba) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a55454eb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a55454eb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a55454eb Branch: refs/heads/branch-2.0 Commit: a55454eb6a1d1b785982bacc282753372a8107b9 Parents: 32b025e Author: Sean Zhong Authored: Thu Jun 2 14:11:01 2016 -0700 Committer: Andrew Or Committed: Thu Jun 2 14:11:12 2016 -0700 -- .../org/apache/spark/sql/DataFrameWriter.scala | 1 - .../spark/sql/execution/SparkSqlParser.scala| 9 +- .../spark/sql/execution/SparkStrategies.scala | 10 +- .../spark/sql/execution/datasources/ddl.scala | 32 --- .../sql/sources/CreateTableAsSelectSuite.scala | 265 ++- .../sql/hive/execution/SQLQuerySuite.scala | 46 6 files changed, 142 insertions(+), 221 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a55454eb/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 25678e9..50ae966 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -561,7 +561,6 @@ final class DataFrameWriter private[sql](df: DataFrame) { CreateTableUsingAsSelect( tableIdent, source, -temporary = false, partitioningColumns.map(_.toArray).getOrElse(Array.empty[String]), getBucketSpec, mode, http://git-wip-us.apache.org/repos/asf/spark/blob/a55454eb/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 01409c6..8ffc556 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -317,17 +317,19 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { // Get the backing query. val query = plan(ctx.query) + if (temp) { +throw operationNotAllowed("CREATE TEMPORARY TABLE ... USING ... AS query", ctx) + } + // Determine the storage mode. 
val mode = if (ifNotExists) { SaveMode.Ignore - } else if (temp) { -SaveMode.Overwrite } else { SaveMode.ErrorIfExists } CreateTableUsingAsSelect( -table, provider, temp, partitionColumnNames, bucketSpec, mode, options, query) +table, provider, partitionColumnNames, bucketSpec, mode, options, query) } else { val struct = Option(ctx.colTypeList()).map(createStructType) CreateTableUsing( @@ -960,7 +962,6 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { CreateTableUsingAsSelect( tableIdent = tableDesc.identifier, provider = conf.defaultDataSourceName, -temporary = false, partitionColumns = tableDesc.partitionColumnNames.toArray, bucketSpec = None, mode = mode, http://git-wip-us.apache.org/repos/asf/spark/blob/a55454eb/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 0110663..2e69027 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.sc
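A sketch of the allowed and banned forms, assuming a SparkSession `spark`; the table names and path are illustrative:

```
spark.range(10).createOrReplaceTempView("src")
spark.range(10).write.mode("overwrite").parquet("/tmp/ctas_sketch_data")

// Still allowed: CREATE TEMPORARY TABLE ... USING ... without an AS query.
spark.sql("CREATE TEMPORARY TABLE t2 USING parquet OPTIONS (path '/tmp/ctas_sketch_data')")

// Still allowed: CREATE TABLE ... USING ... AS ... (a persistent data source table).
spark.sql("CREATE TABLE t1 USING parquet AS SELECT id FROM src")

// Banned by this patch: fails at parse time with an "operation not allowed" error.
try {
  spark.sql("CREATE TEMPORARY TABLE t3 USING parquet AS SELECT id FROM src")
} catch {
  case e: Exception => println(e.getMessage)
}
```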
spark git commit: [SPARK-15711][SQL] Ban CREATE TEMPORARY TABLE USING AS SELECT
Repository: spark Updated Branches: refs/heads/master 9aff6f3b1 -> d109a1bee [SPARK-15711][SQL] Ban CREATE TEMPORARY TABLE USING AS SELECT ## What changes were proposed in this pull request? This PR bans syntax like `CREATE TEMPORARY TABLE USING AS SELECT` `CREATE TEMPORARY TABLE ... USING ... AS ...` is not properly implemented, the temporary data is not cleaned up when the session exits. Before a full fix, we probably should ban this syntax. This PR only impact syntax like `CREATE TEMPORARY TABLE ... USING ... AS ...`. Other syntax like `CREATE TEMPORARY TABLE .. USING ...` and `CREATE TABLE ... USING ...` are not impacted. ## How was this patch tested? Unit test. Author: Sean Zhong Closes #13451 from clockfly/ban_create_temp_table_using_as. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d109a1be Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d109a1be Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d109a1be Branch: refs/heads/master Commit: d109a1beeef5bca1e683247e0a5db4ec841bf3ba Parents: 9aff6f3 Author: Sean Zhong Authored: Thu Jun 2 14:11:01 2016 -0700 Committer: Andrew Or Committed: Thu Jun 2 14:11:01 2016 -0700 -- .../org/apache/spark/sql/DataFrameWriter.scala | 1 - .../spark/sql/execution/SparkSqlParser.scala| 9 +- .../spark/sql/execution/SparkStrategies.scala | 10 +- .../spark/sql/execution/datasources/ddl.scala | 32 --- .../sql/sources/CreateTableAsSelectSuite.scala | 265 ++- .../sql/hive/execution/SQLQuerySuite.scala | 46 6 files changed, 142 insertions(+), 221 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d109a1be/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 25678e9..50ae966 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -561,7 +561,6 @@ final class DataFrameWriter private[sql](df: DataFrame) { CreateTableUsingAsSelect( tableIdent, source, -temporary = false, partitioningColumns.map(_.toArray).getOrElse(Array.empty[String]), getBucketSpec, mode, http://git-wip-us.apache.org/repos/asf/spark/blob/d109a1be/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 01409c6..8ffc556 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -317,17 +317,19 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { // Get the backing query. val query = plan(ctx.query) + if (temp) { +throw operationNotAllowed("CREATE TEMPORARY TABLE ... USING ... AS query", ctx) + } + // Determine the storage mode. 
val mode = if (ifNotExists) { SaveMode.Ignore - } else if (temp) { -SaveMode.Overwrite } else { SaveMode.ErrorIfExists } CreateTableUsingAsSelect( -table, provider, temp, partitionColumnNames, bucketSpec, mode, options, query) +table, provider, partitionColumnNames, bucketSpec, mode, options, query) } else { val struct = Option(ctx.colTypeList()).map(createStructType) CreateTableUsing( @@ -960,7 +962,6 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { CreateTableUsingAsSelect( tableIdent = tableDesc.identifier, provider = conf.defaultDataSourceName, -temporary = false, partitionColumns = tableDesc.partitionColumnNames.toArray, bucketSpec = None, mode = mode, http://git-wip-us.apache.org/repos/asf/spark/blob/d109a1be/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 9610506..b20897e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -397,15 +397,
spark git commit: [SPARK-15646][SQL] When spark.sql.hive.convertCTAS is true, the conversion rule needs to respect TEXTFILE/SEQUENCEFILE format and the user-defined location
Repository: spark Updated Branches: refs/heads/branch-2.0 35195f6ce -> 5a835b99f [SPARK-15646][SQL] When spark.sql.hive.convertCTAS is true, the conversion rule needs to respect TEXTFILE/SEQUENCEFILE format and the user-defined location ## What changes were proposed in this pull request? When `spark.sql.hive.convertCTAS` is true, for a CTAS statement, we will create a data source table using the default source (i.e. parquet) if the CTAS does not specify any Hive storage format. However, there are two issues with this conversion logic. 1. First, we determine if a CTAS statement defines storage format by checking the serde. However, TEXTFILE/SEQUENCEFILE does not have a default serde. When we do the check, we have not set the default serde. So, a query like `CREATE TABLE abc STORED AS TEXTFILE AS SELECT ...` actually creates a data source parquet table. 2. In the conversion logic, we are ignoring the user-specified location. This PR fixes the above two issues. Also, this PR makes the parser throws an exception when a CTAS statement has a PARTITIONED BY clause. This change is made because Hive's syntax does not allow it and our current implementation actually does not work for this case (the insert operation always throws an exception because the insertion does not pick up the partitioning info). ## How was this patch tested? I am adding new tests in SQLQuerySuite and HiveDDLCommandSuite. Author: Yin Huai Closes #13386 from yhuai/SPARK-14507. (cherry picked from commit 6dddb70c387ed1f002d2602b2b1f919ef021de91) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5a835b99 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5a835b99 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5a835b99 Branch: refs/heads/branch-2.0 Commit: 5a835b99f9852b0c2a35f9c75a51d493474994ea Parents: 35195f6 Author: Yin Huai Authored: Wed Jun 1 17:55:37 2016 -0700 Committer: Andrew Or Committed: Wed Jun 1 17:55:49 2016 -0700 -- .../spark/sql/execution/SparkSqlParser.scala| 37 - .../spark/sql/execution/command/tables.scala| 2 +- .../org/apache/spark/sql/internal/SQLConf.scala | 10 ++ .../spark/sql/hive/HiveMetastoreCatalog.scala | 57 ++-- .../spark/sql/hive/HiveSessionState.scala | 16 --- .../org/apache/spark/sql/hive/HiveUtils.scala | 6 - .../CreateHiveTableAsSelectCommand.scala| 102 ++ .../execution/CreateTableAsSelectCommand.scala | 101 -- .../spark/sql/hive/HiveDDLCommandSuite.scala| 25 ++-- .../sql/hive/execution/HiveExplainSuite.scala | 6 +- .../sql/hive/execution/SQLQuerySuite.scala | 135 ++- 11 files changed, 273 insertions(+), 224 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5a835b99/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 6c19bf0..01409c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -839,7 +839,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { /** * Create a table, returning either a [[CreateTableCommand]] or a - * [[CreateTableAsSelectLogicalPlan]]. + * [[CreateHiveTableAsSelectLogicalPlan]]. * * This is not used to create datasource tables, which is handled through * "CREATE TABLE ... USING ...". 
@@ -936,7 +936,40 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { comment = comment) selectQuery match { - case Some(q) => CreateTableAsSelectLogicalPlan(tableDesc, q, ifNotExists) + case Some(q) => +// Hive does not allow to use a CTAS statement to create a partitioned table. +if (tableDesc.partitionColumnNames.nonEmpty) { + val errorMessage = "A Create Table As Select (CTAS) statement is not allowed to " + +"create a partitioned table using Hive's file formats. " + +"Please use the syntax of \"CREATE TABLE tableName USING dataSource " + +"OPTIONS (...) PARTITIONED BY ...\" to create a partitioned table through a " + +"CTAS statement." + throw operationNotAllowed(errorMessage, ctx) +} + +val hasStorageProperties = (ctx.createFileFormat != null) || (ctx.rowFormat != null) +if (conf.convertCTAS && !hasStorageProperties) { + val mode = if (ifNotExists) SaveMode.Ignore else SaveMode.ErrorIfExists + // At here, both rowStorage.ser
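A sketch of the cases the fix distinguishes, assuming a Hive-enabled SparkSession `spark`; the temp view and table names are illustrative:

```
spark.range(10).createOrReplaceTempView("src")
spark.sql("SET spark.sql.hive.convertCTAS=true")

// No storage format given: still converted to the default data source (parquet).
spark.sql("CREATE TABLE ctas_default AS SELECT id FROM src")

// Explicit TEXTFILE: after this fix the CTAS is NOT converted, so the result is a
// Hive text table rather than a silently-created parquet table.
spark.sql("CREATE TABLE ctas_text STORED AS TEXTFILE AS SELECT id FROM src")

// CTAS with PARTITIONED BY and Hive file formats is now rejected at parse time.
try {
  spark.sql(
    "CREATE TABLE ctas_part PARTITIONED BY (part STRING) STORED AS TEXTFILE AS SELECT id FROM src")
} catch {
  case e: Exception => println(e.getMessage)
}
```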
spark git commit: [SPARK-15646][SQL] When spark.sql.hive.convertCTAS is true, the conversion rule needs to respect TEXTFILE/SEQUENCEFILE format and the user-defined location
Repository: spark Updated Branches: refs/heads/master c8fb776d4 -> 6dddb70c3 [SPARK-15646][SQL] When spark.sql.hive.convertCTAS is true, the conversion rule needs to respect TEXTFILE/SEQUENCEFILE format and the user-defined location ## What changes were proposed in this pull request? When `spark.sql.hive.convertCTAS` is true, for a CTAS statement, we will create a data source table using the default source (i.e. parquet) if the CTAS does not specify any Hive storage format. However, there are two issues with this conversion logic. 1. First, we determine if a CTAS statement defines storage format by checking the serde. However, TEXTFILE/SEQUENCEFILE does not have a default serde. When we do the check, we have not set the default serde. So, a query like `CREATE TABLE abc STORED AS TEXTFILE AS SELECT ...` actually creates a data source parquet table. 2. In the conversion logic, we are ignoring the user-specified location. This PR fixes the above two issues. Also, this PR makes the parser throws an exception when a CTAS statement has a PARTITIONED BY clause. This change is made because Hive's syntax does not allow it and our current implementation actually does not work for this case (the insert operation always throws an exception because the insertion does not pick up the partitioning info). ## How was this patch tested? I am adding new tests in SQLQuerySuite and HiveDDLCommandSuite. Author: Yin Huai Closes #13386 from yhuai/SPARK-14507. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6dddb70c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6dddb70c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6dddb70c Branch: refs/heads/master Commit: 6dddb70c387ed1f002d2602b2b1f919ef021de91 Parents: c8fb776 Author: Yin Huai Authored: Wed Jun 1 17:55:37 2016 -0700 Committer: Andrew Or Committed: Wed Jun 1 17:55:37 2016 -0700 -- .../spark/sql/execution/SparkSqlParser.scala| 37 - .../spark/sql/execution/command/tables.scala| 2 +- .../org/apache/spark/sql/internal/SQLConf.scala | 10 ++ .../spark/sql/hive/HiveMetastoreCatalog.scala | 57 ++-- .../spark/sql/hive/HiveSessionState.scala | 16 --- .../org/apache/spark/sql/hive/HiveUtils.scala | 6 - .../CreateHiveTableAsSelectCommand.scala| 102 ++ .../execution/CreateTableAsSelectCommand.scala | 101 -- .../spark/sql/hive/HiveDDLCommandSuite.scala| 25 ++-- .../sql/hive/execution/HiveExplainSuite.scala | 6 +- .../sql/hive/execution/SQLQuerySuite.scala | 135 ++- 11 files changed, 273 insertions(+), 224 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6dddb70c/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 6c19bf0..01409c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -839,7 +839,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { /** * Create a table, returning either a [[CreateTableCommand]] or a - * [[CreateTableAsSelectLogicalPlan]]. + * [[CreateHiveTableAsSelectLogicalPlan]]. * * This is not used to create datasource tables, which is handled through * "CREATE TABLE ... USING ...". 
@@ -936,7 +936,40 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { comment = comment) selectQuery match { - case Some(q) => CreateTableAsSelectLogicalPlan(tableDesc, q, ifNotExists) + case Some(q) => +// Hive does not allow to use a CTAS statement to create a partitioned table. +if (tableDesc.partitionColumnNames.nonEmpty) { + val errorMessage = "A Create Table As Select (CTAS) statement is not allowed to " + +"create a partitioned table using Hive's file formats. " + +"Please use the syntax of \"CREATE TABLE tableName USING dataSource " + +"OPTIONS (...) PARTITIONED BY ...\" to create a partitioned table through a " + +"CTAS statement." + throw operationNotAllowed(errorMessage, ctx) +} + +val hasStorageProperties = (ctx.createFileFormat != null) || (ctx.rowFormat != null) +if (conf.convertCTAS && !hasStorageProperties) { + val mode = if (ifNotExists) SaveMode.Ignore else SaveMode.ErrorIfExists + // At here, both rowStorage.serdeProperties and fileStorage.serdeProperties + // are empty Maps. + val optionsWithPa
spark git commit: [HOTFIX] DDLSuite was broken by 93e9714
Repository: spark Updated Branches: refs/heads/master ac38bdc75 -> 1dd925644 [HOTFIX] DDLSuite was broken by 93e9714 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1dd92564 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1dd92564 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1dd92564 Branch: refs/heads/master Commit: 1dd925644138c4a822328d4c6c51ba3ebd99c524 Parents: ac38bdc Author: Andrew Or Authored: Tue May 31 20:06:08 2016 -0700 Committer: Andrew Or Committed: Tue May 31 20:06:08 2016 -0700 -- .../org/apache/spark/sql/execution/command/DDLSuite.scala| 8 1 file changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1dd92564/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 5d45cfb..dd1f598 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -1179,11 +1179,11 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { var message = intercept[AnalysisException] { sql(s"INSERT OVERWRITE TABLE $tabName SELECT 1, 'a'") }.getMessage - assert(message.contains("Please enable Hive support when inserting the regular tables")) + assert(message.contains("Hive support is required to insert into the following tables")) message = intercept[AnalysisException] { sql(s"SELECT * FROM $tabName") }.getMessage - assert(message.contains("Please enable Hive support when selecting the regular tables")) + assert(message.contains("Hive support is required to select over the following tables")) } } @@ -1205,11 +1205,11 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { var message = intercept[AnalysisException] { sql(s"INSERT OVERWRITE TABLE $tabName SELECT 1, 'a'") }.getMessage -assert(message.contains("Please enable Hive support when inserting the regular tables")) +assert(message.contains("Hive support is required to insert into the following tables")) message = intercept[AnalysisException] { sql(s"SELECT * FROM $tabName") }.getMessage -assert(message.contains("Please enable Hive support when selecting the regular tables")) +assert(message.contains("Hive support is required to select over the following tables")) } } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [HOTFIX] DDLSuite was broken by 93e9714
Repository: spark Updated Branches: refs/heads/branch-2.0 d34c0fc10 -> 0ade44cc4 [HOTFIX] DDLSuite was broken by 93e9714 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0ade44cc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0ade44cc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0ade44cc Branch: refs/heads/branch-2.0 Commit: 0ade44cc45ca01899e81cc2bc377cc5fa63e914b Parents: d34c0fc Author: Andrew Or Authored: Tue May 31 20:06:08 2016 -0700 Committer: Andrew Or Committed: Tue May 31 20:06:55 2016 -0700 -- .../org/apache/spark/sql/execution/command/DDLSuite.scala| 8 1 file changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0ade44cc/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 5d45cfb..dd1f598 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -1179,11 +1179,11 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { var message = intercept[AnalysisException] { sql(s"INSERT OVERWRITE TABLE $tabName SELECT 1, 'a'") }.getMessage - assert(message.contains("Please enable Hive support when inserting the regular tables")) + assert(message.contains("Hive support is required to insert into the following tables")) message = intercept[AnalysisException] { sql(s"SELECT * FROM $tabName") }.getMessage - assert(message.contains("Please enable Hive support when selecting the regular tables")) + assert(message.contains("Hive support is required to select over the following tables")) } } @@ -1205,11 +1205,11 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { var message = intercept[AnalysisException] { sql(s"INSERT OVERWRITE TABLE $tabName SELECT 1, 'a'") }.getMessage -assert(message.contains("Please enable Hive support when inserting the regular tables")) +assert(message.contains("Hive support is required to insert into the following tables")) message = intercept[AnalysisException] { sql(s"SELECT * FROM $tabName") }.getMessage -assert(message.contains("Please enable Hive support when selecting the regular tables")) +assert(message.contains("Hive support is required to select over the following tables")) } } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15236][SQL][SPARK SHELL] Add spark-defaults property to switch to use InMemoryCatalog
Repository: spark Updated Branches: refs/heads/branch-2.0 459fd34a0 -> db96f398b [SPARK-15236][SQL][SPARK SHELL] Add spark-defaults property to switch to use InMemoryCatalog ## What changes were proposed in this pull request? This PR change REPL/Main to check this property `spark.sql.catalogImplementation` to decide if `enableHiveSupport `should be called. If `spark.sql.catalogImplementation` is set to `hive`, and hive classes are built, Spark will use Hive support. Other wise, Spark will create a SparkSession with in-memory catalog support. ## How was this patch tested? Run the REPL component test. Author: xin Wu Author: Xin Wu Closes #13088 from xwu0226/SPARK-15236. (cherry picked from commit 04f925ede851fc77add9ef1cacb79fb3a617f650) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/db96f398 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/db96f398 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/db96f398 Branch: refs/heads/branch-2.0 Commit: db96f398be338a937ef369515615f7fe7bb439a5 Parents: 459fd34 Author: xin Wu Authored: Tue May 31 17:42:47 2016 -0700 Committer: Andrew Or Committed: Tue May 31 17:42:57 2016 -0700 -- .../main/scala/org/apache/spark/repl/Main.scala | 20 ++-- .../scala/org/apache/spark/repl/ReplSuite.scala | 50 +++- 2 files changed, 66 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/db96f398/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala -- diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala index 005edda..771670f 100644 --- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala +++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala @@ -22,6 +22,7 @@ import java.io.File import scala.tools.nsc.GenericRunnerSettings import org.apache.spark._ +import org.apache.spark.internal.config.CATALOG_IMPLEMENTATION import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.util.Utils @@ -88,10 +89,23 @@ object Main extends Logging { } val builder = SparkSession.builder.config(conf) -if (SparkSession.hiveClassesArePresent) { - sparkSession = builder.enableHiveSupport().getOrCreate() - logInfo("Created Spark session with Hive support") +if (conf.get(CATALOG_IMPLEMENTATION.key, "hive").toLowerCase == "hive") { + if (SparkSession.hiveClassesArePresent) { +// In the case that the property is not set at all, builder's config +// does not have this value set to 'hive' yet. The original default +// behavior is that when there are hive classes, we use hive catalog. +sparkSession = builder.enableHiveSupport().getOrCreate() +logInfo("Created Spark session with Hive support") + } else { +// Need to change it back to 'in-memory' if no hive classes are found +// in the case that the property is set to hive in spark-defaults.conf +builder.config(CATALOG_IMPLEMENTATION.key, "in-memory") +sparkSession = builder.getOrCreate() +logInfo("Created Spark session") + } } else { + // In the case that the property is set but not to 'hive', the internal + // default is 'in-memory'. So the sparkSession will use in-memory catalog. 
sparkSession = builder.getOrCreate() logInfo("Created Spark session") } http://git-wip-us.apache.org/repos/asf/spark/blob/db96f398/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala -- diff --git a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala index af82e7a..1256860 100644 --- a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala +++ b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala @@ -21,9 +21,11 @@ import java.io._ import java.net.URLClassLoader import scala.collection.mutable.ArrayBuffer - import org.apache.commons.lang3.StringEscapeUtils +import org.apache.log4j.{Level, LogManager} import org.apache.spark.{SparkContext, SparkFunSuite} +import org.apache.spark.internal.config._ +import org.apache.spark.sql.SparkSession import org.apache.spark.util.Utils class ReplSuite extends SparkFunSuite { @@ -99,6 +101,52 @@ class ReplSuite extends SparkFunSuite { System.clearProperty("spark.driver.port") } + test("SPARK-15236: use Hive catalog") { +// turn on the INFO log so that it is possible the code will dump INFO +// entry for usi
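For applications (as opposed to the REPL itself), a sketch of selecting the in-memory catalog through the same property; for spark-shell the key would instead go into spark-defaults.conf or a --conf flag. The values here are illustrative:

```
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("InMemoryCatalogSketch")
  .config("spark.sql.catalogImplementation", "in-memory")
  .getOrCreate()

// With the in-memory catalog, no Hive metastore is contacted and no metastore_db
// directory is created.
println(spark.conf.get("spark.sql.catalogImplementation", "in-memory"))
```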
spark git commit: [SPARK-15236][SQL][SPARK SHELL] Add spark-defaults property to switch to use InMemoryCatalog
Repository: spark Updated Branches: refs/heads/master 85d6b0db9 -> 04f925ede [SPARK-15236][SQL][SPARK SHELL] Add spark-defaults property to switch to use InMemoryCatalog ## What changes were proposed in this pull request? This PR change REPL/Main to check this property `spark.sql.catalogImplementation` to decide if `enableHiveSupport `should be called. If `spark.sql.catalogImplementation` is set to `hive`, and hive classes are built, Spark will use Hive support. Other wise, Spark will create a SparkSession with in-memory catalog support. ## How was this patch tested? Run the REPL component test. Author: xin Wu Author: Xin Wu Closes #13088 from xwu0226/SPARK-15236. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/04f925ed Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/04f925ed Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/04f925ed Branch: refs/heads/master Commit: 04f925ede851fc77add9ef1cacb79fb3a617f650 Parents: 85d6b0d Author: xin Wu Authored: Tue May 31 17:42:47 2016 -0700 Committer: Andrew Or Committed: Tue May 31 17:42:47 2016 -0700 -- .../main/scala/org/apache/spark/repl/Main.scala | 20 ++-- .../scala/org/apache/spark/repl/ReplSuite.scala | 50 +++- 2 files changed, 66 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/04f925ed/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala -- diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala index 005edda..771670f 100644 --- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala +++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala @@ -22,6 +22,7 @@ import java.io.File import scala.tools.nsc.GenericRunnerSettings import org.apache.spark._ +import org.apache.spark.internal.config.CATALOG_IMPLEMENTATION import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.util.Utils @@ -88,10 +89,23 @@ object Main extends Logging { } val builder = SparkSession.builder.config(conf) -if (SparkSession.hiveClassesArePresent) { - sparkSession = builder.enableHiveSupport().getOrCreate() - logInfo("Created Spark session with Hive support") +if (conf.get(CATALOG_IMPLEMENTATION.key, "hive").toLowerCase == "hive") { + if (SparkSession.hiveClassesArePresent) { +// In the case that the property is not set at all, builder's config +// does not have this value set to 'hive' yet. The original default +// behavior is that when there are hive classes, we use hive catalog. +sparkSession = builder.enableHiveSupport().getOrCreate() +logInfo("Created Spark session with Hive support") + } else { +// Need to change it back to 'in-memory' if no hive classes are found +// in the case that the property is set to hive in spark-defaults.conf +builder.config(CATALOG_IMPLEMENTATION.key, "in-memory") +sparkSession = builder.getOrCreate() +logInfo("Created Spark session") + } } else { + // In the case that the property is set but not to 'hive', the internal + // default is 'in-memory'. So the sparkSession will use in-memory catalog. 
sparkSession = builder.getOrCreate() logInfo("Created Spark session") } http://git-wip-us.apache.org/repos/asf/spark/blob/04f925ed/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala -- diff --git a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala index af82e7a..1256860 100644 --- a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala +++ b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala @@ -21,9 +21,11 @@ import java.io._ import java.net.URLClassLoader import scala.collection.mutable.ArrayBuffer - import org.apache.commons.lang3.StringEscapeUtils +import org.apache.log4j.{Level, LogManager} import org.apache.spark.{SparkContext, SparkFunSuite} +import org.apache.spark.internal.config._ +import org.apache.spark.sql.SparkSession import org.apache.spark.util.Utils class ReplSuite extends SparkFunSuite { @@ -99,6 +101,52 @@ class ReplSuite extends SparkFunSuite { System.clearProperty("spark.driver.port") } + test("SPARK-15236: use Hive catalog") { +// turn on the INFO log so that it is possible the code will dump INFO +// entry for using "HiveMetastore" +val rootLogger = LogManager.getRootLogger() +val logLevel = rootLogger.getLe
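For readers who want to drive the same switch from configuration, here is a minimal sketch of the decision the patched REPL Main makes, written as user code; the property name and values come from the patch above, while the app name and the use of SparkConf defaults are illustrative assumptions.

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

// Picks up spark.* settings, e.g. spark.sql.catalogImplementation from spark-defaults.conf
val conf = new SparkConf().setAppName("catalog-demo")
val builder = SparkSession.builder().config(conf)
val spark =
  if (conf.get("spark.sql.catalogImplementation", "hive").toLowerCase == "hive") {
    builder.enableHiveSupport().getOrCreate()   // Hive catalog; needs Hive classes on the classpath
  } else {
    builder.getOrCreate()                       // in-memory catalog
  }

Setting spark.sql.catalogImplementation to in-memory in spark-defaults.conf (or via --conf) then yields a session without Hive support even when Hive classes are present.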
spark git commit: [SPARK-15618][SQL][MLLIB] Use SparkSession.builder.sparkContext if applicable.
Repository: spark Updated Branches: refs/heads/branch-2.0 ac4cb1718 -> 459fd34a0 [SPARK-15618][SQL][MLLIB] Use SparkSession.builder.sparkContext if applicable. This PR changes function `SparkSession.builder.sparkContext(..)` from **private[sql]** into **private[spark]**, and uses it if applicable like the followings. ``` - val spark = SparkSession.builder().config(sc.getConf).getOrCreate() + val spark = SparkSession.builder().sparkContext(sc).getOrCreate() ``` Pass the existing Jenkins tests. Author: Dongjoon Hyun Closes #13365 from dongjoon-hyun/SPARK-15618. (cherry picked from commit 85d6b0db9f5bd425c36482ffcb1c3b9fd0fcdb31) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/459fd34a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/459fd34a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/459fd34a Branch: refs/heads/branch-2.0 Commit: 459fd34a0b011589d924e318925c97657e71930d Parents: ac4cb17 Author: Dongjoon Hyun Authored: Tue May 31 17:40:44 2016 -0700 Committer: Andrew Or Committed: Tue May 31 17:41:28 2016 -0700 -- .../apache/spark/examples/BroadcastTest.scala | 7 +--- .../spark/examples/mllib/LDAExample.scala | 1 + .../spark/examples/sql/hive/HiveFromSpark.scala | 5 +-- .../spark/mllib/api/python/PythonMLLibAPI.scala | 6 +-- .../classification/LogisticRegression.scala | 2 +- .../spark/mllib/classification/NaiveBayes.scala | 8 ++-- .../impl/GLMClassificationModel.scala | 4 +- .../mllib/clustering/BisectingKMeansModel.scala | 4 +- .../mllib/clustering/GaussianMixtureModel.scala | 4 +- .../spark/mllib/clustering/KMeansModel.scala| 4 +- .../spark/mllib/clustering/LDAModel.scala | 8 ++-- .../clustering/PowerIterationClustering.scala | 4 +- .../spark/mllib/feature/ChiSqSelector.scala | 4 +- .../apache/spark/mllib/feature/Word2Vec.scala | 4 +- .../org/apache/spark/mllib/fpm/FPGrowth.scala | 4 +- .../org/apache/spark/mllib/fpm/PrefixSpan.scala | 4 +- .../MatrixFactorizationModel.scala | 4 +- .../mllib/regression/IsotonicRegression.scala | 4 +- .../regression/impl/GLMRegressionModel.scala| 4 +- .../mllib/tree/model/DecisionTreeModel.scala| 4 +- .../mllib/tree/model/treeEnsembleModels.scala | 4 +- .../spark/ml/feature/ChiSqSelectorSuite.scala | 8 +--- .../ml/feature/QuantileDiscretizerSuite.scala | 6 +-- .../spark/ml/recommendation/ALSSuite.scala | 3 +- .../apache/spark/ml/tree/impl/TreeTests.scala | 3 +- .../org/apache/spark/sql/SparkSession.scala | 2 +- .../execution/joins/BroadcastJoinSuite.scala| 39 ++-- .../spark/sql/hive/HiveSparkSubmitSuite.scala | 13 +++ 28 files changed, 78 insertions(+), 89 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/459fd34a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala index c50f25d..a68fd02 100644 --- a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala @@ -29,13 +29,10 @@ object BroadcastTest { val blockSize = if (args.length > 2) args(2) else "4096" -val sparkConf = new SparkConf() - .set("spark.broadcast.blockSize", blockSize) - val spark = SparkSession - .builder - .config(sparkConf) + .builder() .appName("Broadcast Test") + .config("spark.broadcast.blockSize", blockSize) .getOrCreate() val sc = spark.sparkContext 
http://git-wip-us.apache.org/repos/asf/spark/blob/459fd34a/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala index 7651aad..3fbf8e0 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala @@ -191,6 +191,7 @@ object LDAExample { val spark = SparkSession .builder + .sparkContext(sc) .getOrCreate() import spark.implicits._ http://git-wip-us.apache.org/repos/asf/spark/blob/459fd34a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala b/examples/s
spark git commit: [SPARK-15618][SQL][MLLIB] Use SparkSession.builder.sparkContext if applicable.
Repository: spark Updated Branches: refs/heads/master 93e97147e -> 85d6b0db9 [SPARK-15618][SQL][MLLIB] Use SparkSession.builder.sparkContext if applicable. ## What changes were proposed in this pull request? This PR changes function `SparkSession.builder.sparkContext(..)` from **private[sql]** into **private[spark]**, and uses it if applicable like the followings. ``` - val spark = SparkSession.builder().config(sc.getConf).getOrCreate() + val spark = SparkSession.builder().sparkContext(sc).getOrCreate() ``` ## How was this patch tested? Pass the existing Jenkins tests. Author: Dongjoon Hyun Closes #13365 from dongjoon-hyun/SPARK-15618. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/85d6b0db Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/85d6b0db Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/85d6b0db Branch: refs/heads/master Commit: 85d6b0db9f5bd425c36482ffcb1c3b9fd0fcdb31 Parents: 93e9714 Author: Dongjoon Hyun Authored: Tue May 31 17:40:44 2016 -0700 Committer: Andrew Or Committed: Tue May 31 17:40:44 2016 -0700 -- .../apache/spark/examples/BroadcastTest.scala | 7 +--- .../spark/examples/mllib/LDAExample.scala | 1 + .../spark/examples/sql/hive/HiveFromSpark.scala | 5 +-- .../spark/mllib/api/python/PythonMLLibAPI.scala | 6 +-- .../classification/LogisticRegression.scala | 2 +- .../spark/mllib/classification/NaiveBayes.scala | 8 ++-- .../impl/GLMClassificationModel.scala | 4 +- .../mllib/clustering/BisectingKMeansModel.scala | 4 +- .../mllib/clustering/GaussianMixtureModel.scala | 4 +- .../spark/mllib/clustering/KMeansModel.scala| 4 +- .../spark/mllib/clustering/LDAModel.scala | 8 ++-- .../clustering/PowerIterationClustering.scala | 4 +- .../spark/mllib/feature/ChiSqSelector.scala | 4 +- .../apache/spark/mllib/feature/Word2Vec.scala | 4 +- .../org/apache/spark/mllib/fpm/FPGrowth.scala | 4 +- .../org/apache/spark/mllib/fpm/PrefixSpan.scala | 4 +- .../MatrixFactorizationModel.scala | 4 +- .../mllib/regression/IsotonicRegression.scala | 4 +- .../regression/impl/GLMRegressionModel.scala| 4 +- .../mllib/tree/model/DecisionTreeModel.scala| 4 +- .../mllib/tree/model/treeEnsembleModels.scala | 4 +- .../spark/ml/feature/ChiSqSelectorSuite.scala | 8 +--- .../ml/feature/QuantileDiscretizerSuite.scala | 6 +-- .../spark/ml/recommendation/ALSSuite.scala | 3 +- .../apache/spark/ml/tree/impl/TreeTests.scala | 3 +- .../org/apache/spark/sql/SparkSession.scala | 2 +- .../execution/joins/BroadcastJoinSuite.scala| 41 ++-- .../spark/sql/hive/HiveSparkSubmitSuite.scala | 13 +++ 28 files changed, 79 insertions(+), 90 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/85d6b0db/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala index c50f25d..a68fd02 100644 --- a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala @@ -29,13 +29,10 @@ object BroadcastTest { val blockSize = if (args.length > 2) args(2) else "4096" -val sparkConf = new SparkConf() - .set("spark.broadcast.blockSize", blockSize) - val spark = SparkSession - .builder - .config(sparkConf) + .builder() .appName("Broadcast Test") + .config("spark.broadcast.blockSize", blockSize) .getOrCreate() val sc = spark.sparkContext 
http://git-wip-us.apache.org/repos/asf/spark/blob/85d6b0db/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala index 7651aad..3fbf8e0 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala @@ -191,6 +191,7 @@ object LDAExample { val spark = SparkSession .builder + .sparkContext(sc) .getOrCreate() import spark.implicits._ http://git-wip-us.apache.org/repos/asf/spark/blob/85d6b0db/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala b/examples/src/main/scala/org/ap
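A compact illustration of the change this PR applies across the examples and MLlib follows. Note that builder.sparkContext(...) is private[spark], so the sketch only compiles for code living under the org.apache.spark package (as the shipped examples do); the object name, app name, and master are illustrative assumptions.

package org.apache.spark.examples   // inside the spark package, so the private[spark] builder method is visible

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

object SparkContextReuseSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("reuse-demo").setMaster("local[2]"))
    // Before this change: rebuild the session from the context's conf
    // val spark = SparkSession.builder().config(sc.getConf).getOrCreate()
    // After: hand the existing SparkContext to the builder directly
    val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
    spark.range(10).show()
    spark.stop()
  }
}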
spark git commit: [MINOR] Slightly better error message when attempting to query hive tables w/in-mem catalog
Repository: spark Updated Branches: refs/heads/branch-2.0 4b19c9776 -> ac4cb1718 [MINOR] Slightly better error message when attempting to query hive tables w/in-mem catalog andrewor14 Author: Eric Liang Closes #13427 from ericl/better-error-msg. (cherry picked from commit 93e97147eb499dde1e54e07ba113eebcbe25508a) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ac4cb171 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ac4cb171 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ac4cb171 Branch: refs/heads/branch-2.0 Commit: ac4cb17182615dba56e7fb5978150e3eb69a113a Parents: 4b19c97 Author: Eric Liang Authored: Tue May 31 17:39:03 2016 -0700 Committer: Andrew Or Committed: Tue May 31 17:39:13 2016 -0700 -- .../org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ac4cb171/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index c5f221d..7b451ba 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -309,14 +309,14 @@ trait CheckAnalysis extends PredicateHelper { case s: SimpleCatalogRelation => failAnalysis( s""" - |Please enable Hive support when selecting the regular tables: + |Hive support is required to select over the following tables: |${s.catalogTable.identifier} """.stripMargin) case InsertIntoTable(s: SimpleCatalogRelation, _, _, _, _) => failAnalysis( s""" - |Please enable Hive support when inserting the regular tables: + |Hive support is required to insert into the following tables: |${s.catalogTable.identifier} """.stripMargin) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR] Slightly better error message when attempting to query hive tables w/in-mem catalog
Repository: spark Updated Branches: refs/heads/master 196a0d827 -> 93e97147e [MINOR] Slightly better error message when attempting to query hive tables w/in-mem catalog andrewor14 Author: Eric Liang Closes #13427 from ericl/better-error-msg. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/93e97147 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/93e97147 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/93e97147 Branch: refs/heads/master Commit: 93e97147eb499dde1e54e07ba113eebcbe25508a Parents: 196a0d8 Author: Eric Liang Authored: Tue May 31 17:39:03 2016 -0700 Committer: Andrew Or Committed: Tue May 31 17:39:03 2016 -0700 -- .../org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/93e97147/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index c5f221d..7b451ba 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -309,14 +309,14 @@ trait CheckAnalysis extends PredicateHelper { case s: SimpleCatalogRelation => failAnalysis( s""" - |Please enable Hive support when selecting the regular tables: + |Hive support is required to select over the following tables: |${s.catalogTable.identifier} """.stripMargin) case InsertIntoTable(s: SimpleCatalogRelation, _, _, _, _) => failAnalysis( s""" - |Please enable Hive support when inserting the regular tables: + |Hive support is required to insert into the following tables: |${s.catalogTable.identifier} """.stripMargin) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
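The reworded message only surfaces at analysis time; below is a rough, hedged sketch of the situation it describes and the remedy it suggests. The table name and SQL are illustrative, and how such a SimpleCatalogRelation ends up in the catalog is not shown in this hunk.

import org.apache.spark.sql.SparkSession

// A session built without Hive support keeps table metadata in the in-memory catalog
val spark = SparkSession.builder().appName("no-hive").getOrCreate()

// Querying a table that exists only as catalog metadata (a SimpleCatalogRelation)
// now fails analysis with the clearer wording from this patch, roughly:
//   spark.sql("SELECT * FROM src")
//   => "Hive support is required to select over the following tables: `src`"

// The remedy the message points to: create the session with Hive support enabled
// val spark = SparkSession.builder().enableHiveSupport().getOrCreate()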
spark git commit: [MINOR][SQL][DOCS] Fix docs of Dataset.scala and SQLImplicits.scala.
Repository: spark Updated Branches: refs/heads/branch-2.0 b8de4ad7d -> 4b19c9776 [MINOR][SQL][DOCS] Fix docs of Dataset.scala and SQLImplicits.scala. This PR fixes a sample code, a description, and indentations in docs. Manual. Author: Dongjoon Hyun Closes #13420 from dongjoon-hyun/minor_fix_dataset_doc. (cherry picked from commit 196a0d82730e78b573a64a791a6ad873aa9ec74d) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4b19c977 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4b19c977 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4b19c977 Branch: refs/heads/branch-2.0 Commit: 4b19c97764489a48abccab75e1b132b469383f44 Parents: b8de4ad Author: Dongjoon Hyun Authored: Tue May 31 17:36:24 2016 -0700 Committer: Andrew Or Committed: Tue May 31 17:37:49 2016 -0700 -- .../scala/org/apache/spark/sql/Dataset.scala| 36 ++-- .../org/apache/spark/sql/SQLImplicits.scala | 2 +- 2 files changed, 19 insertions(+), 19 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4b19c977/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 31000dc..7be49b1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1,19 +1,19 @@ /* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -*http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.spark.sql @@ -93,14 +93,14 @@ private[sql] object Dataset { * to some files on storage systems, using the `read` function available on a `SparkSession`. 
* {{{ * val people = spark.read.parquet("...").as[Person] // Scala - * Dataset people = spark.read().parquet("...").as(Encoders.bean(Person.class) // Java + * Dataset people = spark.read().parquet("...").as(Encoders.bean(Person.class)); // Java * }}} * * Datasets can also be created through transformations available on existing Datasets. For example, * the following creates a new Dataset by applying a filter on the existing one: * {{{ * val names = people.map(_.name) // in Scala; names is a Dataset[String] - * Dataset names = people.map((Person p) -> p.name, Encoders.STRING)) // in Java 8 + * Dataset names = people.map((Person p) -> p.name, Encoders.STRING)); // in Java 8 * }}} * * Dataset operations can also be untyped, through various domain-specific-language (DSL) @@ -110,7 +110,7 @@ private[sql] object Dataset { * To select a column from the Dataset, use `apply` method in Scala and `col` in Java. * {{{ * val ageCol = people("age") // in Scala - * Column ageCol = people.col("age") // in Java + * Column ageCol = people.col("age"); // in Java * }}} * * Note that the [[Column]] type can also be manipulated through its various functions. http://git-wip-us.apache.org/repos/asf/spark/blob/4b19c977/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQL
spark git commit: [MINOR][SQL][DOCS] Fix docs of Dataset.scala and SQLImplicits.scala.
Repository: spark Updated Branches: refs/heads/master dad5a6881 -> 196a0d827 [MINOR][SQL][DOCS] Fix docs of Dataset.scala and SQLImplicits.scala. This PR fixes a sample code, a description, and indentations in docs. Manual. Author: Dongjoon Hyun Closes #13420 from dongjoon-hyun/minor_fix_dataset_doc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/196a0d82 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/196a0d82 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/196a0d82 Branch: refs/heads/master Commit: 196a0d82730e78b573a64a791a6ad873aa9ec74d Parents: dad5a68 Author: Dongjoon Hyun Authored: Tue May 31 17:36:24 2016 -0700 Committer: Andrew Or Committed: Tue May 31 17:37:33 2016 -0700 -- .../scala/org/apache/spark/sql/Dataset.scala| 36 ++-- .../org/apache/spark/sql/SQLImplicits.scala | 2 +- 2 files changed, 19 insertions(+), 19 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/196a0d82/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 31000dc..7be49b1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1,19 +1,19 @@ /* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -*http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.spark.sql @@ -93,14 +93,14 @@ private[sql] object Dataset { * to some files on storage systems, using the `read` function available on a `SparkSession`. * {{{ * val people = spark.read.parquet("...").as[Person] // Scala - * Dataset people = spark.read().parquet("...").as(Encoders.bean(Person.class) // Java + * Dataset people = spark.read().parquet("...").as(Encoders.bean(Person.class)); // Java * }}} * * Datasets can also be created through transformations available on existing Datasets. 
For example, * the following creates a new Dataset by applying a filter on the existing one: * {{{ * val names = people.map(_.name) // in Scala; names is a Dataset[String] - * Dataset names = people.map((Person p) -> p.name, Encoders.STRING)) // in Java 8 + * Dataset names = people.map((Person p) -> p.name, Encoders.STRING)); // in Java 8 * }}} * * Dataset operations can also be untyped, through various domain-specific-language (DSL) @@ -110,7 +110,7 @@ private[sql] object Dataset { * To select a column from the Dataset, use `apply` method in Scala and `col` in Java. * {{{ * val ageCol = people("age") // in Scala - * Column ageCol = people.col("age") // in Java + * Column ageCol = people.col("age"); // in Java * }}} * * Note that the [[Column]] type can also be manipulated through its various functions. http://git-wip-us.apache.org/repos/asf/spark/blob/196a0d82/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala index f423e7d..b7ea2a
spark git commit: [SPARK-15670][JAVA API][SPARK CORE] label_accumulator_deprecate_in_java_spark_context
Repository: spark Updated Branches: refs/heads/branch-2.0 00fca2a05 -> 7f240eaee [SPARK-15670][JAVA API][SPARK CORE] label_accumulator_deprecate_in_java_spark_context ## What changes were proposed in this pull request? Add deprecate annotation for acumulator V1 interface in JavaSparkContext class ## How was this patch tested? N/A Author: WeichenXu Closes #13412 from WeichenXu123/label_accumulator_deprecate_in_java_spark_context. (cherry picked from commit dad5a68818436eb7feaeb762b72433248eff298f) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7f240eae Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7f240eae Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7f240eae Branch: refs/heads/branch-2.0 Commit: 7f240eaeed78b42c23973678c5f643e486d4cfa3 Parents: 00fca2a Author: WeichenXu Authored: Tue May 31 17:34:34 2016 -0700 Committer: Andrew Or Committed: Tue May 31 17:34:43 2016 -0700 -- .../main/scala/org/apache/spark/api/java/JavaSparkContext.scala | 4 1 file changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7f240eae/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala index fb63234..bfb6a35 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala @@ -596,6 +596,7 @@ class JavaSparkContext(val sc: SparkContext) * Create an [[org.apache.spark.Accumulator]] variable of a given type, which tasks can "add" * values to using the `add` method. Only the master can access the accumulator's `value`. */ + @deprecated("use AccumulatorV2", "2.0.0") def accumulator[T](initialValue: T, accumulatorParam: AccumulatorParam[T]): Accumulator[T] = sc.accumulator(initialValue)(accumulatorParam) @@ -605,6 +606,7 @@ class JavaSparkContext(val sc: SparkContext) * * This version supports naming the accumulator for display in Spark's web UI. */ + @deprecated("use AccumulatorV2", "2.0.0") def accumulator[T](initialValue: T, name: String, accumulatorParam: AccumulatorParam[T]) : Accumulator[T] = sc.accumulator(initialValue, name)(accumulatorParam) @@ -613,6 +615,7 @@ class JavaSparkContext(val sc: SparkContext) * Create an [[org.apache.spark.Accumulable]] shared variable of the given type, to which tasks * can "add" values with `add`. Only the master can access the accumuable's `value`. */ + @deprecated("use AccumulatorV2", "2.0.0") def accumulable[T, R](initialValue: T, param: AccumulableParam[T, R]): Accumulable[T, R] = sc.accumulable(initialValue)(param) @@ -622,6 +625,7 @@ class JavaSparkContext(val sc: SparkContext) * * This version supports naming the accumulator for display in Spark's web UI. */ + @deprecated("use AccumulatorV2", "2.0.0") def accumulable[T, R](initialValue: T, name: String, param: AccumulableParam[T, R]) : Accumulable[T, R] = sc.accumulable(initialValue, name)(param) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15670][JAVA API][SPARK CORE] label_accumulator_deprecate_in_java_spark_context
Repository: spark Updated Branches: refs/heads/master 06514d689 -> dad5a6881 [SPARK-15670][JAVA API][SPARK CORE] label_accumulator_deprecate_in_java_spark_context ## What changes were proposed in this pull request? Add deprecate annotation for acumulator V1 interface in JavaSparkContext class ## How was this patch tested? N/A Author: WeichenXu Closes #13412 from WeichenXu123/label_accumulator_deprecate_in_java_spark_context. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dad5a688 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dad5a688 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dad5a688 Branch: refs/heads/master Commit: dad5a68818436eb7feaeb762b72433248eff298f Parents: 06514d6 Author: WeichenXu Authored: Tue May 31 17:34:34 2016 -0700 Committer: Andrew Or Committed: Tue May 31 17:34:34 2016 -0700 -- .../main/scala/org/apache/spark/api/java/JavaSparkContext.scala | 4 1 file changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dad5a688/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala index fb63234..bfb6a35 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala @@ -596,6 +596,7 @@ class JavaSparkContext(val sc: SparkContext) * Create an [[org.apache.spark.Accumulator]] variable of a given type, which tasks can "add" * values to using the `add` method. Only the master can access the accumulator's `value`. */ + @deprecated("use AccumulatorV2", "2.0.0") def accumulator[T](initialValue: T, accumulatorParam: AccumulatorParam[T]): Accumulator[T] = sc.accumulator(initialValue)(accumulatorParam) @@ -605,6 +606,7 @@ class JavaSparkContext(val sc: SparkContext) * * This version supports naming the accumulator for display in Spark's web UI. */ + @deprecated("use AccumulatorV2", "2.0.0") def accumulator[T](initialValue: T, name: String, accumulatorParam: AccumulatorParam[T]) : Accumulator[T] = sc.accumulator(initialValue, name)(accumulatorParam) @@ -613,6 +615,7 @@ class JavaSparkContext(val sc: SparkContext) * Create an [[org.apache.spark.Accumulable]] shared variable of the given type, to which tasks * can "add" values with `add`. Only the master can access the accumuable's `value`. */ + @deprecated("use AccumulatorV2", "2.0.0") def accumulable[T, R](initialValue: T, param: AccumulableParam[T, R]): Accumulable[T, R] = sc.accumulable(initialValue)(param) @@ -622,6 +625,7 @@ class JavaSparkContext(val sc: SparkContext) * * This version supports naming the accumulator for display in Spark's web UI. */ + @deprecated("use AccumulatorV2", "2.0.0") def accumulable[T, R](initialValue: T, name: String, param: AccumulableParam[T, R]) : Accumulable[T, R] = sc.accumulable(initialValue, name)(param) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
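The deprecation notes point users at the AccumulatorV2 API. A brief sketch of the replacement, shown in Scala against the built-in SparkContext helpers (the app name, master, and accumulator name are illustrative assumptions):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.util.LongAccumulator

val sc = new SparkContext(new SparkConf().setAppName("acc-demo").setMaster("local[2]"))

// Old, now-deprecated style:
// val acc = sc.accumulator(0L)

// AccumulatorV2 style: a typed accumulator registered with the context
val acc: LongAccumulator = sc.longAccumulator("records seen")
sc.parallelize(1 to 100).foreach(_ => acc.add(1))
println(acc.value)   // 100, read on the driver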
spark git commit: [CORE][DOC][MINOR] typos + links
Repository: spark Updated Branches: refs/heads/branch-2.0 f0e8738c1 -> 00fca2a05 [CORE][DOC][MINOR] typos + links ## What changes were proposed in this pull request? A very tiny change to javadoc (which I don't mind if gets merged with a bigger change). I've just found it annoying and couldn't resist proposing a pull request. Sorry srowen and rxin. ## How was this patch tested? Manual build Author: Jacek Laskowski Closes #13383 from jaceklaskowski/memory-consumer. (cherry picked from commit 0f24713468088fa7617d208572179d558e1f286b) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/00fca2a0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/00fca2a0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/00fca2a0 Branch: refs/heads/branch-2.0 Commit: 00fca2a058d9803fe2b07d7c5827d51e821e523e Parents: f0e8738 Author: Jacek Laskowski Authored: Tue May 31 17:32:37 2016 -0700 Committer: Andrew Or Committed: Tue May 31 17:32:47 2016 -0700 -- core/src/main/java/org/apache/spark/memory/MemoryConsumer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/00fca2a0/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java -- diff --git a/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java b/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java index 38a21a8..fc1f3a8 100644 --- a/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java +++ b/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java @@ -23,7 +23,7 @@ import org.apache.spark.unsafe.array.LongArray; import org.apache.spark.unsafe.memory.MemoryBlock; /** - * An memory consumer of TaskMemoryManager, which support spilling. + * A memory consumer of {@link TaskMemoryManager} that supports spilling. * * Note: this only supports allocation / spilling of Tungsten memory. */ @@ -45,7 +45,7 @@ public abstract class MemoryConsumer { } /** - * Returns the memory mode, ON_HEAP or OFF_HEAP. + * Returns the memory mode, {@link MemoryMode#ON_HEAP} or {@link MemoryMode#OFF_HEAP}. */ public MemoryMode getMode() { return mode; - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [CORE][DOC][MINOR] typos + links
Repository: spark Updated Branches: refs/heads/master 8ca01a6fe -> 0f2471346 [CORE][DOC][MINOR] typos + links ## What changes were proposed in this pull request? A very tiny change to javadoc (which I don't mind if gets merged with a bigger change). I've just found it annoying and couldn't resist proposing a pull request. Sorry srowen and rxin. ## How was this patch tested? Manual build Author: Jacek Laskowski Closes #13383 from jaceklaskowski/memory-consumer. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0f247134 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0f247134 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0f247134 Branch: refs/heads/master Commit: 0f24713468088fa7617d208572179d558e1f286b Parents: 8ca01a6 Author: Jacek Laskowski Authored: Tue May 31 17:32:37 2016 -0700 Committer: Andrew Or Committed: Tue May 31 17:32:37 2016 -0700 -- core/src/main/java/org/apache/spark/memory/MemoryConsumer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0f247134/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java -- diff --git a/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java b/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java index 38a21a8..fc1f3a8 100644 --- a/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java +++ b/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java @@ -23,7 +23,7 @@ import org.apache.spark.unsafe.array.LongArray; import org.apache.spark.unsafe.memory.MemoryBlock; /** - * An memory consumer of TaskMemoryManager, which support spilling. + * A memory consumer of {@link TaskMemoryManager} that supports spilling. * * Note: this only supports allocation / spilling of Tungsten memory. */ @@ -45,7 +45,7 @@ public abstract class MemoryConsumer { } /** - * Returns the memory mode, ON_HEAP or OFF_HEAP. + * Returns the memory mode, {@link MemoryMode#ON_HEAP} or {@link MemoryMode#OFF_HEAP}. */ public MemoryMode getMode() { return mode; - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15662][SQL] Add since annotation for classes in sql.catalog
Repository: spark Updated Branches: refs/heads/branch-2.0 1f4c4def9 -> 978f54e76 [SPARK-15662][SQL] Add since annotation for classes in sql.catalog ## What changes were proposed in this pull request? This patch does a few things: 1. Adds since version annotation to methods and classes in sql.catalog. 2. Fixed a typo in FilterFunction and a whitespace issue in spark/api/java/function/package.scala 3. Added "database" field to Function class. ## How was this patch tested? Updated unit test case for "database" field in Function class. Author: Reynold Xin Closes #13406 from rxin/SPARK-15662. (cherry picked from commit 223f1d58c4f4b6eb0f0037a118a0bb635ae20bb1) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/978f54e7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/978f54e7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/978f54e7 Branch: refs/heads/branch-2.0 Commit: 978f54e76bab2e03e034e2dfcc2c1ebaffefc5e0 Parents: 1f4c4de Author: Reynold Xin Authored: Tue May 31 17:29:10 2016 -0700 Committer: Andrew Or Committed: Tue May 31 17:29:38 2016 -0700 -- .../spark/api/java/function/FilterFunction.java | 2 +- .../spark/api/java/function/package.scala | 2 +- .../org/apache/spark/sql/catalog/Catalog.scala | 2 + .../apache/spark/sql/catalog/interface.scala| 42 +++- .../apache/spark/sql/internal/CatalogImpl.scala | 1 + .../spark/sql/internal/CatalogSuite.scala | 22 +++--- 6 files changed, 62 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/978f54e7/core/src/main/java/org/apache/spark/api/java/function/FilterFunction.java -- diff --git a/core/src/main/java/org/apache/spark/api/java/function/FilterFunction.java b/core/src/main/java/org/apache/spark/api/java/function/FilterFunction.java index e8d999d..462ca3f 100644 --- a/core/src/main/java/org/apache/spark/api/java/function/FilterFunction.java +++ b/core/src/main/java/org/apache/spark/api/java/function/FilterFunction.java @@ -22,7 +22,7 @@ import java.io.Serializable; /** * Base interface for a function used in Dataset's filter function. * - * If the function returns true, the element is discarded in the returned Dataset. + * If the function returns true, the element is included in the returned Dataset. */ public interface FilterFunction extends Serializable { boolean call(T value) throws Exception; http://git-wip-us.apache.org/repos/asf/spark/blob/978f54e7/core/src/main/java/org/apache/spark/api/java/function/package.scala -- diff --git a/core/src/main/java/org/apache/spark/api/java/function/package.scala b/core/src/main/java/org/apache/spark/api/java/function/package.scala index 0f9bac7..e19f12f 100644 --- a/core/src/main/java/org/apache/spark/api/java/function/package.scala +++ b/core/src/main/java/org/apache/spark/api/java/function/package.scala @@ -22,4 +22,4 @@ package org.apache.spark.api.java * these interfaces to pass functions to various Java API methods for Spark. Please visit Spark's * Java programming guide for more details. 
*/ -package object function +package object function http://git-wip-us.apache.org/repos/asf/spark/blob/978f54e7/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala index a99bc3b..6ddb1a7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala @@ -24,6 +24,8 @@ import org.apache.spark.sql.types.StructType /** * Catalog interface for Spark. To access this, use `SparkSession.catalog`. + * + * @since 2.0.0 */ abstract class Catalog { http://git-wip-us.apache.org/repos/asf/spark/blob/978f54e7/sql/core/src/main/scala/org/apache/spark/sql/catalog/interface.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalog/interface.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalog/interface.scala index 0f7feb8..33032f0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalog/interface.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalog/interface.scala @@ -25,6 +25,14 @@ import org.apache.spark.sql.catalyst.DefinedByConstructorParams // Note: all classes here are expected to be wrapped in Datasets and so must extend // DefinedByConstructorParams for the catalog to be able to create encoders for
spark git commit: [SPARK-15662][SQL] Add since annotation for classes in sql.catalog
Repository: spark Updated Branches: refs/heads/master 695470429 -> 223f1d58c [SPARK-15662][SQL] Add since annotation for classes in sql.catalog ## What changes were proposed in this pull request? This patch does a few things: 1. Adds since version annotation to methods and classes in sql.catalog. 2. Fixed a typo in FilterFunction and a whitespace issue in spark/api/java/function/package.scala 3. Added "database" field to Function class. ## How was this patch tested? Updated unit test case for "database" field in Function class. Author: Reynold Xin Closes #13406 from rxin/SPARK-15662. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/223f1d58 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/223f1d58 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/223f1d58 Branch: refs/heads/master Commit: 223f1d58c4f4b6eb0f0037a118a0bb635ae20bb1 Parents: 6954704 Author: Reynold Xin Authored: Tue May 31 17:29:10 2016 -0700 Committer: Andrew Or Committed: Tue May 31 17:29:10 2016 -0700 -- .../spark/api/java/function/FilterFunction.java | 2 +- .../spark/api/java/function/package.scala | 2 +- .../org/apache/spark/sql/catalog/Catalog.scala | 2 + .../apache/spark/sql/catalog/interface.scala| 42 +++- .../apache/spark/sql/internal/CatalogImpl.scala | 1 + .../spark/sql/internal/CatalogSuite.scala | 22 +++--- 6 files changed, 62 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/223f1d58/core/src/main/java/org/apache/spark/api/java/function/FilterFunction.java -- diff --git a/core/src/main/java/org/apache/spark/api/java/function/FilterFunction.java b/core/src/main/java/org/apache/spark/api/java/function/FilterFunction.java index e8d999d..462ca3f 100644 --- a/core/src/main/java/org/apache/spark/api/java/function/FilterFunction.java +++ b/core/src/main/java/org/apache/spark/api/java/function/FilterFunction.java @@ -22,7 +22,7 @@ import java.io.Serializable; /** * Base interface for a function used in Dataset's filter function. * - * If the function returns true, the element is discarded in the returned Dataset. + * If the function returns true, the element is included in the returned Dataset. */ public interface FilterFunction extends Serializable { boolean call(T value) throws Exception; http://git-wip-us.apache.org/repos/asf/spark/blob/223f1d58/core/src/main/java/org/apache/spark/api/java/function/package.scala -- diff --git a/core/src/main/java/org/apache/spark/api/java/function/package.scala b/core/src/main/java/org/apache/spark/api/java/function/package.scala index 0f9bac7..e19f12f 100644 --- a/core/src/main/java/org/apache/spark/api/java/function/package.scala +++ b/core/src/main/java/org/apache/spark/api/java/function/package.scala @@ -22,4 +22,4 @@ package org.apache.spark.api.java * these interfaces to pass functions to various Java API methods for Spark. Please visit Spark's * Java programming guide for more details. 
*/ -package object function +package object function http://git-wip-us.apache.org/repos/asf/spark/blob/223f1d58/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala index a99bc3b..6ddb1a7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala @@ -24,6 +24,8 @@ import org.apache.spark.sql.types.StructType /** * Catalog interface for Spark. To access this, use `SparkSession.catalog`. + * + * @since 2.0.0 */ abstract class Catalog { http://git-wip-us.apache.org/repos/asf/spark/blob/223f1d58/sql/core/src/main/scala/org/apache/spark/sql/catalog/interface.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalog/interface.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalog/interface.scala index 0f7feb8..33032f0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalog/interface.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalog/interface.scala @@ -25,6 +25,14 @@ import org.apache.spark.sql.catalyst.DefinedByConstructorParams // Note: all classes here are expected to be wrapped in Datasets and so must extend // DefinedByConstructorParams for the catalog to be able to create encoders for them. +/** + * A database in Spark, as returned by the `listDatabases` method defined in [[Catalog]].
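The new database field is visible through the catalog API. A small hedged example of inspecting it (the session setup is illustrative; the column names follow the Function class fields added in the diff above):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("catalog-demo").getOrCreate()

// Each row is a sql.catalog.Function; built-in functions typically carry a null database
spark.catalog.listFunctions()
  .select("name", "database", "className", "isTemporary")
  .show(5, truncate = false)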
spark git commit: [HOTFIX] Scala 2.10 compile GaussianMixtureModel
Repository: spark Updated Branches: refs/heads/branch-2.0 17f43cc87 -> 5ea58898c [HOTFIX] Scala 2.10 compile GaussianMixtureModel Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5ea58898 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5ea58898 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5ea58898 Branch: refs/heads/branch-2.0 Commit: 5ea58898cc9413fd0b04b60db230c8894d8bb9ef Parents: 17f43cc Author: Andrew Or Authored: Fri May 27 11:43:01 2016 -0700 Committer: Andrew Or Committed: Fri May 27 11:43:50 2016 -0700 -- .../org/apache/spark/mllib/clustering/GaussianMixtureModel.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5ea58898/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala index 4b06816..f470b0f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala @@ -170,7 +170,7 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] { (weight, new MultivariateGaussian(mu, sigma)) }.unzip - new GaussianMixtureModel(weights, gaussians) + new GaussianMixtureModel(weights.toArray, gaussians.toArray) } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [HOTFIX] Scala 2.10 compile GaussianMixtureModel
Repository: spark Updated Branches: refs/heads/master 1b98fa2e4 -> b376a4eab [HOTFIX] Scala 2.10 compile GaussianMixtureModel Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b376a4ea Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b376a4ea Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b376a4ea Branch: refs/heads/master Commit: b376a4eabc82d622ea26290345c01465af7a628d Parents: 1b98fa2 Author: Andrew Or Authored: Fri May 27 11:43:01 2016 -0700 Committer: Andrew Or Committed: Fri May 27 11:43:01 2016 -0700 -- .../org/apache/spark/mllib/clustering/GaussianMixtureModel.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b376a4ea/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala index 4b06816..f470b0f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala @@ -170,7 +170,7 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] { (weight, new MultivariateGaussian(mu, sigma)) }.unzip - new GaussianMixtureModel(weights, gaussians) + new GaussianMixtureModel(weights.toArray, gaussians.toArray) } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
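A guess at why the explicit conversion matters (the exact types depend on the surrounding load code, which is not shown in the hunk): unzip over a mapped collection yields two Seq-like halves, while the model's constructor takes Array parameters, so the values need a .toArray before being passed, and Scala 2.10 will not bridge that gap for you. A minimal sketch of the shape, with hypothetical names:

// Hypothetical shapes, for illustration only
val pairs: Seq[(Double, String)] = Seq((0.5, "g1"), (0.5, "g2"))
val (weights, gaussians) = pairs.unzip           // two Seq values
def build(w: Array[Double], g: Array[String]) = (w, g)
// build(weights, gaussians)                     // does not compile: Seq is not Array
build(weights.toArray, gaussians.toArray)        // explicit conversion, as in the hotfix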
spark git commit: [SPARK-15569] Reduce frequency of updateBytesWritten function in Disk…
Repository: spark Updated Branches: refs/heads/master 5bdbedf22 -> ce756daa4 [SPARK-15569] Reduce frequency of updateBytesWritten function in Disk… ## What changes were proposed in this pull request? Profiling a Spark job spilling large amount of intermediate data we found that significant portion of time is being spent in DiskObjectWriter.updateBytesWritten function. Looking at the code, we see that the function is being called too frequently to update the number of bytes written to disk. We should reduce the frequency to avoid this. ## How was this patch tested? Tested by running the job on cluster and saw 20% CPU gain by this change. Author: Sital Kedia Closes #13332 from sitalkedia/DiskObjectWriter. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ce756daa Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ce756daa Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ce756daa Branch: refs/heads/master Commit: ce756daa4f012ebdc5a41bf5a89ff11b6dfdab8c Parents: 5bdbedf Author: Sital Kedia Authored: Fri May 27 11:22:39 2016 -0700 Committer: Andrew Or Committed: Fri May 27 11:22:39 2016 -0700 -- .../apache/spark/storage/DiskBlockObjectWriter.scala| 3 +-- .../spark/storage/DiskBlockObjectWriterSuite.scala | 12 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ce756daa/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala -- diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala index ab97d2e..5b493f4 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala @@ -203,8 +203,7 @@ private[spark] class DiskBlockObjectWriter( numRecordsWritten += 1 writeMetrics.incRecordsWritten(1) -// TODO: call updateBytesWritten() less frequently. 
-if (numRecordsWritten % 32 == 0) { +if (numRecordsWritten % 16384 == 0) { updateBytesWritten() } } http://git-wip-us.apache.org/repos/asf/spark/blob/ce756daa/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala b/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala index 8eff3c2..ec4ef4b 100644 --- a/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala @@ -53,13 +53,13 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach { assert(writeMetrics.recordsWritten === 1) // Metrics don't update on every write assert(writeMetrics.bytesWritten == 0) -// After 32 writes, metrics should update -for (i <- 0 until 32) { +// After 16384 writes, metrics should update +for (i <- 0 until 16384) { writer.flush() writer.write(Long.box(i), Long.box(i)) } assert(writeMetrics.bytesWritten > 0) -assert(writeMetrics.recordsWritten === 33) +assert(writeMetrics.recordsWritten === 16385) writer.commitAndClose() assert(file.length() == writeMetrics.bytesWritten) } @@ -75,13 +75,13 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach { assert(writeMetrics.recordsWritten === 1) // Metrics don't update on every write assert(writeMetrics.bytesWritten == 0) -// After 32 writes, metrics should update -for (i <- 0 until 32) { +// After 16384 writes, metrics should update +for (i <- 0 until 16384) { writer.flush() writer.write(Long.box(i), Long.box(i)) } assert(writeMetrics.bytesWritten > 0) -assert(writeMetrics.recordsWritten === 33) +assert(writeMetrics.recordsWritten === 16385) writer.revertPartialWritesAndClose() assert(writeMetrics.bytesWritten == 0) assert(writeMetrics.recordsWritten == 0) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15569] Reduce frequency of updateBytesWritten function in Disk…
Repository: spark Updated Branches: refs/heads/branch-2.0 89fdb6972 -> 30e87b55b [SPARK-15569] Reduce frequency of updateBytesWritten function in Disk… ## What changes were proposed in this pull request? Profiling a Spark job spilling large amount of intermediate data we found that significant portion of time is being spent in DiskObjectWriter.updateBytesWritten function. Looking at the code, we see that the function is being called too frequently to update the number of bytes written to disk. We should reduce the frequency to avoid this. ## How was this patch tested? Tested by running the job on cluster and saw 20% CPU gain by this change. Author: Sital Kedia Closes #13332 from sitalkedia/DiskObjectWriter. (cherry picked from commit ce756daa4f012ebdc5a41bf5a89ff11b6dfdab8c) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/30e87b55 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/30e87b55 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/30e87b55 Branch: refs/heads/branch-2.0 Commit: 30e87b55b6f59ca029778087710effc768fafc35 Parents: 89fdb69 Author: Sital Kedia Authored: Fri May 27 11:22:39 2016 -0700 Committer: Andrew Or Committed: Fri May 27 11:22:48 2016 -0700 -- .../apache/spark/storage/DiskBlockObjectWriter.scala| 3 +-- .../spark/storage/DiskBlockObjectWriterSuite.scala | 12 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/30e87b55/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala -- diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala index ab97d2e..5b493f4 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala @@ -203,8 +203,7 @@ private[spark] class DiskBlockObjectWriter( numRecordsWritten += 1 writeMetrics.incRecordsWritten(1) -// TODO: call updateBytesWritten() less frequently. 
-if (numRecordsWritten % 32 == 0) { +if (numRecordsWritten % 16384 == 0) { updateBytesWritten() } } http://git-wip-us.apache.org/repos/asf/spark/blob/30e87b55/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala b/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala index 8eff3c2..ec4ef4b 100644 --- a/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala @@ -53,13 +53,13 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach { assert(writeMetrics.recordsWritten === 1) // Metrics don't update on every write assert(writeMetrics.bytesWritten == 0) -// After 32 writes, metrics should update -for (i <- 0 until 32) { +// After 16384 writes, metrics should update +for (i <- 0 until 16384) { writer.flush() writer.write(Long.box(i), Long.box(i)) } assert(writeMetrics.bytesWritten > 0) -assert(writeMetrics.recordsWritten === 33) +assert(writeMetrics.recordsWritten === 16385) writer.commitAndClose() assert(file.length() == writeMetrics.bytesWritten) } @@ -75,13 +75,13 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach { assert(writeMetrics.recordsWritten === 1) // Metrics don't update on every write assert(writeMetrics.bytesWritten == 0) -// After 32 writes, metrics should update -for (i <- 0 until 32) { +// After 16384 writes, metrics should update +for (i <- 0 until 16384) { writer.flush() writer.write(Long.box(i), Long.box(i)) } assert(writeMetrics.bytesWritten > 0) -assert(writeMetrics.recordsWritten === 33) +assert(writeMetrics.recordsWritten === 16385) writer.revertPartialWritesAndClose() assert(writeMetrics.bytesWritten == 0) assert(writeMetrics.recordsWritten == 0) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
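The change is a classic cheap-counter throttle: run the expensive metrics refresh once every 16384 records instead of every 32. A generic sketch of the pattern under illustrative names (this is not Spark's internal class, just the shape of the technique):

// Illustrative throttling of an expensive side effect behind a cheap counter
class ThrottledMetrics(expensiveRefresh: () => Unit, every: Int = 16384) {
  private var recordsWritten = 0L
  def recordWritten(): Unit = {
    recordsWritten += 1
    if (recordsWritten % every == 0) {   // amortized: one refresh per `every` records
      expensiveRefresh()
    }
  }
}

val metrics = new ThrottledMetrics(() => println("refreshing bytes-written gauge"))
(1 to 50000).foreach(_ => metrics.recordWritten())   // refresh fires 3 times instead of 50000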
spark git commit: [MINOR][DOCS] Typo fixes in Dataset scaladoc
Repository: spark Updated Branches: refs/heads/branch-2.0 f52a95248 -> 89fdb6972 [MINOR][DOCS] Typo fixes in Dataset scaladoc ## What changes were proposed in this pull request? Minor typo fixes in Dataset scaladoc * Corrected context type as SparkSession, not SQLContext. liancheng rxin andrewor14 ## How was this patch tested? Compiled locally Author: Xinh Huynh Closes #13330 from xinhhuynh/fix-dataset-typos. (cherry picked from commit 5bdbedf2201efa6c34392aa9eff709761f027e1d) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/89fdb697 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/89fdb697 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/89fdb697 Branch: refs/heads/branch-2.0 Commit: 89fdb6972d5410f250bc56f8a834c939ee6653d2 Parents: f52a952 Author: Xinh Huynh Authored: Fri May 27 11:13:53 2016 -0700 Committer: Andrew Or Committed: Fri May 27 11:14:01 2016 -0700 -- sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/89fdb697/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 85f0cf8..abd16f2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -121,7 +121,7 @@ private[sql] object Dataset { * * A more concrete example in Scala: * {{{ - * // To create Dataset[Row] using SQLContext + * // To create Dataset[Row] using SparkSession * val people = spark.read.parquet("...") * val department = spark.read.parquet("...") * @@ -133,7 +133,7 @@ private[sql] object Dataset { * * and in Java: * {{{ - * // To create Dataset using SQLContext + * // To create Dataset using SparkSession * Dataset people = spark.read().parquet("..."); * Dataset department = spark.read().parquet("..."); * - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR][DOCS] Typo fixes in Dataset scaladoc
Repository: spark Updated Branches: refs/heads/master a52e68133 -> 5bdbedf22 [MINOR][DOCS] Typo fixes in Dataset scaladoc ## What changes were proposed in this pull request? Minor typo fixes in Dataset scaladoc * Corrected context type as SparkSession, not SQLContext. liancheng rxin andrewor14 ## How was this patch tested? Compiled locally Author: Xinh Huynh Closes #13330 from xinhhuynh/fix-dataset-typos. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5bdbedf2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5bdbedf2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5bdbedf2 Branch: refs/heads/master Commit: 5bdbedf2201efa6c34392aa9eff709761f027e1d Parents: a52e681 Author: Xinh Huynh Authored: Fri May 27 11:13:53 2016 -0700 Committer: Andrew Or Committed: Fri May 27 11:13:53 2016 -0700 -- sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5bdbedf2/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 85f0cf8..abd16f2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -121,7 +121,7 @@ private[sql] object Dataset { * * A more concrete example in Scala: * {{{ - * // To create Dataset[Row] using SQLContext + * // To create Dataset[Row] using SparkSession * val people = spark.read.parquet("...") * val department = spark.read.parquet("...") * @@ -133,7 +133,7 @@ private[sql] object Dataset { * * and in Java: * {{{ - * // To create Dataset using SQLContext + * // To create Dataset using SparkSession * Dataset people = spark.read().parquet("..."); * Dataset department = spark.read().parquet("..."); * - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
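The corrected scaladoc comments emphasize that Dataset[Row] values come from a SparkSession in 2.0 rather than a SQLContext. A minimal, runnable sketch of that usage, with made-up local data standing in for the parquet reads in the doc example:

    import org.apache.spark.sql.SparkSession

    object ScaladocExample {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[*]").appName("scaladoc-example").getOrCreate()
        import spark.implicits._
        // Stands in for spark.read.parquet("..."): the point is that `spark` is a SparkSession.
        val people = Seq(("Michael", 29), ("Andy", 30)).toDF("name", "age")
        people.filter($"age" > 29).show()
        spark.stop()
      }
    }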
spark git commit: [SPARK-15597][SQL] Add SparkSession.emptyDataset
Repository: spark Updated Branches: refs/heads/branch-2.0 e69639f43 -> f52a95248 [SPARK-15597][SQL] Add SparkSession.emptyDataset ## What changes were proposed in this pull request? This patch adds a new function emptyDataset to SparkSession, for creating an empty dataset. ## How was this patch tested? Added a test case. Author: Reynold Xin Closes #13344 from rxin/SPARK-15597. (cherry picked from commit a52e6813392ba4bdb1b818694b7ced8f6caa6a2b) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f52a9524 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f52a9524 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f52a9524 Branch: refs/heads/branch-2.0 Commit: f52a9524865b8c56058a65b29a1aaacffb709f69 Parents: e69639f Author: Reynold Xin Authored: Fri May 27 11:13:09 2016 -0700 Committer: Andrew Or Committed: Fri May 27 11:13:17 2016 -0700 -- .../main/scala/org/apache/spark/sql/SparkSession.scala | 12 .../test/scala/org/apache/spark/sql/DatasetSuite.scala | 6 ++ 2 files changed, 18 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f52a9524/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index aa60048..c9276cf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -224,6 +224,18 @@ class SparkSession private( /** * :: Experimental :: + * Creates a new [[Dataset]] of type T containing zero elements. + * + * @return 2.0.0 + */ + @Experimental + def emptyDataset[T: Encoder]: Dataset[T] = { +val encoder = implicitly[Encoder[T]] +new Dataset(self, LocalRelation(encoder.schema.toAttributes), encoder) + } + + /** + * :: Experimental :: * Creates a [[DataFrame]] from an RDD of Product (e.g. case classes, tuples). * * @group dataframes http://git-wip-us.apache.org/repos/asf/spark/blob/f52a9524/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 2a65916..e395007 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -46,6 +46,12 @@ class DatasetSuite extends QueryTest with SharedSQLContext { 1, 1, 1) } + test("emptyDataset") { +val ds = spark.emptyDataset[Int] +assert(ds.count() == 0L) +assert(ds.collect() sameElements Array.empty[Int]) + } + test("range") { assert(spark.range(10).map(_ + 1).reduce(_ + _) == 55) assert(spark.range(10).map{ case i: java.lang.Long => i + 1 }.reduce(_ + _) == 55) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15597][SQL] Add SparkSession.emptyDataset
Repository: spark Updated Branches: refs/heads/master 635fb30f8 -> a52e68133 [SPARK-15597][SQL] Add SparkSession.emptyDataset ## What changes were proposed in this pull request? This patch adds a new function emptyDataset to SparkSession, for creating an empty dataset. ## How was this patch tested? Added a test case. Author: Reynold Xin Closes #13344 from rxin/SPARK-15597. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a52e6813 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a52e6813 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a52e6813 Branch: refs/heads/master Commit: a52e6813392ba4bdb1b818694b7ced8f6caa6a2b Parents: 635fb30 Author: Reynold Xin Authored: Fri May 27 11:13:09 2016 -0700 Committer: Andrew Or Committed: Fri May 27 11:13:09 2016 -0700 -- .../main/scala/org/apache/spark/sql/SparkSession.scala | 12 .../test/scala/org/apache/spark/sql/DatasetSuite.scala | 6 ++ 2 files changed, 18 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a52e6813/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index aa60048..c9276cf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -224,6 +224,18 @@ class SparkSession private( /** * :: Experimental :: + * Creates a new [[Dataset]] of type T containing zero elements. + * + * @return 2.0.0 + */ + @Experimental + def emptyDataset[T: Encoder]: Dataset[T] = { +val encoder = implicitly[Encoder[T]] +new Dataset(self, LocalRelation(encoder.schema.toAttributes), encoder) + } + + /** + * :: Experimental :: * Creates a [[DataFrame]] from an RDD of Product (e.g. case classes, tuples). * * @group dataframes http://git-wip-us.apache.org/repos/asf/spark/blob/a52e6813/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 2a65916..e395007 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -46,6 +46,12 @@ class DatasetSuite extends QueryTest with SharedSQLContext { 1, 1, 1) } + test("emptyDataset") { +val ds = spark.emptyDataset[Int] +assert(ds.count() == 0L) +assert(ds.collect() sameElements Array.empty[Int]) + } + test("range") { assert(spark.range(10).map(_ + 1).reduce(_ + _) == 55) assert(spark.range(10).map{ case i: java.lang.Long => i + 1 }.reduce(_ + _) == 55) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
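A short usage sketch of the new API, assuming a local SparkSession; it mirrors the added DatasetSuite test:

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().master("local[*]").appName("empty-dataset").getOrCreate()
    import spark.implicits._

    val ds = spark.emptyDataset[Int]
    assert(ds.count() == 0L)
    assert(ds.collect().isEmpty)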
spark git commit: [SPARK-15599][SQL][DOCS] API docs for `createDataset` functions in SparkSession
Repository: spark Updated Branches: refs/heads/branch-2.0 a14c88acc -> e69639f43 [SPARK-15599][SQL][DOCS] API docs for `createDataset` functions in SparkSession ## What changes were proposed in this pull request? Adds API docs and usage examples for the 3 `createDataset` calls in `SparkSession` ## How was this patch tested? N/A Author: Sameer Agarwal Closes #13345 from sameeragarwal/dataset-doc. (cherry picked from commit 635fb30f83a66cc56f5fecfed5bff77873bf49a6) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e69639f4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e69639f4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e69639f4 Branch: refs/heads/branch-2.0 Commit: e69639f4334aae3ace5e50452603dd667467ea9a Parents: a14c88a Author: Sameer Agarwal Authored: Fri May 27 11:11:31 2016 -0700 Committer: Andrew Or Committed: Fri May 27 11:11:40 2016 -0700 -- .../org/apache/spark/sql/SparkSession.scala | 63 1 file changed, 63 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e69639f4/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 5dabe0e..aa60048 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -376,6 +376,40 @@ class SparkSession private( Dataset.ofRows(self, LogicalRelation(baseRelation)) } + /* --- * + | Methods for creating DataSets | + * --- */ + + /** + * :: Experimental :: + * Creates a [[Dataset]] from a local Seq of data of a given type. This method requires an + * encoder (to convert a JVM object of type `T` to and from the internal Spark SQL representation) + * that is generally created automatically through implicits from a `SparkSession`, or can be + * created explicitly by calling static methods on [[Encoders]]. + * + * == Example == + * + * {{{ + * + * import spark.implicits._ + * case class Person(name: String, age: Long) + * val data = Seq(Person("Michael", 29), Person("Andy", 30), Person("Justin", 19)) + * val ds = spark.createDataset(data) + * + * ds.show() + * // +---+---+ + * // | name|age| + * // +---+---+ + * // |Michael| 29| + * // | Andy| 30| + * // | Justin| 19| + * // +---+---+ + * }}} + * + * @since 2.0.0 + * @group dataset + */ + @Experimental def createDataset[T : Encoder](data: Seq[T]): Dataset[T] = { val enc = encoderFor[T] val attributes = enc.schema.toAttributes @@ -384,6 +418,17 @@ class SparkSession private( Dataset[T](self, plan) } + /** + * :: Experimental :: + * Creates a [[Dataset]] from an RDD of a given type. This method requires an + * encoder (to convert a JVM object of type `T` to and from the internal Spark SQL representation) + * that is generally created automatically through implicits from a `SparkSession`, or can be + * created explicitly by calling static methods on [[Encoders]]. + * + * @since 2.0.0 + * @group dataset + */ + @Experimental def createDataset[T : Encoder](data: RDD[T]): Dataset[T] = { val enc = encoderFor[T] val attributes = enc.schema.toAttributes @@ -392,6 +437,24 @@ class SparkSession private( Dataset[T](self, plan) } + /** + * :: Experimental :: + * Creates a [[Dataset]] from a [[java.util.List]] of a given type. 
This method requires an + * encoder (to convert a JVM object of type `T` to and from the internal Spark SQL representation) + * that is generally created automatically through implicits from a `SparkSession`, or can be + * created explicitly by calling static methods on [[Encoders]]. + * + * == Java Example == + * + * {{{ + * List data = Arrays.asList("hello", "world"); + * Dataset ds = spark.createDataset(data, Encoders.STRING()); + * }}} + * + * @since 2.0.0 + * @group dataset + */ + @Experimental def createDataset[T : Encoder](data: java.util.List[T]): Dataset[T] = { createDataset(data.asScala) } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15599][SQL][DOCS] API docs for `createDataset` functions in SparkSession
Repository: spark Updated Branches: refs/heads/master 4538443e2 -> 635fb30f8 [SPARK-15599][SQL][DOCS] API docs for `createDataset` functions in SparkSession ## What changes were proposed in this pull request? Adds API docs and usage examples for the 3 `createDataset` calls in `SparkSession` ## How was this patch tested? N/A Author: Sameer Agarwal Closes #13345 from sameeragarwal/dataset-doc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/635fb30f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/635fb30f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/635fb30f Branch: refs/heads/master Commit: 635fb30f83a66cc56f5fecfed5bff77873bf49a6 Parents: 4538443 Author: Sameer Agarwal Authored: Fri May 27 11:11:31 2016 -0700 Committer: Andrew Or Committed: Fri May 27 11:11:31 2016 -0700 -- .../org/apache/spark/sql/SparkSession.scala | 63 1 file changed, 63 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/635fb30f/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 5dabe0e..aa60048 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -376,6 +376,40 @@ class SparkSession private( Dataset.ofRows(self, LogicalRelation(baseRelation)) } + /* --- * + | Methods for creating DataSets | + * --- */ + + /** + * :: Experimental :: + * Creates a [[Dataset]] from a local Seq of data of a given type. This method requires an + * encoder (to convert a JVM object of type `T` to and from the internal Spark SQL representation) + * that is generally created automatically through implicits from a `SparkSession`, or can be + * created explicitly by calling static methods on [[Encoders]]. + * + * == Example == + * + * {{{ + * + * import spark.implicits._ + * case class Person(name: String, age: Long) + * val data = Seq(Person("Michael", 29), Person("Andy", 30), Person("Justin", 19)) + * val ds = spark.createDataset(data) + * + * ds.show() + * // +---+---+ + * // | name|age| + * // +---+---+ + * // |Michael| 29| + * // | Andy| 30| + * // | Justin| 19| + * // +---+---+ + * }}} + * + * @since 2.0.0 + * @group dataset + */ + @Experimental def createDataset[T : Encoder](data: Seq[T]): Dataset[T] = { val enc = encoderFor[T] val attributes = enc.schema.toAttributes @@ -384,6 +418,17 @@ class SparkSession private( Dataset[T](self, plan) } + /** + * :: Experimental :: + * Creates a [[Dataset]] from an RDD of a given type. This method requires an + * encoder (to convert a JVM object of type `T` to and from the internal Spark SQL representation) + * that is generally created automatically through implicits from a `SparkSession`, or can be + * created explicitly by calling static methods on [[Encoders]]. + * + * @since 2.0.0 + * @group dataset + */ + @Experimental def createDataset[T : Encoder](data: RDD[T]): Dataset[T] = { val enc = encoderFor[T] val attributes = enc.schema.toAttributes @@ -392,6 +437,24 @@ class SparkSession private( Dataset[T](self, plan) } + /** + * :: Experimental :: + * Creates a [[Dataset]] from a [[java.util.List]] of a given type. 
This method requires an + * encoder (to convert a JVM object of type `T` to and from the internal Spark SQL representation) + * that is generally created automatically through implicits from a `SparkSession`, or can be + * created explicitly by calling static methods on [[Encoders]]. + * + * == Java Example == + * + * {{{ + * List data = Arrays.asList("hello", "world"); + * Dataset ds = spark.createDataset(data, Encoders.STRING()); + * }}} + * + * @since 2.0.0 + * @group dataset + */ + @Experimental def createDataset[T : Encoder](data: java.util.List[T]): Dataset[T] = { createDataset(data.asScala) } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
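The new scaladoc illustrates the Seq and java.util.List overloads; the RDD overload is documented but has no inline example. A hedged sketch of that variant, with Person defined only for the example and a local SparkSession assumed:

    import org.apache.spark.sql.SparkSession

    case class Person(name: String, age: Long)

    val spark = SparkSession.builder().master("local[*]").appName("create-dataset").getOrCreate()
    import spark.implicits._

    val rdd = spark.sparkContext.parallelize(Seq(Person("Michael", 29), Person("Andy", 30)))
    val ds = spark.createDataset(rdd)   // encoder supplied by spark.implicits._
    ds.show()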
spark git commit: [SPARK-15584][SQL] Abstract duplicate code: `spark.sql.sources.` properties
Repository: spark Updated Branches: refs/heads/master d24e25157 -> 4538443e2 [SPARK-15584][SQL] Abstract duplicate code: `spark.sql.sources.` properties ## What changes were proposed in this pull request? This PR replaces `spark.sql.sources.` strings with `CreateDataSourceTableUtils.*` constant variables. ## How was this patch tested? Pass the existing Jenkins tests. Author: Dongjoon Hyun Closes #13349 from dongjoon-hyun/SPARK-15584. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4538443e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4538443e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4538443e Branch: refs/heads/master Commit: 4538443e276597530a27c6922e48503677b13956 Parents: d24e251 Author: Dongjoon Hyun Authored: Fri May 27 11:10:31 2016 -0700 Committer: Andrew Or Committed: Fri May 27 11:10:31 2016 -0700 -- .../spark/ml/source/libsvm/LibSVMRelation.scala | 3 +- .../command/createDataSourceTables.scala| 28 +- .../spark/sql/execution/command/ddl.scala | 19 +++ .../spark/sql/execution/command/tables.scala| 4 +- .../datasources/DataSourceStrategy.scala| 2 +- .../execution/datasources/WriterContainer.scala | 10 ++-- .../execution/datasources/csv/CSVRelation.scala | 3 +- .../datasources/json/JsonFileFormat.scala | 5 +- .../datasources/parquet/ParquetFileFormat.scala | 4 +- .../datasources/text/TextFileFormat.scala | 3 +- .../spark/sql/execution/command/DDLSuite.scala | 10 ++-- .../spark/sql/hive/HiveMetastoreCatalog.scala | 18 +++--- .../spark/sql/hive/orc/OrcFileFormat.scala | 3 +- .../sql/hive/MetastoreDataSourcesSuite.scala| 58 ++-- .../sql/hive/execution/HiveCommandSuite.scala | 16 +++--- .../spark/sql/sources/SimpleTextRelation.scala | 3 +- 16 files changed, 95 insertions(+), 94 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4538443e/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index 64ebf0c..7629369 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -34,6 +34,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection +import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ @@ -51,7 +52,7 @@ private[libsvm] class LibSVMOutputWriter( new TextOutputFormat[NullWritable, Text]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { val configuration = context.getConfiguration -val uniqueWriteJobId = configuration.get("spark.sql.sources.writeJobUUID") +val uniqueWriteJobId = configuration.get(CreateDataSourceTableUtils.DATASOURCE_WRITEJOBUUID) val taskAttemptId = context.getTaskAttemptID val split = taskAttemptId.getTaskID.getId new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$extension") http://git-wip-us.apache.org/repos/asf/spark/blob/4538443e/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala -- diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala index deedb68..4b9aab6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala @@ -256,15 +256,15 @@ case class CreateDataSourceTableAsSelectCommand( object CreateDataSourceTableUtils extends Logging { - // TODO: Actually replace usages with these variables (SPARK-15584) - val DATASOURCE_PREFIX = "spark.sql.sources." val DATASOURCE_PROVIDER = DATASOURCE_PREFIX + "provider" val DATASOURCE_WRITEJOBUUID = DATASOURCE_PREFIX + "writeJobUUID" val DATASOURCE_OUTPUTPATH = DATASOURCE_PREFIX + "output.path" - val DATASOURCE_SCHEMA_PREFIX = DATASOURCE_PREFIX + "schema." + val DATASOURCE_SCHEMA = DATASOURCE_PREFIX + "schema" + val DATASOURCE_S
spark git commit: [SPARK-15584][SQL] Abstract duplicate code: `spark.sql.sources.` properties
Repository: spark Updated Branches: refs/heads/branch-2.0 a355edeef -> a14c88acc [SPARK-15584][SQL] Abstract duplicate code: `spark.sql.sources.` properties ## What changes were proposed in this pull request? This PR replaces `spark.sql.sources.` strings with `CreateDataSourceTableUtils.*` constant variables. ## How was this patch tested? Pass the existing Jenkins tests. Author: Dongjoon Hyun Closes #13349 from dongjoon-hyun/SPARK-15584. (cherry picked from commit 4538443e276597530a27c6922e48503677b13956) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a14c88ac Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a14c88ac Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a14c88ac Branch: refs/heads/branch-2.0 Commit: a14c88acce0733f3db8b0508ae8b0417822e08d8 Parents: a355ede Author: Dongjoon Hyun Authored: Fri May 27 11:10:31 2016 -0700 Committer: Andrew Or Committed: Fri May 27 11:10:39 2016 -0700 -- .../spark/ml/source/libsvm/LibSVMRelation.scala | 3 +- .../command/createDataSourceTables.scala| 28 +- .../spark/sql/execution/command/ddl.scala | 19 +++ .../spark/sql/execution/command/tables.scala| 4 +- .../datasources/DataSourceStrategy.scala| 2 +- .../execution/datasources/WriterContainer.scala | 10 ++-- .../execution/datasources/csv/CSVRelation.scala | 3 +- .../datasources/json/JsonFileFormat.scala | 5 +- .../datasources/parquet/ParquetFileFormat.scala | 4 +- .../datasources/text/TextFileFormat.scala | 3 +- .../spark/sql/execution/command/DDLSuite.scala | 10 ++-- .../spark/sql/hive/HiveMetastoreCatalog.scala | 18 +++--- .../spark/sql/hive/orc/OrcFileFormat.scala | 3 +- .../sql/hive/MetastoreDataSourcesSuite.scala| 58 ++-- .../sql/hive/execution/HiveCommandSuite.scala | 16 +++--- .../spark/sql/sources/SimpleTextRelation.scala | 3 +- 16 files changed, 95 insertions(+), 94 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a14c88ac/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index 64ebf0c..7629369 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -34,6 +34,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection +import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ @@ -51,7 +52,7 @@ private[libsvm] class LibSVMOutputWriter( new TextOutputFormat[NullWritable, Text]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { val configuration = context.getConfiguration -val uniqueWriteJobId = configuration.get("spark.sql.sources.writeJobUUID") +val uniqueWriteJobId = configuration.get(CreateDataSourceTableUtils.DATASOURCE_WRITEJOBUUID) val taskAttemptId = context.getTaskAttemptID val split = taskAttemptId.getTaskID.getId new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$extension") 
http://git-wip-us.apache.org/repos/asf/spark/blob/a14c88ac/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala index deedb68..4b9aab6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala @@ -256,15 +256,15 @@ case class CreateDataSourceTableAsSelectCommand( object CreateDataSourceTableUtils extends Logging { - // TODO: Actually replace usages with these variables (SPARK-15584) - val DATASOURCE_PREFIX = "spark.sql.sources." val DATASOURCE_PROVIDER = DATASOURCE_PREFIX + "provider" val DATASOURCE_WRITEJOBUUID = DATASOURCE_PREFIX + "writeJobUUID" val DATASOURCE_OUTPUTPATH = DATASOURCE_PREFIX + "output.path" - val DATASOURCE_SCHEMA_PREFIX = D
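The refactoring above is the usual constants-over-string-literals pattern: property keys built from the "spark.sql.sources." prefix are declared once and referenced by name at every call site. A minimal sketch of the pattern with shortened, invented names (not the full CreateDataSourceTableUtils object):

    // Property keys are declared once...
    object DataSourcePropertyKeys {
      val Prefix = "spark.sql.sources."
      val Provider = Prefix + "provider"
      val WriteJobUUID = Prefix + "writeJobUUID"
      val OutputPath = Prefix + "output.path"
    }

    // ...and call sites read them through the constant instead of a raw string literal.
    def uniqueWriteJobId(conf: Map[String, String]): Option[String] =
      conf.get(DataSourcePropertyKeys.WriteJobUUID)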
spark git commit: [SPARK-15603][MLLIB] Replace SQLContext with SparkSession in ML/MLLib
Repository: spark Updated Branches: refs/heads/branch-2.0 2cb84dd23 -> a355edeef [SPARK-15603][MLLIB] Replace SQLContext with SparkSession in ML/MLLib ## What changes were proposed in this pull request? This PR replaces all deprecated `SQLContext` occurrences with `SparkSession` in `ML/MLLib` module except the following two classes. These two classes use `SQLContext` in their function signatures. - ReadWrite.scala - TreeModels.scala ## How was this patch tested? Pass the existing Jenkins tests. Author: Dongjoon Hyun Closes #13352 from dongjoon-hyun/SPARK-15603. (cherry picked from commit d24e251572d39a453293cabfe14e4aed25a55208) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a355edee Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a355edee Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a355edee Branch: refs/heads/branch-2.0 Commit: a355edeefa16988da8b05d2539a91277e75e823c Parents: 2cb84dd Author: Dongjoon Hyun Authored: Fri May 27 11:09:15 2016 -0700 Committer: Andrew Or Committed: Fri May 27 11:09:33 2016 -0700 -- .../spark/ml/clustering/GaussianMixture.scala | 7 ++-- .../spark/ml/feature/SQLTransformer.scala | 14 .../org/apache/spark/ml/feature/Word2Vec.scala | 16 - .../spark/mllib/api/python/PythonMLLibAPI.scala | 23 +++-- .../classification/LogisticRegression.scala | 19 +-- .../spark/mllib/classification/NaiveBayes.scala | 24 ++--- .../impl/GLMClassificationModel.scala | 18 +- .../mllib/clustering/BisectingKMeansModel.scala | 12 +++ .../mllib/clustering/GaussianMixtureModel.scala | 16 - .../spark/mllib/clustering/KMeansModel.scala| 13 --- .../spark/mllib/clustering/LDAModel.scala | 36 +--- .../clustering/PowerIterationClustering.scala | 12 +++ .../spark/mllib/feature/ChiSqSelector.scala | 13 --- .../apache/spark/mllib/feature/Word2Vec.scala | 13 +++ .../org/apache/spark/mllib/fpm/FPGrowth.scala | 10 +++--- .../org/apache/spark/mllib/fpm/PrefixSpan.scala | 10 +++--- .../MatrixFactorizationModel.scala | 12 +++ .../mllib/regression/IsotonicRegression.scala | 12 +++ .../regression/impl/GLMRegressionModel.scala| 18 +- .../mllib/tree/model/DecisionTreeModel.scala| 20 +-- .../mllib/tree/model/treeEnsembleModels.scala | 17 + .../ml/feature/QuantileDiscretizerSuite.scala | 14 .../mllib/util/MLlibTestSparkContext.scala | 6 ++-- 23 files changed, 160 insertions(+), 195 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a355edee/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index 88b6b27..773e50e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -20,7 +20,6 @@ package org.apache.spark.ml.clustering import breeze.linalg.{DenseVector => BDV} import org.apache.hadoop.fs.Path -import org.apache.spark.SparkContext import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.impl.Utils.EPSILON @@ -33,7 +32,7 @@ import org.apache.spark.mllib.clustering.{GaussianMixture => MLlibGM} import org.apache.spark.mllib.linalg.{Matrices => OldMatrices, Matrix => OldMatrix, Vector => OldVector, Vectors => OldVectors, VectorUDT => OldVectorUDT} import org.apache.spark.rdd.RDD -import 
org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext} +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{IntegerType, StructType} @@ -134,9 +133,7 @@ class GaussianMixtureModel private[ml] ( val modelGaussians = gaussians.map { gaussian => (OldVectors.fromML(gaussian.mean), OldMatrices.fromML(gaussian.cov)) } -val sc = SparkContext.getOrCreate() -val sqlContext = SQLContext.getOrCreate(sc) -sqlContext.createDataFrame(modelGaussians).toDF("mean", "cov") + SparkSession.builder().getOrCreate().createDataFrame(modelGaussians).toDF("mean", "cov") } /** http://git-wip-us.apache.org/repos/asf/spark/blob/a355edee/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala b/ml
spark git commit: [SPARK-15603][MLLIB] Replace SQLContext with SparkSession in ML/MLLib
Repository: spark Updated Branches: refs/heads/master c17272902 -> d24e25157 [SPARK-15603][MLLIB] Replace SQLContext with SparkSession in ML/MLLib ## What changes were proposed in this pull request? This PR replaces all deprecated `SQLContext` occurrences with `SparkSession` in `ML/MLLib` module except the following two classes. These two classes use `SQLContext` in their function signatures. - ReadWrite.scala - TreeModels.scala ## How was this patch tested? Pass the existing Jenkins tests. Author: Dongjoon Hyun Closes #13352 from dongjoon-hyun/SPARK-15603. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d24e2515 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d24e2515 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d24e2515 Branch: refs/heads/master Commit: d24e251572d39a453293cabfe14e4aed25a55208 Parents: c172729 Author: Dongjoon Hyun Authored: Fri May 27 11:09:15 2016 -0700 Committer: Andrew Or Committed: Fri May 27 11:09:15 2016 -0700 -- .../spark/ml/clustering/GaussianMixture.scala | 7 ++-- .../spark/ml/feature/SQLTransformer.scala | 14 .../org/apache/spark/ml/feature/Word2Vec.scala | 16 - .../spark/mllib/api/python/PythonMLLibAPI.scala | 23 +++-- .../classification/LogisticRegression.scala | 19 +-- .../spark/mllib/classification/NaiveBayes.scala | 24 ++--- .../impl/GLMClassificationModel.scala | 18 +- .../mllib/clustering/BisectingKMeansModel.scala | 12 +++ .../mllib/clustering/GaussianMixtureModel.scala | 16 - .../spark/mllib/clustering/KMeansModel.scala| 13 --- .../spark/mllib/clustering/LDAModel.scala | 36 +--- .../clustering/PowerIterationClustering.scala | 12 +++ .../spark/mllib/feature/ChiSqSelector.scala | 13 --- .../apache/spark/mllib/feature/Word2Vec.scala | 13 +++ .../org/apache/spark/mllib/fpm/FPGrowth.scala | 10 +++--- .../org/apache/spark/mllib/fpm/PrefixSpan.scala | 10 +++--- .../MatrixFactorizationModel.scala | 12 +++ .../mllib/regression/IsotonicRegression.scala | 12 +++ .../regression/impl/GLMRegressionModel.scala| 18 +- .../mllib/tree/model/DecisionTreeModel.scala| 20 +-- .../mllib/tree/model/treeEnsembleModels.scala | 17 + .../ml/feature/QuantileDiscretizerSuite.scala | 14 .../mllib/util/MLlibTestSparkContext.scala | 6 ++-- 23 files changed, 160 insertions(+), 195 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d24e2515/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index 88b6b27..773e50e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -20,7 +20,6 @@ package org.apache.spark.ml.clustering import breeze.linalg.{DenseVector => BDV} import org.apache.hadoop.fs.Path -import org.apache.spark.SparkContext import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.impl.Utils.EPSILON @@ -33,7 +32,7 @@ import org.apache.spark.mllib.clustering.{GaussianMixture => MLlibGM} import org.apache.spark.mllib.linalg.{Matrices => OldMatrices, Matrix => OldMatrix, Vector => OldVector, Vectors => OldVectors, VectorUDT => OldVectorUDT} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext} +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} 
import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{IntegerType, StructType} @@ -134,9 +133,7 @@ class GaussianMixtureModel private[ml] ( val modelGaussians = gaussians.map { gaussian => (OldVectors.fromML(gaussian.mean), OldMatrices.fromML(gaussian.cov)) } -val sc = SparkContext.getOrCreate() -val sqlContext = SQLContext.getOrCreate(sc) -sqlContext.createDataFrame(modelGaussians).toDF("mean", "cov") + SparkSession.builder().getOrCreate().createDataFrame(modelGaussians).toDF("mean", "cov") } /** http://git-wip-us.apache.org/repos/asf/spark/blob/d24e2515/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala index 2d4cac6..bd8f949 100644 --- a/
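The core of the change is replacing SQLContext.getOrCreate(SparkContext.getOrCreate()) with a direct SparkSession lookup when building DataFrames inside ML/MLlib. A hedged sketch of the new call shape, using placeholder data instead of the real Gaussian model state and assuming an active SparkSession (add .master("local[*]") if running standalone):

    import org.apache.spark.sql.SparkSession

    // Placeholder rows standing in for the (mean, cov) pairs built from the model.
    val modelGaussians = Seq(("mean-placeholder", "cov-placeholder"))

    // Old shape: SQLContext.getOrCreate(SparkContext.getOrCreate()).createDataFrame(...)
    // New shape: go through the current SparkSession directly.
    val df = SparkSession.builder().getOrCreate()
      .createDataFrame(modelGaussians)
      .toDF("mean", "cov")
    df.show()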
spark git commit: [SPARK-15538][SPARK-15539][SQL] Truncate table fixes round 2
Repository: spark Updated Branches: refs/heads/branch-2.0 d3cd579d7 -> c99ad9215 [SPARK-15538][SPARK-15539][SQL] Truncate table fixes round 2 ## What changes were proposed in this pull request? Two more changes: (1) Fix truncate table for data source tables (only for cases without `PARTITION`) (2) Disallow truncating external tables or views ## How was this patch tested? `DDLSuite` Author: Andrew Or Closes #13315 from andrewor14/truncate-table. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c99ad921 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c99ad921 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c99ad921 Branch: refs/heads/branch-2.0 Commit: c99ad9215cea7f0f983ce06e9cbdbcfa7bd092bf Parents: d3cd579 Author: Andrew Or Authored: Thu May 26 19:01:41 2016 -0700 Committer: Andrew Or Committed: Thu May 26 19:02:28 2016 -0700 -- .../spark/sql/execution/command/tables.scala| 78 +--- .../spark/sql/execution/command/DDLSuite.scala | 34 + 2 files changed, 86 insertions(+), 26 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c99ad921/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index bef4c92..e34beec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -285,41 +285,67 @@ case class TruncateTableCommand( tableName: TableIdentifier, partitionSpec: Option[TablePartitionSpec]) extends RunnableCommand { - override def run(sparkSession: SparkSession): Seq[Row] = { -val catalog = sparkSession.sessionState.catalog + override def run(spark: SparkSession): Seq[Row] = { +val catalog = spark.sessionState.catalog if (!catalog.tableExists(tableName)) { throw new AnalysisException(s"Table '$tableName' in TRUNCATE TABLE does not exist.") -} else if (catalog.isTemporaryTable(tableName)) { +} +if (catalog.isTemporaryTable(tableName)) { throw new AnalysisException( s"Operation not allowed: TRUNCATE TABLE on temporary tables: '$tableName'") -} else { - val locations = if (partitionSpec.isDefined) { -catalog.listPartitions(tableName, partitionSpec).map(_.storage.locationUri) +} +val table = catalog.getTableMetadata(tableName) +if (table.tableType == CatalogTableType.EXTERNAL) { + throw new AnalysisException( +s"Operation not allowed: TRUNCATE TABLE on external tables: '$tableName'") +} +if (table.tableType == CatalogTableType.VIEW) { + throw new AnalysisException( +s"Operation not allowed: TRUNCATE TABLE on views: '$tableName'") +} +val isDatasourceTable = DDLUtils.isDatasourceTable(table) +if (isDatasourceTable && partitionSpec.isDefined) { + throw new AnalysisException( +s"Operation not allowed: TRUNCATE TABLE ... PARTITION is not supported " + +s"for tables created using the data sources API: '$tableName'") +} +if (table.partitionColumnNames.isEmpty && partitionSpec.isDefined) { + throw new AnalysisException( +s"Operation not allowed: TRUNCATE TABLE ... 
PARTITION is not supported " + +s"for tables that are not partitioned: '$tableName'") +} +val locations = + if (isDatasourceTable || table.partitionColumnNames.isEmpty) { +Seq(table.storage.locationUri) } else { -val table = catalog.getTableMetadata(tableName) -if (table.partitionColumnNames.nonEmpty) { - catalog.listPartitions(tableName).map(_.storage.locationUri) -} else { - Seq(table.storage.locationUri) -} +catalog.listPartitions(tableName, partitionSpec).map(_.storage.locationUri) } - val hadoopConf = sparkSession.sessionState.newHadoopConf() - locations.foreach { location => -if (location.isDefined) { - val path = new Path(location.get) - try { -val fs = path.getFileSystem(hadoopConf) -fs.delete(path, true) -fs.mkdirs(path) - } catch { -case NonFatal(e) => - throw new AnalysisException( -s"Failed to truncate table '$tableName' when removing data of the path: $path " + - s"because of ${e.toS
spark git commit: [SPARK-15538][SPARK-15539][SQL] Truncate table fixes round 2
Repository: spark Updated Branches: refs/heads/master 3ac2363d7 -> 008a5377d [SPARK-15538][SPARK-15539][SQL] Truncate table fixes round 2 ## What changes were proposed in this pull request? Two more changes: (1) Fix truncate table for data source tables (only for cases without `PARTITION`) (2) Disallow truncating external tables or views ## How was this patch tested? `DDLSuite` Author: Andrew Or Closes #13315 from andrewor14/truncate-table. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/008a5377 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/008a5377 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/008a5377 Branch: refs/heads/master Commit: 008a5377d57ce6692eca4a41539fb27978b58e01 Parents: 3ac2363 Author: Andrew Or Authored: Thu May 26 19:01:41 2016 -0700 Committer: Andrew Or Committed: Thu May 26 19:01:41 2016 -0700 -- .../spark/sql/execution/command/tables.scala| 78 +--- .../spark/sql/execution/command/DDLSuite.scala | 34 + 2 files changed, 86 insertions(+), 26 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/008a5377/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index bef4c92..e34beec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -285,41 +285,67 @@ case class TruncateTableCommand( tableName: TableIdentifier, partitionSpec: Option[TablePartitionSpec]) extends RunnableCommand { - override def run(sparkSession: SparkSession): Seq[Row] = { -val catalog = sparkSession.sessionState.catalog + override def run(spark: SparkSession): Seq[Row] = { +val catalog = spark.sessionState.catalog if (!catalog.tableExists(tableName)) { throw new AnalysisException(s"Table '$tableName' in TRUNCATE TABLE does not exist.") -} else if (catalog.isTemporaryTable(tableName)) { +} +if (catalog.isTemporaryTable(tableName)) { throw new AnalysisException( s"Operation not allowed: TRUNCATE TABLE on temporary tables: '$tableName'") -} else { - val locations = if (partitionSpec.isDefined) { -catalog.listPartitions(tableName, partitionSpec).map(_.storage.locationUri) +} +val table = catalog.getTableMetadata(tableName) +if (table.tableType == CatalogTableType.EXTERNAL) { + throw new AnalysisException( +s"Operation not allowed: TRUNCATE TABLE on external tables: '$tableName'") +} +if (table.tableType == CatalogTableType.VIEW) { + throw new AnalysisException( +s"Operation not allowed: TRUNCATE TABLE on views: '$tableName'") +} +val isDatasourceTable = DDLUtils.isDatasourceTable(table) +if (isDatasourceTable && partitionSpec.isDefined) { + throw new AnalysisException( +s"Operation not allowed: TRUNCATE TABLE ... PARTITION is not supported " + +s"for tables created using the data sources API: '$tableName'") +} +if (table.partitionColumnNames.isEmpty && partitionSpec.isDefined) { + throw new AnalysisException( +s"Operation not allowed: TRUNCATE TABLE ... 
PARTITION is not supported " + +s"for tables that are not partitioned: '$tableName'") +} +val locations = + if (isDatasourceTable || table.partitionColumnNames.isEmpty) { +Seq(table.storage.locationUri) } else { -val table = catalog.getTableMetadata(tableName) -if (table.partitionColumnNames.nonEmpty) { - catalog.listPartitions(tableName).map(_.storage.locationUri) -} else { - Seq(table.storage.locationUri) -} +catalog.listPartitions(tableName, partitionSpec).map(_.storage.locationUri) } - val hadoopConf = sparkSession.sessionState.newHadoopConf() - locations.foreach { location => -if (location.isDefined) { - val path = new Path(location.get) - try { -val fs = path.getFileSystem(hadoopConf) -fs.delete(path, true) -fs.mkdirs(path) - } catch { -case NonFatal(e) => - throw new AnalysisException( -s"Failed to truncate table '$tableName' when removing data of the path: $path " + - s"because of ${e.toS
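A behavior sketch of the two fixes, with made-up table names; this assumes a catalog that supports persistent data source tables (for example a local spark-warehouse directory) and is an illustration, not the DDLSuite test verbatim:

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().master("local[*]").appName("truncate-example").getOrCreate()
    import spark.implicits._

    // A managed data source table (assumption: the catalog persists it under spark-warehouse).
    Seq(1, 2, 3).toDF("id").write.saveAsTable("tab1")

    // After this patch, TRUNCATE TABLE works for data source tables (without PARTITION)...
    spark.sql("TRUNCATE TABLE tab1")
    assert(spark.table("tab1").count() == 0L)

    // ...while truncating an EXTERNAL table or a view is rejected with an AnalysisException.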
spark git commit: [SPARK-15552][SQL] Remove unnecessary private[sql] methods in SparkSession
Repository: spark Updated Branches: refs/heads/branch-2.0 eb0c49799 -> 6eea33ec3 [SPARK-15552][SQL] Remove unnecessary private[sql] methods in SparkSession ## What changes were proposed in this pull request? SparkSession has a list of unnecessary private[sql] methods. These methods cause some trouble because private[sql] doesn't apply in Java. In the cases that they are easy to remove, we can simply remove them. This patch does that. As part of this pull request, I also replaced a bunch of protected[sql] with private[sql], to tighten up visibility. ## How was this patch tested? Updated test cases to reflect the changes. Author: Reynold Xin Closes #13319 from rxin/SPARK-15552. (cherry picked from commit 0f61d6efb45b9ee94fa663f67c4489fbdae2eded) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6eea33ec Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6eea33ec Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6eea33ec Branch: refs/heads/branch-2.0 Commit: 6eea33ec3ea805a16b812287141d22ce1aa659f9 Parents: eb0c497 Author: Reynold Xin Authored: Thu May 26 13:03:07 2016 -0700 Committer: Andrew Or Committed: Thu May 26 13:03:20 2016 -0700 -- .../org/apache/spark/sql/DataFrameWriter.scala | 4 +- .../scala/org/apache/spark/sql/Dataset.scala| 32 ++-- .../scala/org/apache/spark/sql/SQLContext.scala | 49 +++--- .../org/apache/spark/sql/SparkSession.scala | 54 .../spark/sql/execution/CacheManager.scala | 2 +- .../spark/sql/execution/QueryExecution.scala| 2 +- .../spark/sql/execution/command/commands.scala | 2 +- .../spark/sql/execution/command/ddl.scala | 3 +- .../spark/sql/execution/command/views.scala | 6 +-- .../sql/execution/datasources/DataSource.scala | 2 +- .../InsertIntoDataSourceCommand.scala | 2 +- .../apache/spark/sql/internal/CatalogImpl.scala | 24 - .../spark/sql/internal/SessionState.scala | 4 +- .../org/apache/spark/sql/CachedTableSuite.scala | 16 +++--- .../apache/spark/sql/DataFramePivotSuite.scala | 2 +- .../apache/spark/sql/DatasetCacheSuite.scala| 12 +++-- .../scala/org/apache/spark/sql/JoinSuite.scala | 8 +-- .../columnar/InMemoryColumnarQuerySuite.scala | 8 +-- .../sql/execution/metric/SQLMetricsSuite.scala | 18 --- .../sql/execution/ui/SQLListenerSuite.scala | 16 +++--- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 4 +- .../sql/hive/thriftserver/SparkSQLDriver.scala | 2 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 7 ++- .../execution/CreateTableAsSelectCommand.scala | 2 +- .../hive/execution/InsertIntoHiveTable.scala| 2 +- .../spark/sql/hive/ErrorPositionSuite.scala | 2 +- .../spark/sql/hive/ShowCreateTableSuite.scala | 4 +- .../apache/spark/sql/hive/StatisticsSuite.scala | 2 +- .../hive/execution/ConcurrentHiveSuite.scala| 6 +-- 29 files changed, 129 insertions(+), 168 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6eea33ec/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 3aacce7..2e85e36 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -402,7 +402,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { Project(inputDataCols ++ inputPartCols, df.logicalPlan) }.getOrElse(df.logicalPlan) -df.sparkSession.executePlan( +df.sparkSession.sessionState.executePlan( 
InsertIntoTable( UnresolvedRelation(tableIdent), partitions.getOrElse(Map.empty[String, Option[String]]), @@ -524,7 +524,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { mode, extraOptions.toMap, df.logicalPlan) -df.sparkSession.executePlan(cmd).toRdd +df.sparkSession.sessionState.executePlan(cmd).toRdd } } http://git-wip-us.apache.org/repos/asf/spark/blob/6eea33ec/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index e5140fc..961ae32 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -58,7 +58,7 @@ private[sql] object Dataset { } def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPla
spark git commit: [SPARK-15552][SQL] Remove unnecessary private[sql] methods in SparkSession
Repository: spark Updated Branches: refs/heads/master 594a1bf20 -> 0f61d6efb [SPARK-15552][SQL] Remove unnecessary private[sql] methods in SparkSession ## What changes were proposed in this pull request? SparkSession has a list of unnecessary private[sql] methods. These methods cause some trouble because private[sql] doesn't apply in Java. In the cases that they are easy to remove, we can simply remove them. This patch does that. As part of this pull request, I also replaced a bunch of protected[sql] with private[sql], to tighten up visibility. ## How was this patch tested? Updated test cases to reflect the changes. Author: Reynold Xin Closes #13319 from rxin/SPARK-15552. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0f61d6ef Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0f61d6ef Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0f61d6ef Branch: refs/heads/master Commit: 0f61d6efb45b9ee94fa663f67c4489fbdae2eded Parents: 594a1bf Author: Reynold Xin Authored: Thu May 26 13:03:07 2016 -0700 Committer: Andrew Or Committed: Thu May 26 13:03:07 2016 -0700 -- .../org/apache/spark/sql/DataFrameWriter.scala | 4 +- .../scala/org/apache/spark/sql/Dataset.scala| 32 ++-- .../scala/org/apache/spark/sql/SQLContext.scala | 49 +++--- .../org/apache/spark/sql/SparkSession.scala | 54 .../spark/sql/execution/CacheManager.scala | 2 +- .../spark/sql/execution/QueryExecution.scala| 2 +- .../spark/sql/execution/command/commands.scala | 2 +- .../spark/sql/execution/command/ddl.scala | 3 +- .../spark/sql/execution/command/views.scala | 6 +-- .../sql/execution/datasources/DataSource.scala | 2 +- .../InsertIntoDataSourceCommand.scala | 2 +- .../apache/spark/sql/internal/CatalogImpl.scala | 24 - .../spark/sql/internal/SessionState.scala | 4 +- .../org/apache/spark/sql/CachedTableSuite.scala | 16 +++--- .../apache/spark/sql/DataFramePivotSuite.scala | 2 +- .../apache/spark/sql/DatasetCacheSuite.scala| 12 +++-- .../scala/org/apache/spark/sql/JoinSuite.scala | 8 +-- .../columnar/InMemoryColumnarQuerySuite.scala | 8 +-- .../sql/execution/metric/SQLMetricsSuite.scala | 18 --- .../sql/execution/ui/SQLListenerSuite.scala | 16 +++--- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 4 +- .../sql/hive/thriftserver/SparkSQLDriver.scala | 2 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 7 ++- .../execution/CreateTableAsSelectCommand.scala | 2 +- .../hive/execution/InsertIntoHiveTable.scala| 2 +- .../spark/sql/hive/ErrorPositionSuite.scala | 2 +- .../spark/sql/hive/ShowCreateTableSuite.scala | 4 +- .../apache/spark/sql/hive/StatisticsSuite.scala | 2 +- .../hive/execution/ConcurrentHiveSuite.scala| 6 +-- 29 files changed, 129 insertions(+), 168 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0f61d6ef/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 3aacce7..2e85e36 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -402,7 +402,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { Project(inputDataCols ++ inputPartCols, df.logicalPlan) }.getOrElse(df.logicalPlan) -df.sparkSession.executePlan( +df.sparkSession.sessionState.executePlan( InsertIntoTable( UnresolvedRelation(tableIdent), partitions.getOrElse(Map.empty[String, Option[String]]), 
@@ -524,7 +524,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { mode, extraOptions.toMap, df.logicalPlan) -df.sparkSession.executePlan(cmd).toRdd +df.sparkSession.sessionState.executePlan(cmd).toRdd } } http://git-wip-us.apache.org/repos/asf/spark/blob/0f61d6ef/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index e5140fc..961ae32 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -58,7 +58,7 @@ private[sql] object Dataset { } def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame = { -val qe = sparkSession.executePlan(logicalPlan) +val qe = sparkSession.sessio
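The motivation stated above ("private[sql] doesn't apply in Java") comes from how Scala compiles qualified access modifiers: a private[pkg] member is emitted as public bytecode, so Java callers can still reach it even though Scala code outside the package cannot. A tiny self-contained illustration with invented names:

    // A qualified-private member compiles to public bytecode, so Java code can call it
    // even though Scala code outside org.example.api cannot. Names are invented.
    package org.example.api {
      class Service {
        private[api] def internalHelper(): String = "visible to Java callers"
        def publicEntryPoint(): String = internalHelper()
      }
    }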
spark git commit: [SPARK-15520][SQL] Also set sparkContext confs when using SparkSession builder in pyspark
Repository: spark Updated Branches: refs/heads/branch-2.0 69b3e9cee -> eb0c49799 [SPARK-15520][SQL] Also set sparkContext confs when using SparkSession builder in pyspark ## What changes were proposed in this pull request? Also sets confs in the underlying sc when using SparkSession.builder.getOrCreate(). This is a bug-fix from a post-merge comment in https://github.com/apache/spark/pull/13289 ## How was this patch tested? Python doc-tests. Author: Eric Liang Closes #13309 from ericl/spark-15520-1. (cherry picked from commit 594a1bf200fea8d6bcf25839a49186f66f922bc8) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eb0c4979 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eb0c4979 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eb0c4979 Branch: refs/heads/branch-2.0 Commit: eb0c49799880acf2c35b95984b17fcb0ad7b5eca Parents: 69b3e9c Author: Eric Liang Authored: Thu May 26 12:05:47 2016 -0700 Committer: Andrew Or Committed: Thu May 26 12:05:55 2016 -0700 -- python/pyspark/sql/session.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/eb0c4979/python/pyspark/sql/session.py -- diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 52e7f3d..8f7dcb5 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -144,7 +144,7 @@ class SparkSession(object): default. >>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate() ->>> s1.conf.get("k1") == "v1" +>>> s1.conf.get("k1") == s1.sparkContext.getConf().get("k1") == "v1" True In case an existing SparkSession is returned, the config options specified @@ -168,6 +168,8 @@ class SparkSession(object): session = SparkSession(sc) for key, value in self._options.items(): session.conf.set(key, value) +for key, value in self._options.items(): +session.sparkContext._conf.set(key, value) return session builder = Builder() - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15520][SQL] Also set sparkContext confs when using SparkSession builder in pyspark
Repository: spark Updated Branches: refs/heads/master 2b1ac6cea -> 594a1bf20 [SPARK-15520][SQL] Also set sparkContext confs when using SparkSession builder in pyspark ## What changes were proposed in this pull request? Also sets confs in the underlying sc when using SparkSession.builder.getOrCreate(). This is a bug-fix from a post-merge comment in https://github.com/apache/spark/pull/13289 ## How was this patch tested? Python doc-tests. Author: Eric Liang Closes #13309 from ericl/spark-15520-1. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/594a1bf2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/594a1bf2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/594a1bf2 Branch: refs/heads/master Commit: 594a1bf200fea8d6bcf25839a49186f66f922bc8 Parents: 2b1ac6c Author: Eric Liang Authored: Thu May 26 12:05:47 2016 -0700 Committer: Andrew Or Committed: Thu May 26 12:05:47 2016 -0700 -- python/pyspark/sql/session.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/594a1bf2/python/pyspark/sql/session.py -- diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 52e7f3d..8f7dcb5 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -144,7 +144,7 @@ class SparkSession(object): default. >>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate() ->>> s1.conf.get("k1") == "v1" +>>> s1.conf.get("k1") == s1.sparkContext.getConf().get("k1") == "v1" True In case an existing SparkSession is returned, the config options specified @@ -168,6 +168,8 @@ class SparkSession(object): session = SparkSession(sc) for key, value in self._options.items(): session.conf.set(key, value) +for key, value in self._options.items(): +session.sparkContext._conf.set(key, value) return session builder = Builder() - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
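Expressed in Scala for illustration (the patch itself is Python-only), the invariant the fix establishes is that options passed to the builder are visible both on the session conf and on the underlying SparkContext conf; this sketch assumes a fresh session and context:

    import org.apache.spark.sql.SparkSession

    val s1 = SparkSession.builder().master("local[*]").config("k1", "v1").getOrCreate()
    // Both the session conf and the SparkContext conf should report the builder option.
    assert(s1.conf.get("k1") == "v1")
    assert(s1.sparkContext.getConf.get("k1") == "v1")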
spark git commit: [SPARK-15539][SQL] DROP TABLE throw exception if table doesn't exist
Repository: spark Updated Branches: refs/heads/branch-2.0 0cb69a918 -> 69b3e9cee [SPARK-15539][SQL] DROP TABLE throw exception if table doesn't exist ## What changes were proposed in this pull request? Same as #13302, but for DROP TABLE. ## How was this patch tested? `DDLSuite` Author: Andrew Or Closes #13307 from andrewor14/drop-table. (cherry picked from commit 2b1ac6cea882246ef0e655bb2c134ef1656a5068) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69b3e9ce Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69b3e9ce Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69b3e9ce Branch: refs/heads/branch-2.0 Commit: 69b3e9ceeaba8b1b836e93c0164566ed2e25a84e Parents: 0cb69a9 Author: Andrew Or Authored: Thu May 26 12:04:18 2016 -0700 Committer: Andrew Or Committed: Thu May 26 12:04:26 2016 -0700 -- .../spark/sql/execution/command/ddl.scala | 4 +- .../spark/sql/execution/command/DDLSuite.scala | 10 ++-- .../hive/execution/HiveCompatibilitySuite.scala | 52 ++-- .../HiveWindowFunctionQuerySuite.scala | 10 ++-- .../sql/hive/MetastoreDataSourcesSuite.scala| 2 +- .../spark/sql/hive/QueryPartitionSuite.scala| 4 +- 6 files changed, 42 insertions(+), 40 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/69b3e9ce/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index dd3f17d..ffea628 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -185,7 +185,7 @@ case class DropTableCommand( if (!catalog.tableExists(tableName)) { if (!ifExists) { val objectName = if (isView) "View" else "Table" -logError(s"$objectName '${tableName.quotedString}' does not exist") +throw new AnalysisException(s"$objectName to drop '$tableName' does not exist") } } else { // If the command DROP VIEW is to drop a table or DROP TABLE is to drop a view @@ -202,7 +202,7 @@ case class DropTableCommand( try { sparkSession.cacheManager.tryUncacheQuery(sparkSession.table(tableName.quotedString)) } catch { -case NonFatal(e) => log.warn(s"${e.getMessage}", e) +case NonFatal(e) => log.warn(e.toString, e) } catalog.invalidateTable(tableName) catalog.dropTable(tableName, ifExists) http://git-wip-us.apache.org/repos/asf/spark/blob/69b3e9ce/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 64f5a4a..bddd3f2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -741,14 +741,12 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { sql("DROP TABLE dbx.tab1") assert(catalog.listTables("dbx") == Nil) sql("DROP TABLE IF EXISTS dbx.tab1") -// no exception will be thrown -sql("DROP TABLE dbx.tab1") +intercept[AnalysisException] { + sql("DROP TABLE dbx.tab1") +} } - test("drop view in SQLContext") { -// SQLContext does not support create view. 
Log an error message, if tab1 does not exists -sql("DROP VIEW tab1") - + test("drop view") { val catalog = spark.sessionState.catalog val tableIdent = TableIdentifier("tab1", Some("dbx")) createDatabase(catalog, "dbx") http://git-wip-us.apache.org/repos/asf/spark/blob/69b3e9ce/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala -- diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index a8645f7..2d5a970 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/
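Seen from the SQL interface, the change is that a bare DROP TABLE on a missing table is now an error rather than a logged message, while the IF EXISTS form keeps succeeding silently, as the updated DDLSuite test asserts. A hedged PySpark sketch; the table name and local master are placeholders, and the exact message text may differ slightly from what is printed here.

```python
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Before this patch, dropping a missing table only logged an error;
# after it, the command fails with an AnalysisException.
try:
    spark.sql("DROP TABLE tab1")
except AnalysisException as e:
    print(e)  # roughly: "Table to drop 'tab1' does not exist"

# The IF EXISTS form is unchanged and still returns silently.
spark.sql("DROP TABLE IF EXISTS tab1")

spark.stop()
```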
spark git commit: [SPARK-15539][SQL] DROP TABLE throw exception if table doesn't exist
Repository: spark Updated Branches: refs/heads/master 01b350a4f -> 2b1ac6cea [SPARK-15539][SQL] DROP TABLE throw exception if table doesn't exist ## What changes were proposed in this pull request? Same as #13302, but for DROP TABLE. ## How was this patch tested? `DDLSuite` Author: Andrew Or Closes #13307 from andrewor14/drop-table. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2b1ac6ce Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2b1ac6ce Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2b1ac6ce Branch: refs/heads/master Commit: 2b1ac6cea882246ef0e655bb2c134ef1656a5068 Parents: 01b350a Author: Andrew Or Authored: Thu May 26 12:04:18 2016 -0700 Committer: Andrew Or Committed: Thu May 26 12:04:18 2016 -0700 -- .../spark/sql/execution/command/ddl.scala | 4 +- .../spark/sql/execution/command/DDLSuite.scala | 10 ++-- .../hive/execution/HiveCompatibilitySuite.scala | 52 ++-- .../HiveWindowFunctionQuerySuite.scala | 10 ++-- .../sql/hive/MetastoreDataSourcesSuite.scala| 2 +- .../spark/sql/hive/QueryPartitionSuite.scala| 4 +- 6 files changed, 42 insertions(+), 40 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2b1ac6ce/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index dd3f17d..ffea628 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -185,7 +185,7 @@ case class DropTableCommand( if (!catalog.tableExists(tableName)) { if (!ifExists) { val objectName = if (isView) "View" else "Table" -logError(s"$objectName '${tableName.quotedString}' does not exist") +throw new AnalysisException(s"$objectName to drop '$tableName' does not exist") } } else { // If the command DROP VIEW is to drop a table or DROP TABLE is to drop a view @@ -202,7 +202,7 @@ case class DropTableCommand( try { sparkSession.cacheManager.tryUncacheQuery(sparkSession.table(tableName.quotedString)) } catch { -case NonFatal(e) => log.warn(s"${e.getMessage}", e) +case NonFatal(e) => log.warn(e.toString, e) } catalog.invalidateTable(tableName) catalog.dropTable(tableName, ifExists) http://git-wip-us.apache.org/repos/asf/spark/blob/2b1ac6ce/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 64f5a4a..bddd3f2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -741,14 +741,12 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { sql("DROP TABLE dbx.tab1") assert(catalog.listTables("dbx") == Nil) sql("DROP TABLE IF EXISTS dbx.tab1") -// no exception will be thrown -sql("DROP TABLE dbx.tab1") +intercept[AnalysisException] { + sql("DROP TABLE dbx.tab1") +} } - test("drop view in SQLContext") { -// SQLContext does not support create view. 
Log an error message, if tab1 does not exists -sql("DROP VIEW tab1") - + test("drop view") { val catalog = spark.sessionState.catalog val tableIdent = TableIdentifier("tab1", Some("dbx")) createDatabase(catalog, "dbx") http://git-wip-us.apache.org/repos/asf/spark/blob/2b1ac6ce/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala -- diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index a8645f7..2d5a970 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -515,7 +515,33 @@ cla
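Because DROP VIEW is routed through the same DropTableCommand (with objectName switched to "View"), a missing view now fails in the same way in this cherry-pick's master counterpart. A short sketch of that side of the change, under the same placeholder assumptions as above:

```python
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

spark = SparkSession.builder.master("local[1]").getOrCreate()

# DROP VIEW takes the same code path as DROP TABLE (isView = true),
# so a missing view now raises instead of merely logging.
try:
    spark.sql("DROP VIEW no_such_view")
except AnalysisException as e:
    print(e)  # roughly: "View to drop 'no_such_view' does not exist"

# As with tables, the IF EXISTS form stays silent.
spark.sql("DROP VIEW IF EXISTS no_such_view")

spark.stop()
```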
spark git commit: [SPARK-15534][SPARK-15535][SQL] Truncate table fixes
Repository: spark Updated Branches: refs/heads/master 589cce93c -> ee682fe29 [SPARK-15534][SPARK-15535][SQL] Truncate table fixes ## What changes were proposed in this pull request? Two changes: - When things fail, `TRUNCATE TABLE` just returns nothing. Instead, we should throw exceptions. - Remove `TRUNCATE TABLE ... COLUMN`, which was never supported by either Spark or Hive. ## How was this patch tested? Jenkins. Author: Andrew Or Closes #13302 from andrewor14/truncate-table. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ee682fe2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ee682fe2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ee682fe2 Branch: refs/heads/master Commit: ee682fe293b47988056b540ee46ca49861309982 Parents: 589cce9 Author: Andrew Or Authored: Wed May 25 15:08:39 2016 -0700 Committer: Andrew Or Committed: Wed May 25 15:08:39 2016 -0700 -- .../org/apache/spark/sql/catalyst/parser/SqlBase.g4 | 3 +-- .../org/apache/spark/sql/execution/SparkSqlParser.scala | 7 +-- .../org/apache/spark/sql/execution/command/tables.scala | 7 --- .../spark/sql/hive/execution/HiveCommandSuite.scala | 12 4 files changed, 6 insertions(+), 23 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ee682fe2/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 403191a..b0e71c7 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -115,8 +115,7 @@ statement | CLEAR CACHE #clearCache | LOAD DATA LOCAL? INPATH path=STRING OVERWRITE? INTO TABLE tableIdentifier partitionSpec? #loadData -| TRUNCATE TABLE tableIdentifier partitionSpec? -(COLUMNS identifierList)? #truncateTable +| TRUNCATE TABLE tableIdentifier partitionSpec? #truncateTable | op=(ADD | LIST) identifier .*? #manageResource | SET ROLE .*? #failNativeCommand | SET .*? #setConfiguration http://git-wip-us.apache.org/repos/asf/spark/blob/ee682fe2/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 57f534c..cfebfc6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -368,17 +368,12 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { * For example: * {{{ * TRUNCATE TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)] - * [COLUMNS (col1, col2)] * }}} */ override def visitTruncateTable(ctx: TruncateTableContext): LogicalPlan = withOrigin(ctx) { -if (ctx.identifierList != null) { - throw operationNotAllowed("TRUNCATE TABLE ... 
COLUMNS", ctx) -} TruncateTableCommand( visitTableIdentifier(ctx.tableIdentifier), - Option(ctx.partitionSpec).map(visitNonOptionalPartitionSpec) -) + Option(ctx.partitionSpec).map(visitNonOptionalPartitionSpec)) } /** http://git-wip-us.apache.org/repos/asf/spark/blob/ee682fe2/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 13e63a1..bef4c92 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -278,7 +278,7 @@ case class LoadDataCommand( * * The syntax of this command is: * {{{ - * TRUNCATE TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)] + * TRUNCATE TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)] * }}} */ case class TruncateTableCommand( @@ -288,9 +288,10 @@ case class TruncateTableCommand( ove