spark git commit: [SPARK-15436][SQL] Remove DescribeFunction and ShowFunctions

2016-05-25 Thread hvanhovell
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 1dad1a891 -> f63ba2210


[SPARK-15436][SQL] Remove DescribeFunction and ShowFunctions

## What changes were proposed in this pull request?
This patch removes the last two commands defined in the catalyst module: 
DescribeFunction and ShowFunctions. They were unnecessary since the parser 
could just generate DescribeFunctionCommand and ShowFunctionsCommand directly.
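
To make "generate the commands directly" concrete, here is a minimal assertion-style sketch (not code from this PR): SQL text for these statements should now parse straight into the executable command nodes in `org.apache.spark.sql.execution.command`. The two-argument `ShowFunctionsCommand(db, pattern)` shape and the `new SparkSqlParser(new SQLConf)` construction are assumptions mirroring the removed `ShowFunctions` node, and the snippet presumes it lives inside the `org.apache.spark.sql.execution` test package where these types are visible.

```scala
import org.apache.spark.sql.execution.SparkSqlParser
import org.apache.spark.sql.execution.command.ShowFunctionsCommand
import org.apache.spark.sql.internal.SQLConf

val parser = new SparkSqlParser(new SQLConf)

// SHOW FUNCTIONS now parses directly into ShowFunctionsCommand instead of the
// removed catalyst ShowFunctions logical plan (command shape assumed above).
assert(parser.parsePlan("SHOW FUNCTIONS") == ShowFunctionsCommand(None, None))
assert(parser.parsePlan("SHOW FUNCTIONS LIKE 'year*'") ==
  ShowFunctionsCommand(None, Some("year*")))
```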

## How was this patch tested?
Created a new SparkSqlParserSuite.

Author: Reynold Xin 

Closes #13292 from rxin/SPARK-15436.

(cherry picked from commit 4f27b8dd58a66fca7ddd4c239e02b90c34b1cebd)
Signed-off-by: Herman van Hovell 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f63ba221
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f63ba221
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f63ba221

Branch: refs/heads/branch-2.0
Commit: f63ba2210a27df1843f5007d367036dae0e0139f
Parents: 1dad1a8
Author: Reynold Xin 
Authored: Wed May 25 19:17:53 2016 +0200
Committer: Herman van Hovell 
Committed: Wed May 25 19:18:32 2016 +0200

--
 .../spark/sql/catalyst/parser/AstBuilder.scala  | 32 +
 .../sql/catalyst/plans/logical/Command.scala| 25 +++
 .../sql/catalyst/plans/logical/commands.scala   | 55 
 .../analysis/UnsupportedOperationsSuite.scala   |  9 ++-
 .../sql/catalyst/parser/PlanParserSuite.scala   | 30 +++--
 .../spark/sql/execution/SparkSqlParser.scala| 33 +-
 .../spark/sql/execution/SparkStrategies.scala   |  6 --
 .../sql/execution/SparkSqlParserSuite.scala | 68 
 .../sql/execution/command/DDLCommandSuite.scala |  2 +-
 .../sql/hive/execution/HiveComparisonTest.scala |  6 +-
 10 files changed, 145 insertions(+), 121 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f63ba221/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index a13c03a..3473fee 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -14,6 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.spark.sql.catalyst.parser
 
 import java.sql.{Date, Timestamp}
@@ -82,37 +83,6 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with 
Logging {
   protected def plan(tree: ParserRuleContext): LogicalPlan = typedVisit(tree)
 
   /**
-   * Create a plan for a SHOW FUNCTIONS command.
-   */
-  override def visitShowFunctions(ctx: ShowFunctionsContext): LogicalPlan = 
withOrigin(ctx) {
-import ctx._
-if (qualifiedName != null) {
-  val name = visitFunctionName(qualifiedName)
-  ShowFunctions(name.database, Some(name.funcName))
-} else if (pattern != null) {
-  ShowFunctions(None, Some(string(pattern)))
-} else {
-  ShowFunctions(None, None)
-}
-  }
-
-  /**
-   * Create a plan for a DESCRIBE FUNCTION command.
-   */
-  override def visitDescribeFunction(ctx: DescribeFunctionContext): 
LogicalPlan = withOrigin(ctx) {
-import ctx._
-val functionName =
-  if (describeFuncName.STRING() != null) {
-FunctionIdentifier(string(describeFuncName.STRING()), database = None)
-  } else if (describeFuncName.qualifiedName() != null) {
-visitFunctionName(describeFuncName.qualifiedName)
-  } else {
-FunctionIdentifier(describeFuncName.getText, database = None)
-  }
-DescribeFunction(functionName, EXTENDED != null)
-  }
-
-  /**
* Create a top-level plan with Common Table Expressions.
*/
   override def visitQuery(ctx: QueryContext): LogicalPlan = withOrigin(ctx) {

http://git-wip-us.apache.org/repos/asf/spark/blob/f63ba221/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala
new file mode 100644
index 000..75a5b10
--- /dev/null
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ 

spark git commit: [SPARK-15436][SQL] Remove DescribeFunction and ShowFunctions

2016-05-25 Thread hvanhovell
Repository: spark
Updated Branches:
  refs/heads/master 9082b7968 -> 4f27b8dd5


[SPARK-15436][SQL] Remove DescribeFunction and ShowFunctions

## What changes were proposed in this pull request?
This patch removes the last two commands defined in the catalyst module: 
DescribeFunction and ShowFunctions. They were unnecessary since the parser 
could just generate DescribeFunctionCommand and ShowFunctionsCommand directly.

## How was this patch tested?
Created a new SparkSqlParserSuite.

Author: Reynold Xin 

Closes #13292 from rxin/SPARK-15436.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4f27b8dd
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4f27b8dd
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4f27b8dd

Branch: refs/heads/master
Commit: 4f27b8dd58a66fca7ddd4c239e02b90c34b1cebd
Parents: 9082b79
Author: Reynold Xin 
Authored: Wed May 25 19:17:53 2016 +0200
Committer: Herman van Hovell 
Committed: Wed May 25 19:17:53 2016 +0200

--
 .../spark/sql/catalyst/parser/AstBuilder.scala  | 32 +
 .../sql/catalyst/plans/logical/Command.scala| 25 +++
 .../sql/catalyst/plans/logical/commands.scala   | 55 
 .../analysis/UnsupportedOperationsSuite.scala   |  9 ++-
 .../sql/catalyst/parser/PlanParserSuite.scala   | 30 +++--
 .../spark/sql/execution/SparkSqlParser.scala| 33 +-
 .../spark/sql/execution/SparkStrategies.scala   |  6 --
 .../sql/execution/SparkSqlParserSuite.scala | 68 
 .../sql/execution/command/DDLCommandSuite.scala |  2 +-
 .../sql/hive/execution/HiveComparisonTest.scala |  6 +-
 10 files changed, 145 insertions(+), 121 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4f27b8dd/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index a13c03a..3473fee 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -14,6 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.spark.sql.catalyst.parser
 
 import java.sql.{Date, Timestamp}
@@ -82,37 +83,6 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with 
Logging {
   protected def plan(tree: ParserRuleContext): LogicalPlan = typedVisit(tree)
 
   /**
-   * Create a plan for a SHOW FUNCTIONS command.
-   */
-  override def visitShowFunctions(ctx: ShowFunctionsContext): LogicalPlan = 
withOrigin(ctx) {
-import ctx._
-if (qualifiedName != null) {
-  val name = visitFunctionName(qualifiedName)
-  ShowFunctions(name.database, Some(name.funcName))
-} else if (pattern != null) {
-  ShowFunctions(None, Some(string(pattern)))
-} else {
-  ShowFunctions(None, None)
-}
-  }
-
-  /**
-   * Create a plan for a DESCRIBE FUNCTION command.
-   */
-  override def visitDescribeFunction(ctx: DescribeFunctionContext): 
LogicalPlan = withOrigin(ctx) {
-import ctx._
-val functionName =
-  if (describeFuncName.STRING() != null) {
-FunctionIdentifier(string(describeFuncName.STRING()), database = None)
-  } else if (describeFuncName.qualifiedName() != null) {
-visitFunctionName(describeFuncName.qualifiedName)
-  } else {
-FunctionIdentifier(describeFuncName.getText, database = None)
-  }
-DescribeFunction(functionName, EXTENDED != null)
-  }
-
-  /**
* Create a top-level plan with Common Table Expressions.
*/
   override def visitQuery(ctx: QueryContext): LogicalPlan = withOrigin(ctx) {

http://git-wip-us.apache.org/repos/asf/spark/blob/4f27b8dd/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala
new file mode 100644
index 000..75a5b10
--- /dev/null
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use t

spark git commit: [SPARK-9044] Fix "Storage" tab in UI so that it reflects RDD name change.

2016-05-25 Thread zsxwing
Repository: spark
Updated Branches:
  refs/heads/master 4f27b8dd5 -> b120fba6a


[SPARK-9044] Fix "Storage" tab in UI so that it reflects RDD name change.

## What changes were proposed in this pull request?

1. Make the 'name' field of RDDInfo mutable.
2. In StorageListener, detect that an RDD's name has changed and update it in
RDDInfo (see the sketch after this list).
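
A standalone sketch of why the one-line `StorageListener` change works, using plain Scala collections rather than Spark classes: `getOrElseUpdate` returns the already-stored entry when the key is present (or inserts and returns the new one), so the follow-up assignment refreshes the name either way.

```scala
import scala.collection.mutable

// Stand-in for RDDInfo: only the id and a mutable name matter here.
final case class Info(id: Int, var name: String)

val infoMap = mutable.HashMap[Int, Info]()

// Mirrors the patched onStageSubmitted: insert if absent, then refresh the name.
def onStageSubmitted(rddInfos: Seq[Info]): Unit =
  rddInfos.foreach { info => infoMap.getOrElseUpdate(info.id, info).name = info.name }

onStageSubmitted(Seq(Info(0, "original_name")))
onStageSubmitted(Seq(Info(0, "new_name")))  // same RDD id, renamed
println(infoMap(0).name)                    // prints "new_name"
```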

## How was this patch tested?

1. Manual verification - the 'Storage' tab now behaves as expected.
2. The commit also contains a new unit test which verifies this.

Author: Lukasz 

Closes #13264 from lgieron/SPARK-9044.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b120fba6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b120fba6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b120fba6

Branch: refs/heads/master
Commit: b120fba6ae26186b3fa0dfbb1637046f4e76c2b0
Parents: 4f27b8d
Author: Lukasz 
Authored: Wed May 25 10:24:21 2016 -0700
Committer: Shixiong Zhu 
Committed: Wed May 25 10:24:21 2016 -0700

--
 .../scala/org/apache/spark/storage/RDDInfo.scala   |  2 +-
 .../org/apache/spark/ui/storage/StorageTab.scala   |  2 +-
 .../apache/spark/ui/storage/StorageTabSuite.scala  | 17 +
 3 files changed, 19 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b120fba6/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala
--
diff --git a/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala 
b/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala
index 083d78b..e5abbf7 100644
--- a/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala
+++ b/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala
@@ -24,7 +24,7 @@ import org.apache.spark.util.Utils
 @DeveloperApi
 class RDDInfo(
 val id: Int,
-val name: String,
+var name: String,
 val numPartitions: Int,
 var storageLevel: StorageLevel,
 val parentIds: Seq[Int],

http://git-wip-us.apache.org/repos/asf/spark/blob/b120fba6/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala
--
diff --git a/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala 
b/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala
index 5009583..c212362 100644
--- a/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala
+++ b/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala
@@ -59,7 +59,7 @@ class StorageListener(storageStatusListener: 
StorageStatusListener) extends Bloc
 
   override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): 
Unit = synchronized {
 val rddInfos = stageSubmitted.stageInfo.rddInfos
-rddInfos.foreach { info => _rddInfoMap.getOrElseUpdate(info.id, info) }
+rddInfos.foreach { info => _rddInfoMap.getOrElseUpdate(info.id, info).name 
= info.name }
   }
 
   override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): 
Unit = synchronized {

http://git-wip-us.apache.org/repos/asf/spark/blob/b120fba6/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala
--
diff --git 
a/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala 
b/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala
index 7d77dee..411a0dd 100644
--- a/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala
@@ -179,6 +179,23 @@ class StorageTabSuite extends SparkFunSuite with 
BeforeAndAfter {
 assert(storageListener.rddInfoList.size === 2)
   }
 
+  test("verify StorageTab still contains a renamed RDD") {
+val rddInfo = new RDDInfo(0, "original_name", 1, memOnly, Seq(4))
+val stageInfo0 = new StageInfo(0, 0, "stage0", 1, Seq(rddInfo), Seq.empty, 
"details")
+bus.postToAll(SparkListenerBlockManagerAdded(1L, bm1, 1000L))
+bus.postToAll(SparkListenerStageSubmitted(stageInfo0))
+val blockUpdateInfos1 = Seq(BlockUpdatedInfo(bm1, RDDBlockId(0, 1), 
memOnly, 100L, 0L))
+postUpdateBlocks(bus, blockUpdateInfos1)
+assert(storageListener.rddInfoList.size == 1)
+
+val newName = "new_name"
+val rddInfoRenamed = new RDDInfo(0, newName, 1, memOnly, Seq(4))
+val stageInfo1 = new StageInfo(1, 0, "stage1", 1, Seq(rddInfoRenamed), 
Seq.empty, "details")
+bus.postToAll(SparkListenerStageSubmitted(stageInfo1))
+assert(storageListener.rddInfoList.size == 1)
+assert(storageListener.rddInfoList.head.name == newName)
+  }
+
   private def postUpdateBlocks(
   bus: SparkListenerBus, blockUpdateInfos: Seq[BlockUpdatedInfo]): Unit = {
 blockUpdateInfos.foreach { blockUpdateInfo =>


---

spark git commit: [SPARK-9044] Fix "Storage" tab in UI so that it reflects RDD name change.

2016-05-25 Thread zsxwing
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 f63ba2210 -> 69327667d


[SPARK-9044] Fix "Storage" tab in UI so that it reflects RDD name change.

## What changes were proposed in this pull request?

1. Make the 'name' field of RDDInfo mutable.
2. In StorageListener, detect that an RDD's name has changed and update it in
RDDInfo.

## How was this patch tested?

1. Manual verification - the 'Storage' tab now behaves as expected.
2. The commit also contains a new unit test which verifies this.

Author: Lukasz 

Closes #13264 from lgieron/SPARK-9044.

(cherry picked from commit b120fba6ae26186b3fa0dfbb1637046f4e76c2b0)
Signed-off-by: Shixiong Zhu 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69327667
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69327667
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69327667

Branch: refs/heads/branch-2.0
Commit: 69327667d5a14b12de8055d752fbe3abb8d6671c
Parents: f63ba22
Author: Lukasz 
Authored: Wed May 25 10:24:21 2016 -0700
Committer: Shixiong Zhu 
Committed: Wed May 25 10:24:28 2016 -0700

--
 .../scala/org/apache/spark/storage/RDDInfo.scala   |  2 +-
 .../org/apache/spark/ui/storage/StorageTab.scala   |  2 +-
 .../apache/spark/ui/storage/StorageTabSuite.scala  | 17 +
 3 files changed, 19 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/69327667/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala
--
diff --git a/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala 
b/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala
index 083d78b..e5abbf7 100644
--- a/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala
+++ b/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala
@@ -24,7 +24,7 @@ import org.apache.spark.util.Utils
 @DeveloperApi
 class RDDInfo(
 val id: Int,
-val name: String,
+var name: String,
 val numPartitions: Int,
 var storageLevel: StorageLevel,
 val parentIds: Seq[Int],

http://git-wip-us.apache.org/repos/asf/spark/blob/69327667/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala
--
diff --git a/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala 
b/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala
index 5009583..c212362 100644
--- a/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala
+++ b/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala
@@ -59,7 +59,7 @@ class StorageListener(storageStatusListener: 
StorageStatusListener) extends Bloc
 
   override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): 
Unit = synchronized {
 val rddInfos = stageSubmitted.stageInfo.rddInfos
-rddInfos.foreach { info => _rddInfoMap.getOrElseUpdate(info.id, info) }
+rddInfos.foreach { info => _rddInfoMap.getOrElseUpdate(info.id, info).name 
= info.name }
   }
 
   override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): 
Unit = synchronized {

http://git-wip-us.apache.org/repos/asf/spark/blob/69327667/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala
--
diff --git 
a/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala 
b/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala
index 7d77dee..411a0dd 100644
--- a/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala
@@ -179,6 +179,23 @@ class StorageTabSuite extends SparkFunSuite with 
BeforeAndAfter {
 assert(storageListener.rddInfoList.size === 2)
   }
 
+  test("verify StorageTab still contains a renamed RDD") {
+val rddInfo = new RDDInfo(0, "original_name", 1, memOnly, Seq(4))
+val stageInfo0 = new StageInfo(0, 0, "stage0", 1, Seq(rddInfo), Seq.empty, 
"details")
+bus.postToAll(SparkListenerBlockManagerAdded(1L, bm1, 1000L))
+bus.postToAll(SparkListenerStageSubmitted(stageInfo0))
+val blockUpdateInfos1 = Seq(BlockUpdatedInfo(bm1, RDDBlockId(0, 1), 
memOnly, 100L, 0L))
+postUpdateBlocks(bus, blockUpdateInfos1)
+assert(storageListener.rddInfoList.size == 1)
+
+val newName = "new_name"
+val rddInfoRenamed = new RDDInfo(0, newName, 1, memOnly, Seq(4))
+val stageInfo1 = new StageInfo(1, 0, "stage1", 1, Seq(rddInfoRenamed), 
Seq.empty, "details")
+bus.postToAll(SparkListenerStageSubmitted(stageInfo1))
+assert(storageListener.rddInfoList.size == 1)
+assert(storageListener.rddInfoList.head.name == newName)
+  }
+
   private def postUpdateBlocks(
   bus: SparkListenerBus, 

spark git commit: [SPARK-15345][SQL][PYSPARK] SparkSession's conf doesn't take effect when there is already an existing SparkContext

2016-05-25 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/master b120fba6a -> 01e7b9c85


[SPARK-15345][SQL][PYSPARK] SparkSession's conf doesn't take effect when there is
already an existing SparkContext

## What changes were proposed in this pull request?

Override the existing SparkContext if the provided SparkConf is different. The
PySpark part hasn't been fixed yet; it will be done after the first round of
review, to ensure this is the correct approach.
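
To make the scenario concrete, a hedged usage sketch (the `spark.some.option` key is made up for illustration; this is not code from the PR):

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

// A SparkContext already exists before the SparkSession is built.
val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("pre-existing"))

// The builder supplies an extra setting on top of the existing context.
val spark = SparkSession.builder()
  .config("spark.some.option", "some-value")  // hypothetical key
  .getOrCreate()

// With this patch the option is also copied onto the existing context's conf
// (and a warning is logged, since settings read only at startup still cannot
// change); previously the existing SparkContext's conf was left untouched.
println(sc.getConf.get("spark.some.option"))  // expected: some-value
println(spark.conf.get("spark.some.option"))  // some-value
```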

## How was this patch tested?

Manually verified it in spark-shell.

rxin Please help review it; I think this is a very critical issue for Spark 2.0.

Author: Jeff Zhang 

Closes #13160 from zjffdu/SPARK-15345.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/01e7b9c8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/01e7b9c8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/01e7b9c8

Branch: refs/heads/master
Commit: 01e7b9c85bb84924e279021f9748774dce9702c8
Parents: b120fba
Author: Jeff Zhang 
Authored: Wed May 25 10:46:51 2016 -0700
Committer: Andrew Or 
Committed: Wed May 25 10:46:51 2016 -0700

--
 .../main/scala/org/apache/spark/SparkContext.scala|  3 +++
 .../scala/org/apache/spark/sql/SparkSession.scala | 14 --
 .../apache/spark/sql/SparkSessionBuilderSuite.scala   | 14 +-
 3 files changed, 28 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/01e7b9c8/core/src/main/scala/org/apache/spark/SparkContext.scala
--
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala 
b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 36aa3be..5018eb3 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -2254,6 +2254,9 @@ object SparkContext extends Logging {
   if (activeContext.get() == null) {
 setActiveContext(new SparkContext(config), allowMultipleContexts = 
false)
   }
+  if (config.getAll.nonEmpty) {
+logWarning("Use an existing SparkContext, some configuration may not 
take effect.")
+  }
   activeContext.get()
 }
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/01e7b9c8/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
index 5c87c84..86c97b9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
@@ -636,7 +636,7 @@ object SparkSession {
   /**
* Builder for [[SparkSession]].
*/
-  class Builder {
+  class Builder extends Logging {
 
 private[this] val options = new scala.collection.mutable.HashMap[String, 
String]
 
@@ -753,6 +753,9 @@ object SparkSession {
   var session = activeThreadSession.get()
   if ((session ne null) && !session.sparkContext.isStopped) {
 options.foreach { case (k, v) => session.conf.set(k, v) }
+if (options.nonEmpty) {
+  logWarning("Use an existing SparkSession, some configuration may not 
take effect.")
+}
 return session
   }
 
@@ -762,6 +765,9 @@ object SparkSession {
 session = defaultSession.get()
 if ((session ne null) && !session.sparkContext.isStopped) {
   options.foreach { case (k, v) => session.conf.set(k, v) }
+  if (options.nonEmpty) {
+logWarning("Use an existing SparkSession, some configuration may 
not take effect.")
+  }
   return session
 }
 
@@ -774,7 +780,11 @@ object SparkSession {
 
   val sparkConf = new SparkConf()
   options.foreach { case (k, v) => sparkConf.set(k, v) }
-  SparkContext.getOrCreate(sparkConf)
+  val sc = SparkContext.getOrCreate(sparkConf)
+  // maybe this is an existing SparkContext, update its SparkConf 
which maybe used
+  // by SparkSession
+  options.foreach { case (k, v) => sc.conf.set(k, v) }
+  sc
 }
 session = new SparkSession(sparkContext)
 options.foreach { case (k, v) => session.conf.set(k, v) }

http://git-wip-us.apache.org/repos/asf/spark/blob/01e7b9c8/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala
--
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala
index ec6a2b3..786956d 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSu

spark git commit: [SPARK-15345][SQL][PYSPARK] SparkSession's conf doesn't take effect when there is already an existing SparkContext

2016-05-25 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 69327667d -> 27f26a39d


[SPARK-15345][SQL][PYSPARK] SparkSession's conf doesn't take effect when there is
already an existing SparkContext

## What changes were proposed in this pull request?

Override the existing SparkContext if the provided SparkConf is different. The
PySpark part hasn't been fixed yet; it will be done after the first round of
review, to ensure this is the correct approach.

## How was this patch tested?

Manually verified it in spark-shell.

rxin Please help review it; I think this is a very critical issue for Spark 2.0.

Author: Jeff Zhang 

Closes #13160 from zjffdu/SPARK-15345.

(cherry picked from commit 01e7b9c85bb84924e279021f9748774dce9702c8)
Signed-off-by: Andrew Or 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/27f26a39
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/27f26a39
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/27f26a39

Branch: refs/heads/branch-2.0
Commit: 27f26a39db021735bcf75a1f1b89b9481b199341
Parents: 6932766
Author: Jeff Zhang 
Authored: Wed May 25 10:46:51 2016 -0700
Committer: Andrew Or 
Committed: Wed May 25 10:47:00 2016 -0700

--
 .../main/scala/org/apache/spark/SparkContext.scala|  3 +++
 .../scala/org/apache/spark/sql/SparkSession.scala | 14 --
 .../apache/spark/sql/SparkSessionBuilderSuite.scala   | 14 +-
 3 files changed, 28 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/27f26a39/core/src/main/scala/org/apache/spark/SparkContext.scala
--
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala 
b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 36aa3be..5018eb3 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -2254,6 +2254,9 @@ object SparkContext extends Logging {
   if (activeContext.get() == null) {
 setActiveContext(new SparkContext(config), allowMultipleContexts = 
false)
   }
+  if (config.getAll.nonEmpty) {
+logWarning("Use an existing SparkContext, some configuration may not 
take effect.")
+  }
   activeContext.get()
 }
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/27f26a39/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
index 5c87c84..86c97b9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
@@ -636,7 +636,7 @@ object SparkSession {
   /**
* Builder for [[SparkSession]].
*/
-  class Builder {
+  class Builder extends Logging {
 
 private[this] val options = new scala.collection.mutable.HashMap[String, 
String]
 
@@ -753,6 +753,9 @@ object SparkSession {
   var session = activeThreadSession.get()
   if ((session ne null) && !session.sparkContext.isStopped) {
 options.foreach { case (k, v) => session.conf.set(k, v) }
+if (options.nonEmpty) {
+  logWarning("Use an existing SparkSession, some configuration may not 
take effect.")
+}
 return session
   }
 
@@ -762,6 +765,9 @@ object SparkSession {
 session = defaultSession.get()
 if ((session ne null) && !session.sparkContext.isStopped) {
   options.foreach { case (k, v) => session.conf.set(k, v) }
+  if (options.nonEmpty) {
+logWarning("Use an existing SparkSession, some configuration may 
not take effect.")
+  }
   return session
 }
 
@@ -774,7 +780,11 @@ object SparkSession {
 
   val sparkConf = new SparkConf()
   options.foreach { case (k, v) => sparkConf.set(k, v) }
-  SparkContext.getOrCreate(sparkConf)
+  val sc = SparkContext.getOrCreate(sparkConf)
+  // maybe this is an existing SparkContext, update its SparkConf 
which maybe used
+  // by SparkSession
+  options.foreach { case (k, v) => sc.conf.set(k, v) }
+  sc
 }
 session = new SparkSession(sparkContext)
 options.foreach { case (k, v) => session.conf.set(k, v) }

http://git-wip-us.apache.org/repos/asf/spark/blob/27f26a39/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala
--
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala

spark git commit: [SPARK-15520][SQL] SparkSession builder in python should also allow overriding confs of existing sessions

2016-05-25 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 27f26a39d -> c75ec5eaa


[SPARK-15520][SQL] SparkSession builder in python should also allow overriding 
confs of existing sessions

## What changes were proposed in this pull request?

This fixes the python SparkSession builder to allow setting confs correctly. 
This was a leftover TODO from https://github.com/apache/spark/pull/13200.

## How was this patch tested?

Python doc tests.

cc andrewor14

Author: Eric Liang 

Closes #13289 from ericl/spark-15520.

(cherry picked from commit 8239fdcb9b54ab6d13c31ad9916b8334dd1462c2)
Signed-off-by: Andrew Or 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c75ec5ea
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c75ec5ea
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c75ec5ea

Branch: refs/heads/branch-2.0
Commit: c75ec5eaa6c95b3647c80b6595902d16ab3165fa
Parents: 27f26a39
Author: Eric Liang 
Authored: Wed May 25 10:49:11 2016 -0700
Committer: Andrew Or 
Committed: Wed May 25 10:49:19 2016 -0700

--
 python/pyspark/sql/session.py | 35 ---
 1 file changed, 24 insertions(+), 11 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c75ec5ea/python/pyspark/sql/session.py
--
diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py
index 2419475..52e7f3d 100644
--- a/python/pyspark/sql/session.py
+++ b/python/pyspark/sql/session.py
@@ -138,24 +138,37 @@ class SparkSession(object):
 """Gets an existing :class:`SparkSession` or, if there is no 
existing one, creates a
 new one based on the options set in this builder.
 
-This method first checks whether there is a valid thread-local 
SparkSession,
-and if yes, return that one. It then checks whether there is a 
valid global
-default SparkSession, and if yes, return that one. If no valid 
global default
-SparkSession exists, the method creates a new SparkSession and 
assigns the
-newly created SparkSession as the global default.
+This method first checks whether there is a valid global default 
SparkSession, and if
+yes, return that one. If no valid global default SparkSession 
exists, the method
+creates a new SparkSession and assigns the newly created 
SparkSession as the global
+default.
+
+>>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
+>>> s1.conf.get("k1") == "v1"
+True
 
 In case an existing SparkSession is returned, the config options 
specified
 in this builder will be applied to the existing SparkSession.
+
+>>> s2 = SparkSession.builder.config("k2", "v2").getOrCreate()
+>>> s1.conf.get("k1") == s2.conf.get("k1")
+True
+>>> s1.conf.get("k2") == s2.conf.get("k2")
+True
 """
 with self._lock:
-from pyspark.conf import SparkConf
 from pyspark.context import SparkContext
-from pyspark.sql.context import SQLContext
-sparkConf = SparkConf()
+from pyspark.conf import SparkConf
+session = SparkSession._instantiatedContext
+if session is None:
+sparkConf = SparkConf()
+for key, value in self._options.items():
+sparkConf.set(key, value)
+sc = SparkContext.getOrCreate(sparkConf)
+session = SparkSession(sc)
 for key, value in self._options.items():
-sparkConf.set(key, value)
-sparkContext = SparkContext.getOrCreate(sparkConf)
-return SQLContext.getOrCreate(sparkContext).sparkSession
+session.conf.set(key, value)
+return session
 
 builder = Builder()
 





spark git commit: [SPARK-15520][SQL] SparkSession builder in python should also allow overriding confs of existing sessions

2016-05-25 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/master 01e7b9c85 -> 8239fdcb9


[SPARK-15520][SQL] SparkSession builder in python should also allow overriding 
confs of existing sessions

## What changes were proposed in this pull request?

This fixes the python SparkSession builder to allow setting confs correctly. 
This was a leftover TODO from https://github.com/apache/spark/pull/13200.

## How was this patch tested?

Python doc tests.

cc andrewor14

Author: Eric Liang 

Closes #13289 from ericl/spark-15520.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8239fdcb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8239fdcb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8239fdcb

Branch: refs/heads/master
Commit: 8239fdcb9b54ab6d13c31ad9916b8334dd1462c2
Parents: 01e7b9c
Author: Eric Liang 
Authored: Wed May 25 10:49:11 2016 -0700
Committer: Andrew Or 
Committed: Wed May 25 10:49:11 2016 -0700

--
 python/pyspark/sql/session.py | 35 ---
 1 file changed, 24 insertions(+), 11 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/8239fdcb/python/pyspark/sql/session.py
--
diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py
index 2419475..52e7f3d 100644
--- a/python/pyspark/sql/session.py
+++ b/python/pyspark/sql/session.py
@@ -138,24 +138,37 @@ class SparkSession(object):
 """Gets an existing :class:`SparkSession` or, if there is no 
existing one, creates a
 new one based on the options set in this builder.
 
-This method first checks whether there is a valid thread-local 
SparkSession,
-and if yes, return that one. It then checks whether there is a 
valid global
-default SparkSession, and if yes, return that one. If no valid 
global default
-SparkSession exists, the method creates a new SparkSession and 
assigns the
-newly created SparkSession as the global default.
+This method first checks whether there is a valid global default 
SparkSession, and if
+yes, return that one. If no valid global default SparkSession 
exists, the method
+creates a new SparkSession and assigns the newly created 
SparkSession as the global
+default.
+
+>>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
+>>> s1.conf.get("k1") == "v1"
+True
 
 In case an existing SparkSession is returned, the config options 
specified
 in this builder will be applied to the existing SparkSession.
+
+>>> s2 = SparkSession.builder.config("k2", "v2").getOrCreate()
+>>> s1.conf.get("k1") == s2.conf.get("k1")
+True
+>>> s1.conf.get("k2") == s2.conf.get("k2")
+True
 """
 with self._lock:
-from pyspark.conf import SparkConf
 from pyspark.context import SparkContext
-from pyspark.sql.context import SQLContext
-sparkConf = SparkConf()
+from pyspark.conf import SparkConf
+session = SparkSession._instantiatedContext
+if session is None:
+sparkConf = SparkConf()
+for key, value in self._options.items():
+sparkConf.set(key, value)
+sc = SparkContext.getOrCreate(sparkConf)
+session = SparkSession(sc)
 for key, value in self._options.items():
-sparkConf.set(key, value)
-sparkContext = SparkContext.getOrCreate(sparkConf)
-return SQLContext.getOrCreate(sparkContext).sparkSession
+session.conf.set(key, value)
+return session
 
 builder = Builder()
 





spark git commit: [MINOR][CORE] Fix a HadoopRDD log message and remove unused imports in rdd files.

2016-05-25 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/master 8239fdcb9 -> d6d3e5071


[MINOR][CORE] Fix a HadoopRDD log message and remove unused imports in rdd 
files.

## What changes were proposed in this pull request?

This PR fixes the following typos in a log message and in comments of
`HadoopRDD.scala`. It also removes unused imports.
```scala
-  logWarning("Caching NewHadoopRDDs as deserialized objects usually leads 
to undesired" +
+  logWarning("Caching HadoopRDDs as deserialized objects usually leads to 
undesired" +
...
-  // since its not removed yet
+  // since it's not removed yet
```

## How was this patch tested?

Manual.

Author: Dongjoon Hyun 

Closes #13294 from dongjoon-hyun/minor_rdd_fix_log_message.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d6d3e507
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d6d3e507
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d6d3e507

Branch: refs/heads/master
Commit: d6d3e50719b01005aa0e77349fc9a6ff88fecce3
Parents: 8239fdc
Author: Dongjoon Hyun 
Authored: Wed May 25 10:51:33 2016 -0700
Committer: Andrew Or 
Committed: Wed May 25 10:51:33 2016 -0700

--
 core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala| 2 +-
 core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala| 5 ++---
 core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala | 1 -
 core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala | 2 +-
 core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala | 1 -
 5 files changed, 4 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d6d3e507/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala
--
diff --git a/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala 
b/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala
index be0cb17..41832e8 100644
--- a/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.rdd
 
-import org.apache.hadoop.conf.{ Configurable, Configuration }
+import org.apache.hadoop.conf.{Configurable, Configuration}
 import org.apache.hadoop.io.Writable
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.task.JobContextImpl

http://git-wip-us.apache.org/repos/asf/spark/blob/d6d3e507/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
--
diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala 
b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
index b22134a..515fd6f 100644
--- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
@@ -43,7 +43,6 @@ import org.apache.spark._
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.executor.DataReadMethod
 import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.HadoopRDD.HadoopMapPartitionsWithSplitRDD
 import org.apache.spark.scheduler.{HDFSCacheTaskLocation, HostTaskLocation}
@@ -70,7 +69,7 @@ private[spark] class HadoopPartition(rddId: Int, override val 
index: Int, s: Inp
 val envVars: Map[String, String] = if 
(inputSplit.value.isInstanceOf[FileSplit]) {
   val is: FileSplit = inputSplit.value.asInstanceOf[FileSplit]
   // map_input_file is deprecated in favor of mapreduce_map_input_file but 
set both
-  // since its not removed yet
+  // since it's not removed yet
   Map("map_input_file" -> is.getPath().toString(),
 "mapreduce_map_input_file" -> is.getPath().toString())
 } else {
@@ -335,7 +334,7 @@ class HadoopRDD[K, V](
 
   override def persist(storageLevel: StorageLevel): this.type = {
 if (storageLevel.deserialized) {
-  logWarning("Caching NewHadoopRDDs as deserialized objects usually leads 
to undesired" +
+  logWarning("Caching HadoopRDDs as deserialized objects usually leads to 
undesired" +
 " behavior because Hadoop's RecordReader reuses the same Writable 
object for all records." +
 " Use a map transformation to make copies of the records.")
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/d6d3e507/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
--
diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala 
b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
index ad7c221..189dc7b 100644
--- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
+++ b/core/src/main/scala

spark git commit: [MINOR][CORE] Fix a HadoopRDD log message and remove unused imports in rdd files.

2016-05-25 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 c75ec5eaa -> 4009ddafd


[MINOR][CORE] Fix a HadoopRDD log message and remove unused imports in rdd 
files.

## What changes were proposed in this pull request?

This PR fixes the following typos in a log message and in comments of
`HadoopRDD.scala`. It also removes unused imports.
```scala
-  logWarning("Caching NewHadoopRDDs as deserialized objects usually leads 
to undesired" +
+  logWarning("Caching HadoopRDDs as deserialized objects usually leads to 
undesired" +
...
-  // since its not removed yet
+  // since it's not removed yet
```

## How was this patch tested?

Manual.

Author: Dongjoon Hyun 

Closes #13294 from dongjoon-hyun/minor_rdd_fix_log_message.

(cherry picked from commit d6d3e50719b01005aa0e77349fc9a6ff88fecce3)
Signed-off-by: Andrew Or 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4009ddaf
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4009ddaf
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4009ddaf

Branch: refs/heads/branch-2.0
Commit: 4009ddafd810f91f699e52d7822c8c959fe7761e
Parents: c75ec5e
Author: Dongjoon Hyun 
Authored: Wed May 25 10:51:33 2016 -0700
Committer: Andrew Or 
Committed: Wed May 25 10:51:41 2016 -0700

--
 core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala| 2 +-
 core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala| 5 ++---
 core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala | 1 -
 core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala | 2 +-
 core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala | 1 -
 5 files changed, 4 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4009ddaf/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala
--
diff --git a/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala 
b/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala
index be0cb17..41832e8 100644
--- a/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.rdd
 
-import org.apache.hadoop.conf.{ Configurable, Configuration }
+import org.apache.hadoop.conf.{Configurable, Configuration}
 import org.apache.hadoop.io.Writable
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.task.JobContextImpl

http://git-wip-us.apache.org/repos/asf/spark/blob/4009ddaf/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
--
diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala 
b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
index b22134a..515fd6f 100644
--- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
@@ -43,7 +43,6 @@ import org.apache.spark._
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.executor.DataReadMethod
 import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.HadoopRDD.HadoopMapPartitionsWithSplitRDD
 import org.apache.spark.scheduler.{HDFSCacheTaskLocation, HostTaskLocation}
@@ -70,7 +69,7 @@ private[spark] class HadoopPartition(rddId: Int, override val 
index: Int, s: Inp
 val envVars: Map[String, String] = if 
(inputSplit.value.isInstanceOf[FileSplit]) {
   val is: FileSplit = inputSplit.value.asInstanceOf[FileSplit]
   // map_input_file is deprecated in favor of mapreduce_map_input_file but 
set both
-  // since its not removed yet
+  // since it's not removed yet
   Map("map_input_file" -> is.getPath().toString(),
 "mapreduce_map_input_file" -> is.getPath().toString())
 } else {
@@ -335,7 +334,7 @@ class HadoopRDD[K, V](
 
   override def persist(storageLevel: StorageLevel): this.type = {
 if (storageLevel.deserialized) {
-  logWarning("Caching NewHadoopRDDs as deserialized objects usually leads 
to undesired" +
+  logWarning("Caching HadoopRDDs as deserialized objects usually leads to 
undesired" +
 " behavior because Hadoop's RecordReader reuses the same Writable 
object for all records." +
 " Use a map transformation to make copies of the records.")
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/4009ddaf/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
--
diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala 
b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
index ad7c221..18

spark git commit: [MINOR][MLLIB][STREAMING][SQL] Fix typos

2016-05-25 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 4009ddafd -> 6fc367e50


[MINOR][MLLIB][STREAMING][SQL] Fix typos

Fixed typos in source code for the [mllib], [streaming], and [SQL] components.

No tests were added; the changes are obvious typo fixes.

Author: lfzCarlosC 

Closes #13298 from lfzCarlosC/master.

(cherry picked from commit 02c8072eea72425e89256347e1f373a3e76e6eba)
Signed-off-by: Andrew Or 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6fc367e5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6fc367e5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6fc367e5

Branch: refs/heads/branch-2.0
Commit: 6fc367e50c2de9f68237763919eae12241a2179f
Parents: 4009dda
Author: lfzCarlosC 
Authored: Wed May 25 10:53:53 2016 -0700
Committer: Andrew Or 
Committed: Wed May 25 10:54:25 2016 -0700

--
 mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala| 2 +-
 .../org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala | 2 +-
 .../spark/sql/catalyst/plans/logical/basicLogicalOperators.scala | 2 +-
 .../org/apache/spark/sql/execution/vectorized/ColumnVector.java  | 2 +-
 .../spark/sql/execution/streaming/state/StateStoreSuite.scala| 2 +-
 sql/hive-thriftserver/if/TCLIService.thrift  | 4 ++--
 .../java/org/apache/hive/service/ServiceStateChangeListener.java | 2 +-
 .../java/org/apache/hive/service/cli/operation/SQLOperation.java | 2 +-
 .../apache/hive/service/cli/session/HiveSessionHookContext.java  | 2 +-
 .../main/scala/org/apache/spark/sql/hive/HiveSessionState.scala  | 2 +-
 .../apache/spark/streaming/util/WriteAheadLogRecordHandle.java   | 2 +-
 11 files changed, 12 insertions(+), 12 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6fc367e5/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index 9457c6e..bb4b37e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -204,7 +204,7 @@ private object IDFModel {
* Transforms a term frequency (TF) vector to a TF-IDF vector with a IDF 
vector
*
* @param idf an IDF vector
-   * @param v a term frequence vector
+   * @param v a term frequency vector
* @return a TF-IDF vector
*/
   def transform(idf: Vector, v: Vector): Vector = {

http://git-wip-us.apache.org/repos/asf/spark/blob/6fc367e5/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
index 9748fbf..c3de5d7 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
@@ -45,7 +45,7 @@ import org.apache.spark.rdd.RDD
  * many elements are in each partition. Once these three values have been 
returned for every
  * partition, we can collect and operate locally. Locally, we can now adjust 
each distance by the
  * appropriate constant (the cumulative sum of number of elements in the prior 
partitions divided by
- * thedata set size). Finally, we take the maximum absolute value, and this is 
the statistic.
+ * the data set size). Finally, we take the maximum absolute value, and this 
is the statistic.
  */
 private[stat] object KolmogorovSmirnovTest extends Logging {
 

http://git-wip-us.apache.org/repos/asf/spark/blob/6fc367e5/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
index 0a9250b..8b7e21b 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
@@ -591,7 +591,7 @@ case class Expand(
   }
 
   // This operator can reuse attributes (for example making them null when 
doing a roll up) so
-  // the contraints of the child may no longer be valid.
+  // the constraints of the child may no longer be valid.
   override protected def validConstraints: Set[Expression] = 
Set.empty[Expression]
 }
 

http://git-wip-us.apache.org/repos/asf/spark/blob

spark git commit: [MINOR][MLLIB][STREAMING][SQL] Fix typos

2016-05-25 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/master d6d3e5071 -> 02c8072ee


[MINOR][MLLIB][STREAMING][SQL] Fix typos

Fixed typos in source code for the [mllib], [streaming], and [SQL] components.

No tests were added; the changes are obvious typo fixes.

Author: lfzCarlosC 

Closes #13298 from lfzCarlosC/master.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/02c8072e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/02c8072e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/02c8072e

Branch: refs/heads/master
Commit: 02c8072eea72425e89256347e1f373a3e76e6eba
Parents: d6d3e50
Author: lfzCarlosC 
Authored: Wed May 25 10:53:53 2016 -0700
Committer: Andrew Or 
Committed: Wed May 25 10:53:57 2016 -0700

--
 mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala| 2 +-
 .../org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala | 2 +-
 .../spark/sql/catalyst/plans/logical/basicLogicalOperators.scala | 2 +-
 .../org/apache/spark/sql/execution/vectorized/ColumnVector.java  | 2 +-
 .../spark/sql/execution/streaming/state/StateStoreSuite.scala| 2 +-
 sql/hive-thriftserver/if/TCLIService.thrift  | 4 ++--
 .../java/org/apache/hive/service/ServiceStateChangeListener.java | 2 +-
 .../java/org/apache/hive/service/cli/operation/SQLOperation.java | 2 +-
 .../apache/hive/service/cli/session/HiveSessionHookContext.java  | 2 +-
 .../main/scala/org/apache/spark/sql/hive/HiveSessionState.scala  | 2 +-
 .../apache/spark/streaming/util/WriteAheadLogRecordHandle.java   | 2 +-
 11 files changed, 12 insertions(+), 12 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/02c8072e/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index 9457c6e..bb4b37e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -204,7 +204,7 @@ private object IDFModel {
* Transforms a term frequency (TF) vector to a TF-IDF vector with a IDF 
vector
*
* @param idf an IDF vector
-   * @param v a term frequence vector
+   * @param v a term frequency vector
* @return a TF-IDF vector
*/
   def transform(idf: Vector, v: Vector): Vector = {

http://git-wip-us.apache.org/repos/asf/spark/blob/02c8072e/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
index 9748fbf..c3de5d7 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
@@ -45,7 +45,7 @@ import org.apache.spark.rdd.RDD
  * many elements are in each partition. Once these three values have been 
returned for every
  * partition, we can collect and operate locally. Locally, we can now adjust 
each distance by the
  * appropriate constant (the cumulative sum of number of elements in the prior 
partitions divided by
- * thedata set size). Finally, we take the maximum absolute value, and this is 
the statistic.
+ * the data set size). Finally, we take the maximum absolute value, and this 
is the statistic.
  */
 private[stat] object KolmogorovSmirnovTest extends Logging {
 

http://git-wip-us.apache.org/repos/asf/spark/blob/02c8072e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
index 0a9250b..8b7e21b 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
@@ -591,7 +591,7 @@ case class Expand(
   }
 
   // This operator can reuse attributes (for example making them null when 
doing a roll up) so
-  // the contraints of the child may no longer be valid.
+  // the constraints of the child may no longer be valid.
   override protected def validConstraints: Set[Expression] = 
Set.empty[Expression]
 }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/02c8072e/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java
---

spark git commit: [SPARK-15500][DOC][ML][PYSPARK] Remove default value in Param doc field in ALS

2016-05-25 Thread mlnick
Repository: spark
Updated Branches:
  refs/heads/master 02c8072ee -> 1cb347fbc


[SPARK-15500][DOC][ML][PYSPARK] Remove default value in Param doc field in ALS

Remove "Default: MEMORY_AND_DISK" from `Param` doc field in ALS storage level 
params. This fixes up the output of `explainParam(s)` so that default values 
are not displayed twice.
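
A rough illustration of the duplication being removed (a sketch, not from the PR; the exact rendering of `explainParam` output is approximate):

```scala
import org.apache.spark.ml.recommendation.ALS

val als = new ALS()
// explainParam already appends "(default: ...)" to the doc text, so keeping
// "Default: 'MEMORY_AND_DISK'." inside the doc string printed the default twice:
//   before: "... Cannot be 'NONE'. Default: 'MEMORY_AND_DISK'. (default: MEMORY_AND_DISK)"
//   after:  "... Cannot be 'NONE'. (default: MEMORY_AND_DISK)"
println(als.explainParam(als.intermediateStorageLevel))
```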

We can revisit in the case that 
[SPARK-15130](https://issues.apache.org/jira/browse/SPARK-15130) moves ahead 
with adding defaults in some way to PySpark param doc fields.

Tests N/A.

Author: Nick Pentreath 

Closes #13277 from MLnick/SPARK-15500-als-remove-default-storage-param.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1cb347fb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1cb347fb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1cb347fb

Branch: refs/heads/master
Commit: 1cb347fbc446092b478ae0578fc7d1b0626a9294
Parents: 02c8072
Author: Nick Pentreath 
Authored: Wed May 25 20:41:53 2016 +0200
Committer: Nick Pentreath 
Committed: Wed May 25 20:41:53 2016 +0200

--
 .../main/scala/org/apache/spark/ml/recommendation/ALS.scala| 4 ++--
 python/pyspark/ml/recommendation.py| 6 ++
 2 files changed, 4 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/1cb347fb/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala 
b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
index f257382..8dc7437 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
@@ -180,7 +180,7 @@ private[recommendation] trait ALSParams extends 
ALSModelParams with HasMaxIter w
* @group expertParam
*/
   val intermediateStorageLevel = new Param[String](this, 
"intermediateStorageLevel",
-"StorageLevel for intermediate datasets. Cannot be 'NONE'. Default: 
'MEMORY_AND_DISK'.",
+"StorageLevel for intermediate datasets. Cannot be 'NONE'.",
 (s: String) => Try(StorageLevel.fromString(s)).isSuccess && s != "NONE")
 
   /** @group expertGetParam */
@@ -194,7 +194,7 @@ private[recommendation] trait ALSParams extends 
ALSModelParams with HasMaxIter w
* @group expertParam
*/
   val finalStorageLevel = new Param[String](this, "finalStorageLevel",
-"StorageLevel for ALS model factors. Default: 'MEMORY_AND_DISK'.",
+"StorageLevel for ALS model factors.",
 (s: String) => Try(StorageLevel.fromString(s)).isSuccess)
 
   /** @group expertGetParam */

http://git-wip-us.apache.org/repos/asf/spark/blob/1cb347fb/python/pyspark/ml/recommendation.py
--
diff --git a/python/pyspark/ml/recommendation.py 
b/python/pyspark/ml/recommendation.py
index bac2a30..1778bfe 100644
--- a/python/pyspark/ml/recommendation.py
+++ b/python/pyspark/ml/recommendation.py
@@ -120,12 +120,10 @@ class ALS(JavaEstimator, HasCheckpointInterval, 
HasMaxIter, HasPredictionCol, Ha
 "whether to use nonnegative constraint for least 
squares",
 typeConverter=TypeConverters.toBoolean)
 intermediateStorageLevel = Param(Params._dummy(), 
"intermediateStorageLevel",
- "StorageLevel for intermediate datasets. 
Cannot be 'NONE'. " +
- "Default: 'MEMORY_AND_DISK'.",
+ "StorageLevel for intermediate datasets. 
Cannot be 'NONE'.",
  typeConverter=TypeConverters.toString)
 finalStorageLevel = Param(Params._dummy(), "finalStorageLevel",
-  "StorageLevel for ALS model factors. " +
-  "Default: 'MEMORY_AND_DISK'.",
+  "StorageLevel for ALS model factors.",
   typeConverter=TypeConverters.toString)
 
 @keyword_only





spark git commit: [SPARK-15500][DOC][ML][PYSPARK] Remove default value in Param doc field in ALS

2016-05-25 Thread mlnick
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 6fc367e50 -> 409eb28f7


[SPARK-15500][DOC][ML][PYSPARK] Remove default value in Param doc field in ALS

Remove "Default: MEMORY_AND_DISK" from the `Param` doc fields of the ALS storage
level params. This fixes up the output of `explainParam(s)` so that default
values are not displayed twice.

We can revisit in the case that 
[SPARK-15130](https://issues.apache.org/jira/browse/SPARK-15130) moves ahead 
with adding defaults in some way to PySpark param doc fields.

Tests N/A.

Author: Nick Pentreath 

Closes #13277 from MLnick/SPARK-15500-als-remove-default-storage-param.

(cherry picked from commit 1cb347fbc446092b478ae0578fc7d1b0626a9294)
Signed-off-by: Nick Pentreath 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/409eb28f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/409eb28f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/409eb28f

Branch: refs/heads/branch-2.0
Commit: 409eb28f7219778d2aba36079c20c84c8d31b604
Parents: 6fc367e
Author: Nick Pentreath 
Authored: Wed May 25 20:41:53 2016 +0200
Committer: Nick Pentreath 
Committed: Wed May 25 20:42:11 2016 +0200

--
 .../main/scala/org/apache/spark/ml/recommendation/ALS.scala| 4 ++--
 python/pyspark/ml/recommendation.py| 6 ++
 2 files changed, 4 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/409eb28f/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala 
b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
index f257382..8dc7437 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
@@ -180,7 +180,7 @@ private[recommendation] trait ALSParams extends 
ALSModelParams with HasMaxIter w
* @group expertParam
*/
   val intermediateStorageLevel = new Param[String](this, 
"intermediateStorageLevel",
-"StorageLevel for intermediate datasets. Cannot be 'NONE'. Default: 
'MEMORY_AND_DISK'.",
+"StorageLevel for intermediate datasets. Cannot be 'NONE'.",
 (s: String) => Try(StorageLevel.fromString(s)).isSuccess && s != "NONE")
 
   /** @group expertGetParam */
@@ -194,7 +194,7 @@ private[recommendation] trait ALSParams extends 
ALSModelParams with HasMaxIter w
* @group expertParam
*/
   val finalStorageLevel = new Param[String](this, "finalStorageLevel",
-"StorageLevel for ALS model factors. Default: 'MEMORY_AND_DISK'.",
+"StorageLevel for ALS model factors.",
 (s: String) => Try(StorageLevel.fromString(s)).isSuccess)
 
   /** @group expertGetParam */

http://git-wip-us.apache.org/repos/asf/spark/blob/409eb28f/python/pyspark/ml/recommendation.py
--
diff --git a/python/pyspark/ml/recommendation.py 
b/python/pyspark/ml/recommendation.py
index bac2a30..1778bfe 100644
--- a/python/pyspark/ml/recommendation.py
+++ b/python/pyspark/ml/recommendation.py
@@ -120,12 +120,10 @@ class ALS(JavaEstimator, HasCheckpointInterval, 
HasMaxIter, HasPredictionCol, Ha
 "whether to use nonnegative constraint for least 
squares",
 typeConverter=TypeConverters.toBoolean)
 intermediateStorageLevel = Param(Params._dummy(), 
"intermediateStorageLevel",
- "StorageLevel for intermediate datasets. 
Cannot be 'NONE'. " +
- "Default: 'MEMORY_AND_DISK'.",
+ "StorageLevel for intermediate datasets. 
Cannot be 'NONE'.",
  typeConverter=TypeConverters.toString)
 finalStorageLevel = Param(Params._dummy(), "finalStorageLevel",
-  "StorageLevel for ALS model factors. " +
-  "Default: 'MEMORY_AND_DISK'.",
+  "StorageLevel for ALS model factors.",
   typeConverter=TypeConverters.toString)
 
 @keyword_only





spark git commit: [SPARK-15483][SQL] IncrementalExecution should use extra strategies.

2016-05-25 Thread marmbrus
Repository: spark
Updated Branches:
  refs/heads/master 1cb347fbc -> 4b8806741


[SPARK-15483][SQL] IncrementalExecution should use extra strategies.

## What changes were proposed in this pull request?

Extra strategies do not work for streams because `IncrementalExecution` uses a
modified planner with stateful operations, and that planner does not include the
extra strategies.

This PR fixes `IncrementalExecution` to include the extra strategies so that
streaming queries can use them.
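
A minimal sketch of the user-facing side, assuming a local SparkSession; the
strategy below is just a placeholder that matches nothing, a real one would
pattern-match on the plan:

```scala
import org.apache.spark.sql.{SparkSession, Strategy}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan

// Returning Nil means "no match here", so planning falls through to the
// built-in strategies.
object MyExtraStrategy extends Strategy {
  override def apply(plan: LogicalPlan): Seq[SparkPlan] = Nil
}

val spark = SparkSession.builder().master("local[*]").appName("extra").getOrCreate()
spark.experimental.extraStrategies = MyExtraStrategy :: Nil
// Batch queries already picked this strategy up; with this patch, streaming
// queries planned by IncrementalExecution see it as well.
```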

## How was this patch tested?

I added a test to check if extra strategies work for streams.

Author: Takuya UESHIN 

Closes #13261 from ueshin/issues/SPARK-15483.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4b880674
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4b880674
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4b880674

Branch: refs/heads/master
Commit: 4b88067416ce922ae15a1445cf953fb9b5c43427
Parents: 1cb347f
Author: Takuya UESHIN 
Authored: Wed May 25 12:02:07 2016 -0700
Committer: Michael Armbrust 
Committed: Wed May 25 12:02:07 2016 -0700

--
 .../execution/streaming/IncrementalExecution.scala   |  3 ++-
 .../org/apache/spark/sql/streaming/StreamSuite.scala | 15 +++
 2 files changed, 17 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4b880674/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala
index 8b96f65..fe5f36e 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala
@@ -36,7 +36,8 @@ class IncrementalExecution private[sql](
   extends QueryExecution(sparkSession, logicalPlan) {
 
   // TODO: make this always part of planning.
-  val stateStrategy = 
sparkSession.sessionState.planner.StatefulAggregationStrategy :: Nil
+  val stateStrategy = 
sparkSession.sessionState.planner.StatefulAggregationStrategy +:
+sparkSession.sessionState.experimentalMethods.extraStrategies
 
   // Modified planner with stateful operations.
   override def planner: SparkPlanner =

http://git-wip-us.apache.org/repos/asf/spark/blob/4b880674/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala
--
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala
index b742206..ae89a68 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala
@@ -220,6 +220,21 @@ class StreamSuite extends StreamTest with SharedSQLContext 
{
   CheckOffsetLogLatestBatchId(2),
   CheckSinkLatestBatchId(2))
   }
+
+  test("insert an extraStrategy") {
+try {
+  spark.experimental.extraStrategies = TestStrategy :: Nil
+
+  val inputData = MemoryStream[(String, Int)]
+  val df = inputData.toDS().map(_._1).toDF("a")
+
+  testStream(df)(
+AddData(inputData, ("so slow", 1)),
+CheckAnswer("so fast"))
+} finally {
+  spark.experimental.extraStrategies = Nil
+}
+  }
 }
 
 /**





spark git commit: [SPARK-15483][SQL] IncrementalExecution should use extra strategies.

2016-05-25 Thread marmbrus
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 409eb28f7 -> 20cc2eb1b


[SPARK-15483][SQL] IncrementalExecution should use extra strategies.

## What changes were proposed in this pull request?

Extra strategies do not work for streams because `IncrementalExecution` uses a
modified planner with stateful operations, and that planner does not include the
extra strategies.

This PR fixes `IncrementalExecution` to include the extra strategies so that
streaming queries can use them.

## How was this patch tested?

I added a test to check if extra strategies work for streams.

Author: Takuya UESHIN 

Closes #13261 from ueshin/issues/SPARK-15483.

(cherry picked from commit 4b88067416ce922ae15a1445cf953fb9b5c43427)
Signed-off-by: Michael Armbrust 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/20cc2eb1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/20cc2eb1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/20cc2eb1

Branch: refs/heads/branch-2.0
Commit: 20cc2eb1b8bf9fbeda9d00a6d02da8f624973742
Parents: 409eb28
Author: Takuya UESHIN 
Authored: Wed May 25 12:02:07 2016 -0700
Committer: Michael Armbrust 
Committed: Wed May 25 12:02:16 2016 -0700

--
 .../execution/streaming/IncrementalExecution.scala   |  3 ++-
 .../org/apache/spark/sql/streaming/StreamSuite.scala | 15 +++
 2 files changed, 17 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/20cc2eb1/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala
index 8b96f65..fe5f36e 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala
@@ -36,7 +36,8 @@ class IncrementalExecution private[sql](
   extends QueryExecution(sparkSession, logicalPlan) {
 
   // TODO: make this always part of planning.
-  val stateStrategy = 
sparkSession.sessionState.planner.StatefulAggregationStrategy :: Nil
+  val stateStrategy = 
sparkSession.sessionState.planner.StatefulAggregationStrategy +:
+sparkSession.sessionState.experimentalMethods.extraStrategies
 
   // Modified planner with stateful operations.
   override def planner: SparkPlanner =

http://git-wip-us.apache.org/repos/asf/spark/blob/20cc2eb1/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala
--
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala
index b742206..ae89a68 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala
@@ -220,6 +220,21 @@ class StreamSuite extends StreamTest with SharedSQLContext 
{
   CheckOffsetLogLatestBatchId(2),
   CheckSinkLatestBatchId(2))
   }
+
+  test("insert an extraStrategy") {
+try {
+  spark.experimental.extraStrategies = TestStrategy :: Nil
+
+  val inputData = MemoryStream[(String, Int)]
+  val df = inputData.toDS().map(_._1).toDF("a")
+
+  testStream(df)(
+AddData(inputData, ("so slow", 1)),
+CheckAnswer("so fast"))
+} finally {
+  spark.experimental.extraStrategies = Nil
+}
+  }
 }
 
 /**





spark git commit: [SPARK-15493][SQL] default QuoteEscapingEnabled flag to true when writing CSV

2016-05-25 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/master 4b8806741 -> c875d81a3


[SPARK-15493][SQL] default QuoteEscapingEnabled flag to true when writing CSV

## What changes were proposed in this pull request?

Default the QuoteEscapingEnabled flag to true when writing CSV, and add an
`escapeQuotes` option so that this behaviour can be changed.
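
A short sketch of the new behaviour and the opt-out, assuming a local
SparkSession; the column names and output paths are illustrative only:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("csv-quotes").getOrCreate()
import spark.implicits._

val df = Seq(("say \"hi\"", 1)).toDF("text", "n")

// New default: embedded quotes are escaped, giving RFC 4180 style output.
df.write.csv("/tmp/csv-escaped")
// The new option restores the previous behaviour when needed.
df.write.option("escapeQuotes", "false").csv("/tmp/csv-unescaped")
```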

See 
https://github.com/uniVocity/univocity-parsers/blob/f3eb2af26374940e60d91d1703bde54619f50c51/src/main/java/com/univocity/parsers/csv/CsvWriterSettings.java#L231-L247

This change is needed to be able to write RFC 4180 compatible CSV files 
(https://tools.ietf.org/html/rfc4180#section-2)

https://issues.apache.org/jira/browse/SPARK-15493

## How was this patch tested?

Added a test that verifies the output is quoted correctly.

Author: Jurriaan Pruis 

Closes #13267 from jurriaan/quote-escaping.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c875d81a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c875d81a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c875d81a

Branch: refs/heads/master
Commit: c875d81a3de3f209b9eb03adf96b7c740b2c7b52
Parents: 4b88067
Author: Jurriaan Pruis 
Authored: Wed May 25 12:40:16 2016 -0700
Committer: Reynold Xin 
Committed: Wed May 25 12:40:16 2016 -0700

--
 dev/deps/spark-deps-hadoop-2.2  |  2 +-
 dev/deps/spark-deps-hadoop-2.3  |  2 +-
 dev/deps/spark-deps-hadoop-2.4  |  2 +-
 dev/deps/spark-deps-hadoop-2.6  |  2 +-
 dev/deps/spark-deps-hadoop-2.7  |  2 +-
 python/pyspark/sql/readwriter.py|  7 ++-
 sql/core/pom.xml|  2 +-
 .../org/apache/spark/sql/DataFrameWriter.scala  |  3 ++
 .../execution/datasources/csv/CSVOptions.scala  |  2 +
 .../execution/datasources/csv/CSVParser.scala   |  1 +
 .../execution/datasources/csv/CSVSuite.scala| 51 
 11 files changed, 69 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c875d81a/dev/deps/spark-deps-hadoop-2.2
--
diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index a9068da..0ac1c00 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -159,7 +159,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xmlenc-0.52.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/c875d81a/dev/deps/spark-deps-hadoop-2.3
--
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index 7e60a31..fa35fa7 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -167,7 +167,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xmlenc-0.52.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/c875d81a/dev/deps/spark-deps-hadoop-2.4
--
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index 70d33b4..99dffa9 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -167,7 +167,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xmlenc-0.52.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/c875d81a/dev/deps/spark-deps-hadoop-2.6
--
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index a80f6bc..a3bee36 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -175,7 +175,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xercesImpl-2.9.1.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/c875d81a/dev/deps/spark-deps-hadoop-2.7
--
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index c0b53f7..dbd7a8e 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -176,7 +176,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
 val

spark git commit: [SPARK-15493][SQL] default QuoteEscapingEnabled flag to true when writing CSV

2016-05-25 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 20cc2eb1b -> 8629537cc


[SPARK-15493][SQL] default QuoteEscapingEnabled flag to true when writing CSV

## What changes were proposed in this pull request?

Default the QuoteEscapingEnabled flag to true when writing CSV, and add an
`escapeQuotes` option so that this behaviour can be changed.

See 
https://github.com/uniVocity/univocity-parsers/blob/f3eb2af26374940e60d91d1703bde54619f50c51/src/main/java/com/univocity/parsers/csv/CsvWriterSettings.java#L231-L247

This change is needed to be able to write RFC 4180 compatible CSV files 
(https://tools.ietf.org/html/rfc4180#section-2)

https://issues.apache.org/jira/browse/SPARK-15493

## How was this patch tested?

Added a test that verifies the output is quoted correctly.

Author: Jurriaan Pruis 

Closes #13267 from jurriaan/quote-escaping.

(cherry picked from commit c875d81a3de3f209b9eb03adf96b7c740b2c7b52)
Signed-off-by: Reynold Xin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8629537c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8629537c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8629537c

Branch: refs/heads/branch-2.0
Commit: 8629537cc7d89c97ef2038326f1138d166596315
Parents: 20cc2eb
Author: Jurriaan Pruis 
Authored: Wed May 25 12:40:16 2016 -0700
Committer: Reynold Xin 
Committed: Wed May 25 12:40:26 2016 -0700

--
 dev/deps/spark-deps-hadoop-2.2  |  2 +-
 dev/deps/spark-deps-hadoop-2.3  |  2 +-
 dev/deps/spark-deps-hadoop-2.4  |  2 +-
 dev/deps/spark-deps-hadoop-2.6  |  2 +-
 dev/deps/spark-deps-hadoop-2.7  |  2 +-
 python/pyspark/sql/readwriter.py|  7 ++-
 sql/core/pom.xml|  2 +-
 .../org/apache/spark/sql/DataFrameWriter.scala  |  3 ++
 .../execution/datasources/csv/CSVOptions.scala  |  2 +
 .../execution/datasources/csv/CSVParser.scala   |  1 +
 .../execution/datasources/csv/CSVSuite.scala| 51 
 11 files changed, 69 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/8629537c/dev/deps/spark-deps-hadoop-2.2
--
diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index a9068da..0ac1c00 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -159,7 +159,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xmlenc-0.52.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/8629537c/dev/deps/spark-deps-hadoop-2.3
--
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index 7e60a31..fa35fa7 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -167,7 +167,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xmlenc-0.52.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/8629537c/dev/deps/spark-deps-hadoop-2.4
--
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index 70d33b4..99dffa9 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -167,7 +167,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xmlenc-0.52.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/8629537c/dev/deps/spark-deps-hadoop-2.6
--
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index a80f6bc..a3bee36 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -175,7 +175,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xercesImpl-2.9.1.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/8629537c/dev/deps/spark-deps-hadoop-2.7
--
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index c0b53f7..dbd7a8e 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -176,7 +176,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 st

spark git commit: [SPARK-14269][SCHEDULER] Eliminate unnecessary submitStage() call.

2016-05-25 Thread kayousterhout
Repository: spark
Updated Branches:
  refs/heads/master c875d81a3 -> 698ef762f


[SPARK-14269][SCHEDULER] Eliminate unnecessary submitStage() call.

## What changes were proposed in this pull request?

Currently `submitStage()` is called for waiting stages on every iteration of the
event loop in `DAGScheduler` to submit all waiting stages, but most of these
calls are unnecessary because they are not related to any change in stage status.
Waiting stages only need to be submitted when their parent stages have completed
successfully.

This elimination can improve `DAGScheduler` performance.
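
A toy, self-contained sketch of the idea (these are not Spark's real scheduler
classes): instead of re-checking every waiting stage on each event, only the
children of the stage that just completed are considered for submission:

```scala
case class Stage(id: Int, firstJobId: Int, parents: Set[Int])

val waiting = scala.collection.mutable.Set(
  Stage(2, firstJobId = 0, parents = Set(1)),
  Stage(3, firstJobId = 1, parents = Set(1)),
  Stage(4, firstJobId = 1, parents = Set(2)))

def submitWaitingChildStages(parentId: Int): List[Stage] = {
  val children = waiting.filter(_.parents.contains(parentId)).toList
  waiting --= children
  children.sortBy(_.firstJobId)  // same job-id ordering as the real method
}

println(submitWaitingChildStages(1).map(_.id))  // List(2, 3); stage 4 still waits on stage 2
```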

## How was this patch tested?

Added some checks, ran the other existing tests, and verified with our own
projects.

We have a project bottlenecked by `DAGScheduler`, with about 2000 stages.

Before this patch, almost all of the execution time in the `Driver` process was
spent processing `submitStage()` on the `dag-scheduler-event-loop` thread; after
this patch the performance improved as follows:

|        | total execution time | `dag-scheduler-event-loop` thread time | `submitStage()` |
|--------|---------------------:|---------------------------------------:|----------------:|
| Before | 760 sec              | 710 sec                                 | 667 sec         |
| After  | 440 sec              | 14 sec                                  | 10 sec          |

Author: Takuya UESHIN 

Closes #12060 from ueshin/issues/SPARK-14269.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/698ef762
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/698ef762
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/698ef762

Branch: refs/heads/master
Commit: 698ef762f80cf4c84bc7b7cf083aa97d44b87170
Parents: c875d81
Author: Takuya UESHIN 
Authored: Wed May 25 13:57:25 2016 -0700
Committer: Kay Ousterhout 
Committed: Wed May 25 13:57:25 2016 -0700

--
 .../apache/spark/scheduler/DAGScheduler.scala   | 35 ++--
 .../spark/scheduler/DAGSchedulerSuite.scala |  7 
 2 files changed, 17 insertions(+), 25 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/698ef762/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
--
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala 
b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index 5291b66..766e979 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -726,7 +726,6 @@ class DAGScheduler(
   reason = "as part of cancellation of all jobs"))
 activeJobs.clear() // These should already be empty by this point,
 jobIdToActiveJob.clear() // but just in case we lost track of some jobs...
-submitWaitingStages()
   }
 
   /**
@@ -752,23 +751,21 @@ class DAGScheduler(
 submitStage(stage)
   }
 }
-submitWaitingStages()
   }
 
   /**
* Check for waiting stages which are now eligible for resubmission.
-   * Ordinarily run on every iteration of the event loop.
+   * Submits stages that depend on the given parent stage. Called when the 
parent stage completes
+   * successfully.
*/
-  private def submitWaitingStages() {
-// TODO: We might want to run this less often, when we are sure that 
something has become
-// runnable that wasn't before.
-logTrace("Checking for newly runnable parent stages")
+  private def submitWaitingChildStages(parent: Stage) {
+logTrace(s"Checking if any dependencies of $parent are now runnable")
 logTrace("running: " + runningStages)
 logTrace("waiting: " + waitingStages)
 logTrace("failed: " + failedStages)
-val waitingStagesCopy = waitingStages.toArray
-waitingStages.clear()
-for (stage <- waitingStagesCopy.sortBy(_.firstJobId)) {
+val childStages = waitingStages.filter(_.parents.contains(parent)).toArray
+waitingStages --= childStages
+for (stage <- childStages.sortBy(_.firstJobId)) {
   submitStage(stage)
 }
   }
@@ -793,7 +790,6 @@ class DAGScheduler(
 }
 val jobIds = activeInGroup.map(_.jobId)
 jobIds.foreach(handleJobCancellation(_, "part of cancelled job group 
%s".format(groupId)))
-submitWaitingStages()
   }
 
   private[scheduler] def handleBeginEvent(task: Task[_], taskInfo: TaskInfo) {
@@ -801,7 +797,6 @@ class DAGScheduler(
 // In that case, we wouldn't have the stage anymore in stageIdToStage.
 val stageAttemptId = 
stageIdToStage.get(task.stageId).map(_.latestInfo.attemptId).getOrElse(-1)
 listenerBus.post(SparkListenerTaskStart(task.stageId, stageAttemptId, 
taskInfo))
-submitWaitingStages()
   }
 
   private[scheduler] def handleTaskSetFailed(
@@ -809,7 +804,6 @@ class DAGSchedul

spark git commit: [MINOR] [PYSPARK] [EXAMPLES] Changed examples to use SparkSession.sparkContext instead of _sc

2016-05-25 Thread davies
Repository: spark
Updated Branches:
  refs/heads/master 698ef762f -> 9c297df3d


[MINOR] [PYSPARK] [EXAMPLES] Changed examples to use SparkSession.sparkContext 
instead of _sc

## What changes were proposed in this pull request?

Some PySpark examples need a SparkContext and get it by accessing _sc directly 
from the session.  These examples should use the provided property 
`sparkContext` in `SparkSession` instead.

## How was this patch tested?
Ran modified examples

Author: Bryan Cutler 

Closes #13303 from BryanCutler/pyspark-session-sparkContext-MINOR.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9c297df3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9c297df3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9c297df3

Branch: refs/heads/master
Commit: 9c297df3d4d5fa4bbfdffdaad15f362586db384b
Parents: 698ef76
Author: Bryan Cutler 
Authored: Wed May 25 14:29:14 2016 -0700
Committer: Davies Liu 
Committed: Wed May 25 14:29:14 2016 -0700

--
 examples/src/main/python/als.py | 2 +-
 examples/src/main/python/avro_inputformat.py| 2 +-
 examples/src/main/python/parquet_inputformat.py | 2 +-
 examples/src/main/python/pi.py  | 2 +-
 examples/src/main/python/transitive_closure.py  | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9c297df3/examples/src/main/python/als.py
--
diff --git a/examples/src/main/python/als.py b/examples/src/main/python/als.py
index 81562e2..80290e7 100755
--- a/examples/src/main/python/als.py
+++ b/examples/src/main/python/als.py
@@ -67,7 +67,7 @@ if __name__ == "__main__":
 .appName("PythonALS")\
 .getOrCreate()
 
-sc = spark._sc
+sc = spark.sparkContext
 
 M = int(sys.argv[1]) if len(sys.argv) > 1 else 100
 U = int(sys.argv[2]) if len(sys.argv) > 2 else 500

http://git-wip-us.apache.org/repos/asf/spark/blob/9c297df3/examples/src/main/python/avro_inputformat.py
--
diff --git a/examples/src/main/python/avro_inputformat.py 
b/examples/src/main/python/avro_inputformat.py
index 3f65e8f..4422f9e 100644
--- a/examples/src/main/python/avro_inputformat.py
+++ b/examples/src/main/python/avro_inputformat.py
@@ -70,7 +70,7 @@ if __name__ == "__main__":
 .appName("AvroKeyInputFormat")\
 .getOrCreate()
 
-sc = spark._sc
+sc = spark.sparkContext
 
 conf = None
 if len(sys.argv) == 3:

http://git-wip-us.apache.org/repos/asf/spark/blob/9c297df3/examples/src/main/python/parquet_inputformat.py
--
diff --git a/examples/src/main/python/parquet_inputformat.py 
b/examples/src/main/python/parquet_inputformat.py
index 2f09f4d..29a1ac2 100644
--- a/examples/src/main/python/parquet_inputformat.py
+++ b/examples/src/main/python/parquet_inputformat.py
@@ -53,7 +53,7 @@ if __name__ == "__main__":
 .appName("ParquetInputFormat")\
 .getOrCreate()
 
-sc = spark._sc
+sc = spark.sparkContext
 
 parquet_rdd = sc.newAPIHadoopFile(
 path,

http://git-wip-us.apache.org/repos/asf/spark/blob/9c297df3/examples/src/main/python/pi.py
--
diff --git a/examples/src/main/python/pi.py b/examples/src/main/python/pi.py
index 5db03e4..b39d710 100755
--- a/examples/src/main/python/pi.py
+++ b/examples/src/main/python/pi.py
@@ -32,7 +32,7 @@ if __name__ == "__main__":
 .appName("PythonPi")\
 .getOrCreate()
 
-sc = spark._sc
+sc = spark.sparkContext
 
 partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
 n = 10 * partitions

http://git-wip-us.apache.org/repos/asf/spark/blob/9c297df3/examples/src/main/python/transitive_closure.py
--
diff --git a/examples/src/main/python/transitive_closure.py 
b/examples/src/main/python/transitive_closure.py
index 37c41dc..d88ea94 100755
--- a/examples/src/main/python/transitive_closure.py
+++ b/examples/src/main/python/transitive_closure.py
@@ -46,7 +46,7 @@ if __name__ == "__main__":
 .appName("PythonTransitiveClosure")\
 .getOrCreate()
 
-sc = spark._sc
+sc = spark.sparkContext
 
 partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
 tc = sc.parallelize(generateGraph(), partitions).cache()





spark git commit: [MINOR] [PYSPARK] [EXAMPLES] Changed examples to use SparkSession.sparkContext instead of _sc

2016-05-25 Thread davies
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 8629537cc -> f7158c482


[MINOR] [PYSPARK] [EXAMPLES] Changed examples to use SparkSession.sparkContext 
instead of _sc

## What changes were proposed in this pull request?

Some PySpark examples need a SparkContext and get it by accessing _sc directly 
from the session.  These examples should use the provided property 
`sparkContext` in `SparkSession` instead.

## How was this patch tested?
Ran modified examples

Author: Bryan Cutler 

Closes #13303 from BryanCutler/pyspark-session-sparkContext-MINOR.

(cherry picked from commit 9c297df3d4d5fa4bbfdffdaad15f362586db384b)
Signed-off-by: Davies Liu 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f7158c48
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f7158c48
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f7158c48

Branch: refs/heads/branch-2.0
Commit: f7158c4828320e18aedd4369832da16c759b90fb
Parents: 8629537
Author: Bryan Cutler 
Authored: Wed May 25 14:29:14 2016 -0700
Committer: Davies Liu 
Committed: Wed May 25 14:29:23 2016 -0700

--
 examples/src/main/python/als.py | 2 +-
 examples/src/main/python/avro_inputformat.py| 2 +-
 examples/src/main/python/parquet_inputformat.py | 2 +-
 examples/src/main/python/pi.py  | 2 +-
 examples/src/main/python/transitive_closure.py  | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f7158c48/examples/src/main/python/als.py
--
diff --git a/examples/src/main/python/als.py b/examples/src/main/python/als.py
index 81562e2..80290e7 100755
--- a/examples/src/main/python/als.py
+++ b/examples/src/main/python/als.py
@@ -67,7 +67,7 @@ if __name__ == "__main__":
 .appName("PythonALS")\
 .getOrCreate()
 
-sc = spark._sc
+sc = spark.sparkContext
 
 M = int(sys.argv[1]) if len(sys.argv) > 1 else 100
 U = int(sys.argv[2]) if len(sys.argv) > 2 else 500

http://git-wip-us.apache.org/repos/asf/spark/blob/f7158c48/examples/src/main/python/avro_inputformat.py
--
diff --git a/examples/src/main/python/avro_inputformat.py 
b/examples/src/main/python/avro_inputformat.py
index 3f65e8f..4422f9e 100644
--- a/examples/src/main/python/avro_inputformat.py
+++ b/examples/src/main/python/avro_inputformat.py
@@ -70,7 +70,7 @@ if __name__ == "__main__":
 .appName("AvroKeyInputFormat")\
 .getOrCreate()
 
-sc = spark._sc
+sc = spark.sparkContext
 
 conf = None
 if len(sys.argv) == 3:

http://git-wip-us.apache.org/repos/asf/spark/blob/f7158c48/examples/src/main/python/parquet_inputformat.py
--
diff --git a/examples/src/main/python/parquet_inputformat.py 
b/examples/src/main/python/parquet_inputformat.py
index 2f09f4d..29a1ac2 100644
--- a/examples/src/main/python/parquet_inputformat.py
+++ b/examples/src/main/python/parquet_inputformat.py
@@ -53,7 +53,7 @@ if __name__ == "__main__":
 .appName("ParquetInputFormat")\
 .getOrCreate()
 
-sc = spark._sc
+sc = spark.sparkContext
 
 parquet_rdd = sc.newAPIHadoopFile(
 path,

http://git-wip-us.apache.org/repos/asf/spark/blob/f7158c48/examples/src/main/python/pi.py
--
diff --git a/examples/src/main/python/pi.py b/examples/src/main/python/pi.py
index 5db03e4..b39d710 100755
--- a/examples/src/main/python/pi.py
+++ b/examples/src/main/python/pi.py
@@ -32,7 +32,7 @@ if __name__ == "__main__":
 .appName("PythonPi")\
 .getOrCreate()
 
-sc = spark._sc
+sc = spark.sparkContext
 
 partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
 n = 10 * partitions

http://git-wip-us.apache.org/repos/asf/spark/blob/f7158c48/examples/src/main/python/transitive_closure.py
--
diff --git a/examples/src/main/python/transitive_closure.py 
b/examples/src/main/python/transitive_closure.py
index 37c41dc..d88ea94 100755
--- a/examples/src/main/python/transitive_closure.py
+++ b/examples/src/main/python/transitive_closure.py
@@ -46,7 +46,7 @@ if __name__ == "__main__":
 .appName("PythonTransitiveClosure")\
 .getOrCreate()
 
-sc = spark._sc
+sc = spark.sparkContext
 
 partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
 tc = sc.parallelize(generateGraph(), partitions).cache()



spark git commit: Log warnings for numIterations * miniBatchFraction < 1.0

2016-05-25 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/master 9c297df3d -> 589cce93c


Log warnings for numIterations * miniBatchFraction < 1.0

## What changes were proposed in this pull request?

Add a warning log for the case that `numIterations * miniBatchFraction < 1.0`
during gradient descent. If the product of those two numbers is less than `1.0`,
then not all training examples will be used during optimization. To put this
concretely, suppose that `numExamples = 100`, `miniBatchFraction = 0.2` and
`numIterations = 3`. Then 3 iterations will occur, each sampling approximately
20 examples. In the best case every sampled example is unique, so at most 60/100
examples are used.
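
A small worked check of the numbers above (plain Scala, no Spark API involved):

```scala
val numExamples = 100
val miniBatchFraction = 0.2
val numIterations = 3

val sampledPerIteration = numExamples * miniBatchFraction  // ~20 examples per iteration
val fractionTouched = numIterations * miniBatchFraction    // 0.6 < 1.0, so the new warning fires
println(s"~${sampledPerIteration.toInt} examples per iteration, " +
  s"at most ${(fractionTouched * numExamples).toInt} of $numExamples examples used")
```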

This may be counter-intuitive to most users, and it led to an issue during the
development of another Spark ML model:
https://github.com/zhengruifeng/spark-libFM/issues/11. If a user actually does
not need the full training data set, it would be easier and more intuitive to
subsample it explicitly with `RDD.sample`.

## How was this patch tested?

`build/mvn -DskipTests clean package` build succeeds

Author: Gio Borje 

Closes #13265 from Hydrotoast/master.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/589cce93
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/589cce93
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/589cce93

Branch: refs/heads/master
Commit: 589cce93c821ac28e9090a478f6e7465398b7c30
Parents: 9c297df
Author: Gio Borje 
Authored: Wed May 25 16:52:31 2016 -0500
Committer: Sean Owen 
Committed: Wed May 25 16:52:31 2016 -0500

--
 .../org/apache/spark/mllib/optimization/GradientDescent.scala   | 5 +
 1 file changed, 5 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/589cce93/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
index a67ea83..735e780 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
@@ -197,6 +197,11 @@ object GradientDescent extends Logging {
 "< 1.0 can be unstable because of the stochasticity in sampling.")
 }
 
+if (numIterations * miniBatchFraction < 1.0) {
+  logWarning("Not all examples will be used if numIterations * 
miniBatchFraction < 1.0: " +
+s"numIterations=$numIterations and 
miniBatchFraction=$miniBatchFraction")
+}
+
 val stochasticLossHistory = new ArrayBuffer[Double](numIterations)
 // Record previous weight and current one to calculate solution vector 
difference
 





spark git commit: Log warnings for numIterations * miniBatchFraction < 1.0

2016-05-25 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 f7158c482 -> 0064a4dcb


Log warnings for numIterations * miniBatchFraction < 1.0

## What changes were proposed in this pull request?

Add a warning log for the case that `numIterations * miniBatchFraction < 1.0`
during gradient descent. If the product of those two numbers is less than `1.0`,
then not all training examples will be used during optimization. To put this
concretely, suppose that `numExamples = 100`, `miniBatchFraction = 0.2` and
`numIterations = 3`. Then 3 iterations will occur, each sampling approximately
20 examples. In the best case every sampled example is unique, so at most 60/100
examples are used.

This may be counter-intuitive to most users, and it led to an issue during the
development of another Spark ML model:
https://github.com/zhengruifeng/spark-libFM/issues/11. If a user actually does
not need the full training data set, it would be easier and more intuitive to
subsample it explicitly with `RDD.sample`.

## How was this patch tested?

`build/mvn -DskipTests clean package` build succeeds

Author: Gio Borje 

Closes #13265 from Hydrotoast/master.

(cherry picked from commit 589cce93c821ac28e9090a478f6e7465398b7c30)
Signed-off-by: Sean Owen 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0064a4dc
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0064a4dc
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0064a4dc

Branch: refs/heads/branch-2.0
Commit: 0064a4dcbed1d91732a29c2cede464b8d148aeca
Parents: f7158c4
Author: Gio Borje 
Authored: Wed May 25 16:52:31 2016 -0500
Committer: Sean Owen 
Committed: Wed May 25 16:52:48 2016 -0500

--
 .../org/apache/spark/mllib/optimization/GradientDescent.scala   | 5 +
 1 file changed, 5 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0064a4dc/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
index a67ea83..735e780 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
@@ -197,6 +197,11 @@ object GradientDescent extends Logging {
 "< 1.0 can be unstable because of the stochasticity in sampling.")
 }
 
+if (numIterations * miniBatchFraction < 1.0) {
+  logWarning("Not all examples will be used if numIterations * 
miniBatchFraction < 1.0: " +
+s"numIterations=$numIterations and 
miniBatchFraction=$miniBatchFraction")
+}
+
 val stochasticLossHistory = new ArrayBuffer[Double](numIterations)
 // Record previous weight and current one to calculate solution vector 
difference
 





spark git commit: [SPARK-15534][SPARK-15535][SQL] Truncate table fixes

2016-05-25 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 0064a4dcb -> 347acc4ea


[SPARK-15534][SPARK-15535][SQL] Truncate table fixes

## What changes were proposed in this pull request?

Two changes, sketched below:
- When things fail, `TRUNCATE TABLE` just returns nothing. Instead, we should
throw exceptions.
- Remove `TRUNCATE TABLE ... COLUMNS`, which was never supported by either Spark
or Hive.
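
A minimal sketch, assuming a SparkSession `spark` and a table `logs` partitioned
by `ds` (both names are illustrative):

```scala
spark.sql("TRUNCATE TABLE logs")                                // whole table
spark.sql("TRUNCATE TABLE logs PARTITION (ds = '2016-05-25')")  // a single partition
// spark.sql("TRUNCATE TABLE logs COLUMNS (a, b)")              // no longer accepted by the parser
```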

## How was this patch tested?
Jenkins.

Author: Andrew Or 

Closes #13302 from andrewor14/truncate-table.

(cherry picked from commit ee682fe293b47988056b540ee46ca49861309982)
Signed-off-by: Andrew Or 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/347acc4e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/347acc4e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/347acc4e

Branch: refs/heads/branch-2.0
Commit: 347acc4ea411cd430cfb1d8b8b596463f5b9dd3c
Parents: 0064a4d
Author: Andrew Or 
Authored: Wed May 25 15:08:39 2016 -0700
Committer: Andrew Or 
Committed: Wed May 25 15:08:59 2016 -0700

--
 .../org/apache/spark/sql/catalyst/parser/SqlBase.g4 |  3 +--
 .../org/apache/spark/sql/execution/SparkSqlParser.scala |  7 +--
 .../org/apache/spark/sql/execution/command/tables.scala |  7 ---
 .../spark/sql/hive/execution/HiveCommandSuite.scala | 12 
 4 files changed, 6 insertions(+), 23 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/347acc4e/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
--
diff --git 
a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 
b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
index 403191a..b0e71c7 100644
--- 
a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
+++ 
b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
@@ -115,8 +115,7 @@ statement
 | CLEAR CACHE  
#clearCache
 | LOAD DATA LOCAL? INPATH path=STRING OVERWRITE? INTO TABLE
 tableIdentifier partitionSpec? 
#loadData
-| TRUNCATE TABLE tableIdentifier partitionSpec?
-(COLUMNS identifierList)?  
#truncateTable
+| TRUNCATE TABLE tableIdentifier partitionSpec?
#truncateTable
 | op=(ADD | LIST) identifier .*?   
#manageResource
 | SET ROLE .*? 
#failNativeCommand
 | SET .*?  
#setConfiguration

http://git-wip-us.apache.org/repos/asf/spark/blob/347acc4e/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
index 57f534c..cfebfc6 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
@@ -368,17 +368,12 @@ class SparkSqlAstBuilder(conf: SQLConf) extends 
AstBuilder {
* For example:
* {{{
*   TRUNCATE TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
-   *   [COLUMNS (col1, col2)]
* }}}
*/
   override def visitTruncateTable(ctx: TruncateTableContext): LogicalPlan = 
withOrigin(ctx) {
-if (ctx.identifierList != null) {
-  throw operationNotAllowed("TRUNCATE TABLE ... COLUMNS", ctx)
-}
 TruncateTableCommand(
   visitTableIdentifier(ctx.tableIdentifier),
-  Option(ctx.partitionSpec).map(visitNonOptionalPartitionSpec)
-)
+  Option(ctx.partitionSpec).map(visitNonOptionalPartitionSpec))
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/spark/blob/347acc4e/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
index 13e63a1..bef4c92 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -278,7 +278,7 @@ case class LoadDataCommand(
  *
  * The syntax of this command is:
  * {{{
- *  TRUNCATE TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
+ *   TRUNCATE TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
  * }}}
  */
 case class T

spark git commit: [SPARK-15534][SPARK-15535][SQL] Truncate table fixes

2016-05-25 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/master 589cce93c -> ee682fe29


[SPARK-15534][SPARK-15535][SQL] Truncate table fixes

## What changes were proposed in this pull request?

Two changes:
- When things fail, `TRUNCATE TABLE` just returns nothing. Instead, we should
throw exceptions.
- Remove `TRUNCATE TABLE ... COLUMNS`, which was never supported by either Spark
or Hive.

## How was this patch tested?
Jenkins.

Author: Andrew Or 

Closes #13302 from andrewor14/truncate-table.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ee682fe2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ee682fe2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ee682fe2

Branch: refs/heads/master
Commit: ee682fe293b47988056b540ee46ca49861309982
Parents: 589cce9
Author: Andrew Or 
Authored: Wed May 25 15:08:39 2016 -0700
Committer: Andrew Or 
Committed: Wed May 25 15:08:39 2016 -0700

--
 .../org/apache/spark/sql/catalyst/parser/SqlBase.g4 |  3 +--
 .../org/apache/spark/sql/execution/SparkSqlParser.scala |  7 +--
 .../org/apache/spark/sql/execution/command/tables.scala |  7 ---
 .../spark/sql/hive/execution/HiveCommandSuite.scala | 12 
 4 files changed, 6 insertions(+), 23 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ee682fe2/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
--
diff --git 
a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 
b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
index 403191a..b0e71c7 100644
--- 
a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
+++ 
b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
@@ -115,8 +115,7 @@ statement
 | CLEAR CACHE  
#clearCache
 | LOAD DATA LOCAL? INPATH path=STRING OVERWRITE? INTO TABLE
 tableIdentifier partitionSpec? 
#loadData
-| TRUNCATE TABLE tableIdentifier partitionSpec?
-(COLUMNS identifierList)?  
#truncateTable
+| TRUNCATE TABLE tableIdentifier partitionSpec?
#truncateTable
 | op=(ADD | LIST) identifier .*?   
#manageResource
 | SET ROLE .*? 
#failNativeCommand
 | SET .*?  
#setConfiguration

http://git-wip-us.apache.org/repos/asf/spark/blob/ee682fe2/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
index 57f534c..cfebfc6 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
@@ -368,17 +368,12 @@ class SparkSqlAstBuilder(conf: SQLConf) extends 
AstBuilder {
* For example:
* {{{
*   TRUNCATE TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
-   *   [COLUMNS (col1, col2)]
* }}}
*/
   override def visitTruncateTable(ctx: TruncateTableContext): LogicalPlan = 
withOrigin(ctx) {
-if (ctx.identifierList != null) {
-  throw operationNotAllowed("TRUNCATE TABLE ... COLUMNS", ctx)
-}
 TruncateTableCommand(
   visitTableIdentifier(ctx.tableIdentifier),
-  Option(ctx.partitionSpec).map(visitNonOptionalPartitionSpec)
-)
+  Option(ctx.partitionSpec).map(visitNonOptionalPartitionSpec))
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/spark/blob/ee682fe2/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
index 13e63a1..bef4c92 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -278,7 +278,7 @@ case class LoadDataCommand(
  *
  * The syntax of this command is:
  * {{{
- *  TRUNCATE TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
+ *   TRUNCATE TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
  * }}}
  */
 case class TruncateTableCommand(
@@ -288,9 +288,10 @@ case class TruncateTableCommand(
   override def run(sparkSess

spark git commit: [SPARK-15525][SQL][BUILD] Upgrade ANTLR4 SBT plugin

2016-05-25 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/master ee682fe29 -> 527499b62


[SPARK-15525][SQL][BUILD] Upgrade ANTLR4 SBT plugin

## What changes were proposed in this pull request?
The ANTLR4 SBT plugin has been moved from its own repo to one on bintray. The 
version was also changed from `0.7.10` to `0.7.11`. The latter actually broke 
our build (ihji has fixed this by also adding `0.7.10` and others to the 
bintray repo).

This PR upgrades the SBT-ANTLR4 plugin and ANTLR4 to their most recent versions 
(`0.7.11`/`4.5.3`). I have also removed a few obsolete build configurations.

## How was this patch tested?
Manually running SBT/Maven builds.

Author: Herman van Hovell 

Closes #13299 from hvanhovell/SPARK-15525.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/527499b6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/527499b6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/527499b6

Branch: refs/heads/master
Commit: 527499b624e743583fe0f93ea0b487031891ac3a
Parents: ee682fe
Author: Herman van Hovell 
Authored: Wed May 25 15:35:38 2016 -0700
Committer: Reynold Xin 
Committed: Wed May 25 15:35:38 2016 -0700

--
 dev/deps/spark-deps-hadoop-2.2 | 2 +-
 dev/deps/spark-deps-hadoop-2.3 | 2 +-
 dev/deps/spark-deps-hadoop-2.4 | 2 +-
 dev/deps/spark-deps-hadoop-2.6 | 2 +-
 dev/deps/spark-deps-hadoop-2.7 | 2 +-
 pom.xml| 7 +--
 project/plugins.sbt| 5 +
 7 files changed, 7 insertions(+), 15 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/527499b6/dev/deps/spark-deps-hadoop-2.2
--
diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index 0ac1c00..0d6b18e 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -3,7 +3,7 @@ RoaringBitmap-0.5.11.jar
 ST4-4.0.4.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
-antlr4-runtime-4.5.2-1.jar
+antlr4-runtime-4.5.3.jar
 aopalliance-1.0.jar
 aopalliance-repackaged-2.4.0-b34.jar
 apache-log4j-extras-1.2.17.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/527499b6/dev/deps/spark-deps-hadoop-2.3
--
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index fa35fa7..a3597f4 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -4,7 +4,7 @@ ST4-4.0.4.jar
 activation-1.1.1.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
-antlr4-runtime-4.5.2-1.jar
+antlr4-runtime-4.5.3.jar
 aopalliance-1.0.jar
 aopalliance-repackaged-2.4.0-b34.jar
 apache-log4j-extras-1.2.17.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/527499b6/dev/deps/spark-deps-hadoop-2.4
--
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index 99dffa9..3ca44c5 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -4,7 +4,7 @@ ST4-4.0.4.jar
 activation-1.1.1.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
-antlr4-runtime-4.5.2-1.jar
+antlr4-runtime-4.5.3.jar
 aopalliance-1.0.jar
 aopalliance-repackaged-2.4.0-b34.jar
 apache-log4j-extras-1.2.17.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/527499b6/dev/deps/spark-deps-hadoop-2.6
--
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index a3bee36..01e7551 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -4,7 +4,7 @@ ST4-4.0.4.jar
 activation-1.1.1.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
-antlr4-runtime-4.5.2-1.jar
+antlr4-runtime-4.5.3.jar
 aopalliance-1.0.jar
 aopalliance-repackaged-2.4.0-b34.jar
 apache-log4j-extras-1.2.17.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/527499b6/dev/deps/spark-deps-hadoop-2.7
--
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index dbd7a8e..402fd05 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -4,7 +4,7 @@ ST4-4.0.4.jar
 activation-1.1.1.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
-antlr4-runtime-4.5.2-1.jar
+antlr4-runtime-4.5.3.jar
 aopalliance-1.0.jar
 aopalliance-repackaged-2.4.0-b34.jar
 apache-log4j-extras-1.2.17.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/527499b6/pom.xml
--
diff --git a/pom.xml b/pom.xml
index f28aa14..3fa0eeb 100644
--- a/pom.xml
+++ b/pom.xml
@@ -177,7 +177,7 @@
 3.5.2
 1.3.9
 0.9.2
-4.5.2-1
+4.5.3
 1.1
 2.52.0
 2.8
@@ -1954,11 +1954,6 @@
 
 
   org.antlr
- 

spark git commit: [SPARK-15525][SQL][BUILD] Upgrade ANTLR4 SBT plugin

2016-05-25 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 347acc4ea -> 733cb44e3


[SPARK-15525][SQL][BUILD] Upgrade ANTLR4 SBT plugin

## What changes were proposed in this pull request?
The ANTLR4 SBT plugin has been moved from its own repo to one on bintray. The 
version was also changed from `0.7.10` to `0.7.11`. The latter actually broke 
our build (ihji has fixed this by also adding `0.7.10` and others to the 
bintray repo).

This PR upgrades the SBT-ANTLR4 plugin and ANTLR4 to their most recent versions 
(`0.7.11`/`4.5.3`). I have also removed a few obsolete build configurations.

## How was this patch tested?
Manually running SBT/Maven builds.

Author: Herman van Hovell 

Closes #13299 from hvanhovell/SPARK-15525.

(cherry picked from commit 527499b624e743583fe0f93ea0b487031891ac3a)
Signed-off-by: Reynold Xin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/733cb44e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/733cb44e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/733cb44e

Branch: refs/heads/branch-2.0
Commit: 733cb44e3321744d521f0791e241cc0fe062d01a
Parents: 347acc4
Author: Herman van Hovell 
Authored: Wed May 25 15:35:38 2016 -0700
Committer: Reynold Xin 
Committed: Wed May 25 15:36:04 2016 -0700

--
 dev/deps/spark-deps-hadoop-2.2 | 2 +-
 dev/deps/spark-deps-hadoop-2.3 | 2 +-
 dev/deps/spark-deps-hadoop-2.4 | 2 +-
 dev/deps/spark-deps-hadoop-2.6 | 2 +-
 dev/deps/spark-deps-hadoop-2.7 | 2 +-
 pom.xml| 7 +--
 project/plugins.sbt| 5 +
 7 files changed, 7 insertions(+), 15 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/733cb44e/dev/deps/spark-deps-hadoop-2.2
--
diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index 0ac1c00..0d6b18e 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -3,7 +3,7 @@ RoaringBitmap-0.5.11.jar
 ST4-4.0.4.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
-antlr4-runtime-4.5.2-1.jar
+antlr4-runtime-4.5.3.jar
 aopalliance-1.0.jar
 aopalliance-repackaged-2.4.0-b34.jar
 apache-log4j-extras-1.2.17.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/733cb44e/dev/deps/spark-deps-hadoop-2.3
--
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index fa35fa7..a3597f4 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -4,7 +4,7 @@ ST4-4.0.4.jar
 activation-1.1.1.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
-antlr4-runtime-4.5.2-1.jar
+antlr4-runtime-4.5.3.jar
 aopalliance-1.0.jar
 aopalliance-repackaged-2.4.0-b34.jar
 apache-log4j-extras-1.2.17.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/733cb44e/dev/deps/spark-deps-hadoop-2.4
--
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index 99dffa9..3ca44c5 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -4,7 +4,7 @@ ST4-4.0.4.jar
 activation-1.1.1.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
-antlr4-runtime-4.5.2-1.jar
+antlr4-runtime-4.5.3.jar
 aopalliance-1.0.jar
 aopalliance-repackaged-2.4.0-b34.jar
 apache-log4j-extras-1.2.17.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/733cb44e/dev/deps/spark-deps-hadoop-2.6
--
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index a3bee36..01e7551 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -4,7 +4,7 @@ ST4-4.0.4.jar
 activation-1.1.1.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
-antlr4-runtime-4.5.2-1.jar
+antlr4-runtime-4.5.3.jar
 aopalliance-1.0.jar
 aopalliance-repackaged-2.4.0-b34.jar
 apache-log4j-extras-1.2.17.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/733cb44e/dev/deps/spark-deps-hadoop-2.7
--
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index dbd7a8e..402fd05 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -4,7 +4,7 @@ ST4-4.0.4.jar
 activation-1.1.1.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
-antlr4-runtime-4.5.2-1.jar
+antlr4-runtime-4.5.3.jar
 aopalliance-1.0.jar
 aopalliance-repackaged-2.4.0-b34.jar
 apache-log4j-extras-1.2.17.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/733cb44e/pom.xml
--
diff --git a/pom.xml b/pom.xml
index f28aa14..3fa0eeb 100644
--- a/pom.xml
+++ b/pom.xml
@@ -177,7 +177,7 @@
 3.5.2
 1.3.9
 0.9.2
-4.5.2-1
+4.5.3

spark git commit: [SPARK-15533][SQL] Deprecate Dataset.explode

2016-05-25 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/master 527499b62 -> 06ed1fa3e


[SPARK-15533][SQL] Deprecate Dataset.explode

## What changes were proposed in this pull request?

This patch deprecates `Dataset.explode` and documents appropriate workarounds 
to use `flatMap()` or `functions.explode()` instead.
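
For readers migrating existing code, here is a self-contained sketch of the two 
documented workarounds; the sample data, object name, and local-mode session are 
made up for the example:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{countDistinct, explode, split}

// Sample schema, taken from the example documented in the patch itself.
case class Book(title: String, words: String)

object ExplodeMigrationExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("explode-migration")
      .getOrCreate()
    import spark.implicits._

    val ds = Seq(Book("a", "spark sql"), Book("b", "spark core")).toDS()

    // Untyped alternative: generate one row per word with functions.explode().
    val allWords = ds.select($"title", explode(split($"words", " ")).as("word"))
    val bookCountPerWord = allWords.groupBy("word").agg(countDistinct("title"))
    bookCountPerWord.show()

    // Typed alternative: flatMap() directly on the Dataset.
    ds.flatMap(_.words.split(" ")).show()

    spark.stop()
  }
}
```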

## How was this patch tested?

N/A

Author: Sameer Agarwal 

Closes #13312 from sameeragarwal/deprecate.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/06ed1fa3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/06ed1fa3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/06ed1fa3

Branch: refs/heads/master
Commit: 06ed1fa3e45adfc11b0f615cb8b97c99fadc735f
Parents: 527499b
Author: Sameer Agarwal 
Authored: Wed May 25 19:10:57 2016 -0700
Committer: Reynold Xin 
Committed: Wed May 25 19:10:57 2016 -0700

--
 .../scala/org/apache/spark/sql/Dataset.scala| 33 +---
 1 file changed, 22 insertions(+), 11 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/06ed1fa3/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 78a167e..e5140fc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -1556,30 +1556,33 @@ class Dataset[T] private[sql](
   }
 
   /**
-   * :: Experimental ::
* (Scala-specific) Returns a new [[Dataset]] where each row has been 
expanded to zero or more
* rows by the provided function. This is similar to a `LATERAL VIEW` in 
HiveQL. The columns of
* the input row are implicitly joined with each row that is output by the 
function.
*
-   * The following example uses this function to count the number of books 
which contain
-   * a given word:
+   * Given that this is deprecated, as an alternative, you can explode columns 
either using
+   * `functions.explode()` or `flatMap()`. The following example uses these 
alternatives to count
+   * the number of books that contain a given word:
*
* {{{
*   case class Book(title: String, words: String)
*   val ds: Dataset[Book]
*
-   *   case class Word(word: String)
-   *   val allWords = ds.explode('words) {
-   * case Row(words: String) => words.split(" ").map(Word(_))
-   *   }
+   *   val allWords = ds.select('title, explode(split('words, " ")).as("word"))
*
*   val bookCountPerWord = 
allWords.groupBy("word").agg(countDistinct("title"))
* }}}
*
+   * Using `flatMap()` this can similarly be exploded as:
+   *
+   * {{{
+   *   ds.flatMap(_.words.split(" "))
+   * }}}
+   *
* @group untypedrel
* @since 2.0.0
*/
-  @Experimental
+  @deprecated("use flatMap() or select() with functions.explode() instead", 
"2.0.0")
   def explode[A <: Product : TypeTag](input: Column*)(f: Row => 
TraversableOnce[A]): DataFrame = {
 val elementSchema = 
ScalaReflection.schemaFor[A].dataType.asInstanceOf[StructType]
 
@@ -1596,19 +1599,27 @@ class Dataset[T] private[sql](
   }
 
   /**
-   * :: Experimental ::
* (Scala-specific) Returns a new [[Dataset]] where a single column has been 
expanded to zero
* or more rows by the provided function. This is similar to a `LATERAL 
VIEW` in HiveQL. All
* columns of the input row are implicitly joined with each value that is 
output by the function.
*
+   * Given that this is deprecated, as an alternative, you can explode columns 
either using
+   * `functions.explode()`:
+   *
+   * {{{
+   *   ds.select(explode(split('words, " ")).as("word"))
+   * }}}
+   *
+   * or `flatMap()`:
+   *
* {{{
-   *   ds.explode("words", "word") {words: String => words.split(" ")}
+   *   ds.flatMap(_.words.split(" "))
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
-  @Experimental
+  @deprecated("use flatMap() or select() with functions.explode() instead", 
"2.0.0")
   def explode[A, B : TypeTag](inputColumn: String, outputColumn: String)(f: A 
=> TraversableOnce[B])
 : DataFrame = {
 val dataType = ScalaReflection.schemaFor[B].dataType


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-15533][SQL] Deprecate Dataset.explode

2016-05-25 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 733cb44e3 -> 15a2dba66


[SPARK-15533][SQL] Deprecate Dataset.explode

## What changes were proposed in this pull request?

This patch deprecates `Dataset.explode` and documents appropriate workarounds 
to use `flatMap()` or `functions.explode()` instead.

## How was this patch tested?

N/A

Author: Sameer Agarwal 

Closes #13312 from sameeragarwal/deprecate.

(cherry picked from commit 06ed1fa3e45adfc11b0f615cb8b97c99fadc735f)
Signed-off-by: Reynold Xin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/15a2dba6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/15a2dba6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/15a2dba6

Branch: refs/heads/branch-2.0
Commit: 15a2dba6619438c5fb228437eaab50f399839473
Parents: 733cb44
Author: Sameer Agarwal 
Authored: Wed May 25 19:10:57 2016 -0700
Committer: Reynold Xin 
Committed: Wed May 25 19:11:10 2016 -0700

--
 .../scala/org/apache/spark/sql/Dataset.scala| 33 +---
 1 file changed, 22 insertions(+), 11 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/15a2dba6/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 78a167e..e5140fc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -1556,30 +1556,33 @@ class Dataset[T] private[sql](
   }
 
   /**
-   * :: Experimental ::
* (Scala-specific) Returns a new [[Dataset]] where each row has been 
expanded to zero or more
* rows by the provided function. This is similar to a `LATERAL VIEW` in 
HiveQL. The columns of
* the input row are implicitly joined with each row that is output by the 
function.
*
-   * The following example uses this function to count the number of books 
which contain
-   * a given word:
+   * Given that this is deprecated, as an alternative, you can explode columns 
either using
+   * `functions.explode()` or `flatMap()`. The following example uses these 
alternatives to count
+   * the number of books that contain a given word:
*
* {{{
*   case class Book(title: String, words: String)
*   val ds: Dataset[Book]
*
-   *   case class Word(word: String)
-   *   val allWords = ds.explode('words) {
-   * case Row(words: String) => words.split(" ").map(Word(_))
-   *   }
+   *   val allWords = ds.select('title, explode(split('words, " ")).as("word"))
*
*   val bookCountPerWord = 
allWords.groupBy("word").agg(countDistinct("title"))
* }}}
*
+   * Using `flatMap()` this can similarly be exploded as:
+   *
+   * {{{
+   *   ds.flatMap(_.words.split(" "))
+   * }}}
+   *
* @group untypedrel
* @since 2.0.0
*/
-  @Experimental
+  @deprecated("use flatMap() or select() with functions.explode() instead", 
"2.0.0")
   def explode[A <: Product : TypeTag](input: Column*)(f: Row => 
TraversableOnce[A]): DataFrame = {
 val elementSchema = 
ScalaReflection.schemaFor[A].dataType.asInstanceOf[StructType]
 
@@ -1596,19 +1599,27 @@ class Dataset[T] private[sql](
   }
 
   /**
-   * :: Experimental ::
* (Scala-specific) Returns a new [[Dataset]] where a single column has been 
expanded to zero
* or more rows by the provided function. This is similar to a `LATERAL 
VIEW` in HiveQL. All
* columns of the input row are implicitly joined with each value that is 
output by the function.
*
+   * Given that this is deprecated, as an alternative, you can explode columns 
either using
+   * `functions.explode()`:
+   *
+   * {{{
+   *   ds.select(explode(split('words, " ")).as("word"))
+   * }}}
+   *
+   * or `flatMap()`:
+   *
* {{{
-   *   ds.explode("words", "word") {words: String => words.split(" ")}
+   *   ds.flatMap(_.words.split(" "))
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
-  @Experimental
+  @deprecated("use flatMap() or select() with functions.explode() instead", 
"2.0.0")
   def explode[A, B : TypeTag](inputColumn: String, outputColumn: String)(f: A 
=> TraversableOnce[B])
 : DataFrame = {
 val dataType = ScalaReflection.schemaFor[B].dataType


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-15439][SPARKR] Failed to run unit test in SparkR

2016-05-25 Thread shivaram
Repository: spark
Updated Branches:
  refs/heads/master 06ed1fa3e -> 06bae8af1


[SPARK-15439][SPARKR] Failed to run unit test in SparkR

## What changes were proposed in this pull request?

There are some failures when running the SparkR unit tests. In this PR, I fixed 
two of these failures, in test_context.R and test_sparkSQL.R.
The first one is due to differently masked names; I added the missing names to 
the expected arrays.
The second one is because another PR removed the logic of a previous fix for the 
missing `subset` method.

The file permission issue is still there; I am still debugging it. The SparkR 
shell can run the test case successfully:
test_that("pipeRDD() on RDDs", {
  actual <- collect(pipeRDD(rdd, "more"))
When using the run-tests script, it complains that the directory does not exist:
cannot open file '/tmp/Rtmp4FQbah/filee2273f9d47f7': No such file or directory

## How was this patch tested?

Manually tested.

Author: wm...@hotmail.com 

Closes #13284 from wangmiao1981/R.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/06bae8af
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/06bae8af
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/06bae8af

Branch: refs/heads/master
Commit: 06bae8af17d9478c889d206a4556a697b5d629e7
Parents: 06ed1fa
Author: wm...@hotmail.com 
Authored: Wed May 25 21:08:03 2016 -0700
Committer: Shivaram Venkataraman 
Committed: Wed May 25 21:08:03 2016 -0700

--
 R/pkg/R/DataFrame.R  | 6 +-
 R/pkg/inst/tests/testthat/test_context.R | 6 +-
 2 files changed, 10 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/06bae8af/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 0c2a194..f719173 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1445,7 +1445,11 @@ setMethod("[", signature(x = "SparkDataFrame"),
 #' }
 setMethod("subset", signature(x = "SparkDataFrame"),
   function(x, subset, select, drop = F, ...) {
-x[subset, select, drop = drop]
+if (missing(subset)) {
+x[, select, drop = drop, ...]
+} else {
+x[subset, select, drop = drop, ...]
+}
   })
 
 #' Select

http://git-wip-us.apache.org/repos/asf/spark/blob/06bae8af/R/pkg/inst/tests/testthat/test_context.R
--
diff --git a/R/pkg/inst/tests/testthat/test_context.R 
b/R/pkg/inst/tests/testthat/test_context.R
index 0e5e15c..95258ba 100644
--- a/R/pkg/inst/tests/testthat/test_context.R
+++ b/R/pkg/inst/tests/testthat/test_context.R
@@ -27,6 +27,11 @@ test_that("Check masked functions", {
   namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", 
"sd", "var",
  "colnames", "colnames<-", "intersect", "rank", "rbind", 
"sample", "subset",
  "summary", "transform", "drop", "window", "as.data.frame")
+  namesOfMaskedCompletely <- c("cov", "filter", "sample")
+  if (as.numeric(R.version$major) == 3 && as.numeric(R.version$minor) > 2) {
+namesOfMasked <- c("endsWith", "startsWith", namesOfMasked)
+namesOfMaskedCompletely <- c("endsWith", "startsWith", 
namesOfMaskedCompletely)
+  }
   expect_equal(length(maskedBySparkR), length(namesOfMasked))
   expect_equal(sort(maskedBySparkR), sort(namesOfMasked))
   # above are those reported as masked when `library(SparkR)`
@@ -36,7 +41,6 @@ test_that("Check masked functions", {
 any(grepl("=\"ANY\"", 
capture.output(showMethods(x)[-1])))
   }))
   maskedCompletely <- masked[!funcHasAny]
-  namesOfMaskedCompletely <- c("cov", "filter", "sample")
   expect_equal(length(maskedCompletely), length(namesOfMaskedCompletely))
   expect_equal(sort(maskedCompletely), sort(namesOfMaskedCompletely))
 })


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-15439][SPARKR] Failed to run unit test in SparkR

2016-05-25 Thread shivaram
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 15a2dba66 -> bcad1d13f


[SPARK-15439][SPARKR] Failed to run unit test in SparkR

## What changes were proposed in this pull request?

There are some failures when running the SparkR unit tests. In this PR, I fixed 
two of these failures, in test_context.R and test_sparkSQL.R.
The first one is due to differently masked names; I added the missing names to 
the expected arrays.
The second one is because another PR removed the logic of a previous fix for the 
missing `subset` method.

The file permission issue is still there; I am still debugging it. The SparkR 
shell can run the test case successfully:
test_that("pipeRDD() on RDDs", {
  actual <- collect(pipeRDD(rdd, "more"))
When using the run-tests script, it complains that the directory does not exist:
cannot open file '/tmp/Rtmp4FQbah/filee2273f9d47f7': No such file or directory

## How was this patch tested?

Manually tested.

Author: wm...@hotmail.com 

Closes #13284 from wangmiao1981/R.

(cherry picked from commit 06bae8af17d9478c889d206a4556a697b5d629e7)
Signed-off-by: Shivaram Venkataraman 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bcad1d13
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bcad1d13
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bcad1d13

Branch: refs/heads/branch-2.0
Commit: bcad1d13f58a119948e3374072824f70a14a6d34
Parents: 15a2dba
Author: wm...@hotmail.com 
Authored: Wed May 25 21:08:03 2016 -0700
Committer: Shivaram Venkataraman 
Committed: Wed May 25 21:08:17 2016 -0700

--
 R/pkg/R/DataFrame.R  | 6 +-
 R/pkg/inst/tests/testthat/test_context.R | 6 +-
 2 files changed, 10 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/bcad1d13/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 0c2a194..f719173 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1445,7 +1445,11 @@ setMethod("[", signature(x = "SparkDataFrame"),
 #' }
 setMethod("subset", signature(x = "SparkDataFrame"),
   function(x, subset, select, drop = F, ...) {
-x[subset, select, drop = drop]
+if (missing(subset)) {
+x[, select, drop = drop, ...]
+} else {
+x[subset, select, drop = drop, ...]
+}
   })
 
 #' Select

http://git-wip-us.apache.org/repos/asf/spark/blob/bcad1d13/R/pkg/inst/tests/testthat/test_context.R
--
diff --git a/R/pkg/inst/tests/testthat/test_context.R 
b/R/pkg/inst/tests/testthat/test_context.R
index 0e5e15c..95258ba 100644
--- a/R/pkg/inst/tests/testthat/test_context.R
+++ b/R/pkg/inst/tests/testthat/test_context.R
@@ -27,6 +27,11 @@ test_that("Check masked functions", {
   namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", 
"sd", "var",
  "colnames", "colnames<-", "intersect", "rank", "rbind", 
"sample", "subset",
  "summary", "transform", "drop", "window", "as.data.frame")
+  namesOfMaskedCompletely <- c("cov", "filter", "sample")
+  if (as.numeric(R.version$major) == 3 && as.numeric(R.version$minor) > 2) {
+namesOfMasked <- c("endsWith", "startsWith", namesOfMasked)
+namesOfMaskedCompletely <- c("endsWith", "startsWith", 
namesOfMaskedCompletely)
+  }
   expect_equal(length(maskedBySparkR), length(namesOfMasked))
   expect_equal(sort(maskedBySparkR), sort(namesOfMasked))
   # above are those reported as masked when `library(SparkR)`
@@ -36,7 +41,6 @@ test_that("Check masked functions", {
 any(grepl("=\"ANY\"", 
capture.output(showMethods(x)[-1])))
   }))
   maskedCompletely <- masked[!funcHasAny]
-  namesOfMaskedCompletely <- c("cov", "filter", "sample")
   expect_equal(length(maskedCompletely), length(namesOfMaskedCompletely))
   expect_equal(sort(maskedCompletely), sort(namesOfMaskedCompletely))
 })


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-10372] [CORE] basic test framework for entire spark scheduler

2016-05-25 Thread irashid
Repository: spark
Updated Branches:
  refs/heads/master 06bae8af1 -> dfc9fc02c


[SPARK-10372] [CORE] basic test framework for entire spark scheduler

This is a basic framework for testing the entire scheduler. The tests this PR 
adds aren't very interesting -- the point is just to set up the framework and 
keep the initial change small, so it can be built upon to test more features 
(e.g., speculation, killing tasks, blacklisting, etc.).
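
As a rough illustration of what an end-to-end scheduler test exercises, here is 
a heavily simplified local-mode smoke test. It is not the mocked-backend 
SchedulerIntegrationSuite added by this PR, just a sketch of the idea using 
plain ScalaTest and a local SparkContext:

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.FunSuite

// Simplified sketch: run a two-stage job end to end through the real scheduler
// in local mode and check the result, instead of mocking the backend.
class SimpleSchedulerSmokeTest extends FunSuite {
  test("a job runs through the scheduler and returns results") {
    val conf = new SparkConf().setMaster("local[2]").setAppName("scheduler-smoke")
    val sc = new SparkContext(conf)
    try {
      // reduceByKey forces a shuffle, so the scheduler submits a map stage
      // followed by a result stage.
      val counts = sc.parallelize(1 to 100, numSlices = 4)
        .map(i => (i % 10, 1))
        .reduceByKey(_ + _)
        .collectAsMap()
      assert(counts.values.sum === 100)
    } finally {
      sc.stop()
    }
  }
}
```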

Author: Imran Rashid 

Closes #8559 from squito/SPARK-10372-scheduler-integs.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dfc9fc02
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dfc9fc02
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dfc9fc02

Branch: refs/heads/master
Commit: dfc9fc02ccbceb09213c394177d54b9ca56b6f24
Parents: 06bae8a
Author: Imran Rashid 
Authored: Thu May 26 00:29:09 2016 -0500
Committer: Imran Rashid 
Committed: Thu May 26 00:29:09 2016 -0500

--
 .../apache/spark/scheduler/DAGScheduler.scala   |   2 +-
 .../org/apache/spark/scheduler/TaskResult.scala |   2 +-
 ...pache.spark.scheduler.ExternalClusterManager |   3 +-
 .../scheduler/BlacklistIntegrationSuite.scala   | 130 +
 .../spark/scheduler/DAGSchedulerSuite.scala |  15 +-
 .../scheduler/ExternalClusterManagerSuite.scala |  25 +-
 .../scheduler/SchedulerIntegrationSuite.scala   | 564 +++
 7 files changed, 728 insertions(+), 13 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/dfc9fc02/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
--
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala 
b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index 766e979..a2eadbc 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -1396,7 +1396,7 @@ class DAGScheduler(
   stage.clearFailures()
 } else {
   stage.latestInfo.stageFailed(errorMessage.get)
-  logInfo("%s (%s) failed in %s s".format(stage, stage.name, serviceTime))
+  logInfo(s"$stage (${stage.name}) failed in $serviceTime s due to 
${errorMessage.get}")
 }
 
 outputCommitCoordinator.stageEnd(stage.id)

http://git-wip-us.apache.org/repos/asf/spark/blob/dfc9fc02/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala
--
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala 
b/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala
index 80f2bf4..77fda6f 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala
@@ -59,7 +59,7 @@ private[spark] class DirectTaskResult[T](
 
 val numUpdates = in.readInt
 if (numUpdates == 0) {
-  accumUpdates = null
+  accumUpdates = Seq()
 } else {
   val _accumUpdates = new ArrayBuffer[AccumulatorV2[_, _]]
   for (i <- 0 until numUpdates) {

http://git-wip-us.apache.org/repos/asf/spark/blob/dfc9fc02/core/src/test/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager
--
diff --git 
a/core/src/test/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager
 
b/core/src/test/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager
index 3c570ff..757c6d2 100644
--- 
a/core/src/test/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager
+++ 
b/core/src/test/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager
@@ -1 +1,2 @@
-org.apache.spark.scheduler.DummyExternalClusterManager
\ No newline at end of file
+org.apache.spark.scheduler.DummyExternalClusterManager
+org.apache.spark.scheduler.MockExternalClusterManager
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/dfc9fc02/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala
--
diff --git 
a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala
 
b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala
new file mode 100644
index 000..6c9d4fb
--- /dev/null
+++ 
b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The AS

[4/4] spark git commit: [SPARK-15543][SQL] Rename DefaultSources to make them more self-describing

2016-05-25 Thread rxin
[SPARK-15543][SQL] Rename DefaultSources to make them more self-describing

## What changes were proposed in this pull request?
This patch renames various DefaultSources to make their names more 
self-describing. The choice of "DefaultSource" was from the days when we did 
not have a good way to specify short names.

They are now named:
- LibSVMFileFormat
- CSVFileFormat
- JdbcRelationProvider
- JsonFileFormat
- ParquetFileFormat
- TextFileFormat

Backward compatibility is maintained through aliasing.
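
To illustrate why the rename is invisible to users, here is a small sketch of 
how data sources are referenced from user code; the input path is a placeholder, 
and the explicit class-name form is shown only to illustrate the lookup 
mechanism:

```scala
import org.apache.spark.sql.SparkSession

object DataSourceNameExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("datasource-names")
      .getOrCreate()

    // Short-name resolution is unchanged: "text" now resolves to TextFileFormat.
    val viaShortName = spark.read.format("text").load("/tmp/example.txt")

    // A fully qualified class name can also be passed to format(); with this
    // patch the new class names apply, e.g. TextFileFormat instead of DefaultSource.
    val viaClassName = spark.read
      .format("org.apache.spark.sql.execution.datasources.text.TextFileFormat")
      .load("/tmp/example.txt")

    viaShortName.show()
    viaClassName.show()
    spark.stop()
  }
}
```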

## How was this patch tested?
Updated relevant test cases too.

Author: Reynold Xin 

Closes #13311 from rxin/SPARK-15543.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/361ebc28
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/361ebc28
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/361ebc28

Branch: refs/heads/master
Commit: 361ebc282b2d09dc6dcf21419a53c5c617b1b6bd
Parents: dfc9fc0
Author: Reynold Xin 
Authored: Wed May 25 23:54:24 2016 -0700
Committer: Reynold Xin 
Committed: Wed May 25 23:54:24 2016 -0700

--
 apache.spark.sql.sources.DataSourceRegister |   2 +-
 .../spark/ml/source/libsvm/LibSVMRelation.scala |   8 +-
 project/MimaExcludes.scala  |   4 +-
 apache.spark.sql.sources.DataSourceRegister |  10 +-
 .../spark/sql/execution/ExistingRDD.scala   |   2 +-
 .../sql/execution/datasources/DataSource.scala  |  45 +-
 .../datasources/csv/CSVFileFormat.scala | 185 
 .../datasources/csv/DefaultSource.scala | 187 
 .../datasources/jdbc/DefaultSource.scala|  55 --
 .../datasources/jdbc/JdbcRelationProvider.scala |  55 ++
 .../datasources/json/JSONRelation.scala | 198 
 .../datasources/json/JsonFileFormat.scala   | 198 
 .../datasources/parquet/ParquetFileFormat.scala | 923 +++
 .../datasources/parquet/ParquetRelation.scala   | 923 ---
 .../datasources/text/DefaultSource.scala| 141 ---
 .../datasources/text/TextFileFormat.scala   | 141 +++
 .../execution/datasources/json/JsonSuite.scala  |   4 +-
 .../parquet/ParquetSchemaSuite.scala|  12 +-
 .../sql/sources/ResolvedDataSourceSuite.scala   |  18 +-
 .../streaming/DataFrameReaderWriterSuite.scala  | 542 ---
 .../sql/streaming/FileStreamSinkSuite.scala |   4 +-
 .../test/DataFrameReaderWriterSuite.scala   | 541 +++
 apache.spark.sql.sources.DataSourceRegister |   2 +-
 .../spark/sql/hive/HiveMetastoreCatalog.scala   |  18 +-
 .../spark/sql/hive/orc/OrcFileFormat.scala  | 375 
 .../apache/spark/sql/hive/orc/OrcRelation.scala | 371 
 .../sql/hive/orc/OrcHadoopFsRelationSuite.scala |   2 +-
 27 files changed, 2498 insertions(+), 2468 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/361ebc28/mllib/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
--
diff --git 
a/mllib/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
 
b/mllib/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
index f632dd6..a865cbe 100644
--- 
a/mllib/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
+++ 
b/mllib/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
@@ -1 +1 @@
-org.apache.spark.ml.source.libsvm.DefaultSource
+org.apache.spark.ml.source.libsvm.LibSVMFileFormat

http://git-wip-us.apache.org/repos/asf/spark/blob/361ebc28/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala 
b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
index 5ba768d..64ebf0c 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
@@ -90,7 +90,7 @@ private[libsvm] class LibSVMOutputWriter(
  * .load("data/mllib/sample_libsvm_data.txt")
  *
  *   // Java
- *   DataFrame df = spark.read().format("libsvm")
+ *   Dataset df = spark.read().format("libsvm")
  * .option("numFeatures, "780")
  * .load("data/mllib/sample_libsvm_data.txt");
  * }}}
@@ -105,9 +105,13 @@ private[libsvm] class LibSVMOutputWriter(
  *  - "vectorType": feature vector type, "sparse" (default) or "dense".
  *
  *  @see [[https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/ LIBSVM 
datasets]]
+ *
+ * Note that this class is public for documentation purpose. Please don't use 
this class directly.
+ * Rather, use the data source API as illustrated abo

[2/4] spark git commit: [SPARK-15543][SQL] Rename DefaultSources to make them more self-describing

2016-05-25 Thread rxin
http://git-wip-us.apache.org/repos/asf/spark/blob/361ebc28/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/DefaultSource.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/DefaultSource.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/DefaultSource.scala
deleted file mode 100644
index f091615..000
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/DefaultSource.scala
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.execution.datasources.text
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileStatus, Path}
-import org.apache.hadoop.io.{NullWritable, Text}
-import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
-
-import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.UnsafeRow
-import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, 
UnsafeRowWriter}
-import org.apache.spark.sql.execution.datasources._
-import org.apache.spark.sql.sources._
-import org.apache.spark.sql.types.{StringType, StructType}
-import org.apache.spark.util.SerializableConfiguration
-
-/**
- * A data source for reading text files.
- */
-class DefaultSource extends FileFormat with DataSourceRegister {
-
-  override def shortName(): String = "text"
-
-  private def verifySchema(schema: StructType): Unit = {
-if (schema.size != 1) {
-  throw new AnalysisException(
-s"Text data source supports only a single column, and you have 
${schema.size} columns.")
-}
-val tpe = schema(0).dataType
-if (tpe != StringType) {
-  throw new AnalysisException(
-s"Text data source supports only a string column, but you have 
${tpe.simpleString}.")
-}
-  }
-
-  override def inferSchema(
-  sparkSession: SparkSession,
-  options: Map[String, String],
-  files: Seq[FileStatus]): Option[StructType] = Some(new 
StructType().add("value", StringType))
-
-  override def prepareWrite(
-  sparkSession: SparkSession,
-  job: Job,
-  options: Map[String, String],
-  dataSchema: StructType): OutputWriterFactory = {
-verifySchema(dataSchema)
-
-val conf = job.getConfiguration
-val compressionCodec = 
options.get("compression").map(CompressionCodecs.getCodecClassName)
-compressionCodec.foreach { codec =>
-  CompressionCodecs.setCodecConfiguration(conf, codec)
-}
-
-new OutputWriterFactory {
-  override def newInstance(
-  path: String,
-  bucketId: Option[Int],
-  dataSchema: StructType,
-  context: TaskAttemptContext): OutputWriter = {
-if (bucketId.isDefined) {
-  throw new AnalysisException("Text doesn't support bucketing")
-}
-new TextOutputWriter(path, dataSchema, context)
-  }
-}
-  }
-
-  override def buildReader(
-  sparkSession: SparkSession,
-  dataSchema: StructType,
-  partitionSchema: StructType,
-  requiredSchema: StructType,
-  filters: Seq[Filter],
-  options: Map[String, String],
-  hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
-val broadcastedHadoopConf =
-  sparkSession.sparkContext.broadcast(new 
SerializableConfiguration(hadoopConf))
-
-(file: PartitionedFile) => {
-  val unsafeRow = new UnsafeRow(1)
-  val bufferHolder = new BufferHolder(unsafeRow)
-  val unsafeRowWriter = new UnsafeRowWriter(bufferHolder, 1)
-
-  new HadoopFileLinesReader(file, broadcastedHadoopConf.value.value).map { 
line =>
-// Writes to an UnsafeRow directly
-bufferHolder.reset()
-unsafeRowWriter.write(0, line.getBytes, 0, line.getLength)
-unsafeRow.setTotalSize(bufferHolder.totalSize())
-unsafeRow
-  }
-}
-  }
-}
-
-class TextOutputWriter(path: String, dataSchema: StructType, context: 
TaskAttemptContext)
-  extends OutputWrit

[4/4] spark git commit: [SPARK-15543][SQL] Rename DefaultSources to make them more self-describing

2016-05-25 Thread rxin
[SPARK-15543][SQL] Rename DefaultSources to make them more self-describing

## What changes were proposed in this pull request?
This patch renames various DefaultSources to make their names more 
self-describing. The choice of "DefaultSource" was from the days when we did 
not have a good way to specify short names.

They are now named:
- LibSVMFileFormat
- CSVFileFormat
- JdbcRelationProvider
- JsonFileFormat
- ParquetFileFormat
- TextFileFormat

Backward compatibility is maintained through aliasing.

## How was this patch tested?
Updated relevant test cases too.

Author: Reynold Xin 

Closes #13311 from rxin/SPARK-15543.

(cherry picked from commit 361ebc282b2d09dc6dcf21419a53c5c617b1b6bd)
Signed-off-by: Reynold Xin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b3ee53b8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b3ee53b8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b3ee53b8

Branch: refs/heads/branch-2.0
Commit: b3ee53b8424bf7be4f3111744e07a6100a01e4da
Parents: bcad1d1
Author: Reynold Xin 
Authored: Wed May 25 23:54:24 2016 -0700
Committer: Reynold Xin 
Committed: Wed May 25 23:54:30 2016 -0700

--
 apache.spark.sql.sources.DataSourceRegister |   2 +-
 .../spark/ml/source/libsvm/LibSVMRelation.scala |   8 +-
 project/MimaExcludes.scala  |   4 +-
 apache.spark.sql.sources.DataSourceRegister |  10 +-
 .../spark/sql/execution/ExistingRDD.scala   |   2 +-
 .../sql/execution/datasources/DataSource.scala  |  45 +-
 .../datasources/csv/CSVFileFormat.scala | 185 
 .../datasources/csv/DefaultSource.scala | 187 
 .../datasources/jdbc/DefaultSource.scala|  55 --
 .../datasources/jdbc/JdbcRelationProvider.scala |  55 ++
 .../datasources/json/JSONRelation.scala | 198 
 .../datasources/json/JsonFileFormat.scala   | 198 
 .../datasources/parquet/ParquetFileFormat.scala | 923 +++
 .../datasources/parquet/ParquetRelation.scala   | 923 ---
 .../datasources/text/DefaultSource.scala| 141 ---
 .../datasources/text/TextFileFormat.scala   | 141 +++
 .../execution/datasources/json/JsonSuite.scala  |   4 +-
 .../parquet/ParquetSchemaSuite.scala|  12 +-
 .../sql/sources/ResolvedDataSourceSuite.scala   |  18 +-
 .../streaming/DataFrameReaderWriterSuite.scala  | 542 ---
 .../sql/streaming/FileStreamSinkSuite.scala |   4 +-
 .../test/DataFrameReaderWriterSuite.scala   | 541 +++
 apache.spark.sql.sources.DataSourceRegister |   2 +-
 .../spark/sql/hive/HiveMetastoreCatalog.scala   |  18 +-
 .../spark/sql/hive/orc/OrcFileFormat.scala  | 375 
 .../apache/spark/sql/hive/orc/OrcRelation.scala | 371 
 .../sql/hive/orc/OrcHadoopFsRelationSuite.scala |   2 +-
 27 files changed, 2498 insertions(+), 2468 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b3ee53b8/mllib/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
--
diff --git 
a/mllib/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
 
b/mllib/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
index f632dd6..a865cbe 100644
--- 
a/mllib/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
+++ 
b/mllib/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
@@ -1 +1 @@
-org.apache.spark.ml.source.libsvm.DefaultSource
+org.apache.spark.ml.source.libsvm.LibSVMFileFormat

http://git-wip-us.apache.org/repos/asf/spark/blob/b3ee53b8/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala 
b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
index 5ba768d..64ebf0c 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
@@ -90,7 +90,7 @@ private[libsvm] class LibSVMOutputWriter(
  * .load("data/mllib/sample_libsvm_data.txt")
  *
  *   // Java
- *   DataFrame df = spark.read().format("libsvm")
+ *   Dataset df = spark.read().format("libsvm")
  * .option("numFeatures, "780")
  * .load("data/mllib/sample_libsvm_data.txt");
  * }}}
@@ -105,9 +105,13 @@ private[libsvm] class LibSVMOutputWriter(
  *  - "vectorType": feature vector type, "sparse" (default) or "dense".
  *
  *  @see [[https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/ LIBSVM 
datasets]]
+ *
+ * Note that this class is public for documentation 

[3/4] spark git commit: [SPARK-15543][SQL] Rename DefaultSources to make them more self-describing

2016-05-25 Thread rxin
http://git-wip-us.apache.org/repos/asf/spark/blob/b3ee53b8/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
new file mode 100644
index 000..b47d41e
--- /dev/null
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
@@ -0,0 +1,923 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources.parquet
+
+import java.net.URI
+import java.util.logging.{Logger => JLogger}
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable
+import scala.util.{Failure, Try}
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileStatus, Path}
+import org.apache.hadoop.mapreduce._
+import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
+import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
+import org.apache.parquet.{Log => ApacheParquetLog}
+import org.apache.parquet.filter2.compat.FilterCompat
+import org.apache.parquet.filter2.predicate.FilterApi
+import org.apache.parquet.hadoop._
+import org.apache.parquet.hadoop.util.ContextUtil
+import org.apache.parquet.schema.MessageType
+import org.slf4j.bridge.SLF4JBridgeHandler
+
+import org.apache.spark.SparkException
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql._
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions._
+import 
org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
+import org.apache.spark.sql.catalyst.parser.LegacyTypeStringParser
+import org.apache.spark.sql.execution.datasources._
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.sources._
+import org.apache.spark.sql.types._
+import org.apache.spark.util.SerializableConfiguration
+
+private[sql] class ParquetFileFormat
+  extends FileFormat
+  with DataSourceRegister
+  with Logging
+  with Serializable {
+
+  override def shortName(): String = "parquet"
+
+  override def toString: String = "ParquetFormat"
+
+  override def hashCode(): Int = getClass.hashCode()
+
+  override def equals(other: Any): Boolean = 
other.isInstanceOf[ParquetFileFormat]
+
+  override def prepareWrite(
+  sparkSession: SparkSession,
+  job: Job,
+  options: Map[String, String],
+  dataSchema: StructType): OutputWriterFactory = {
+
+val parquetOptions = new ParquetOptions(options, 
sparkSession.sessionState.conf)
+
+val conf = ContextUtil.getConfiguration(job)
+
+val committerClass =
+  conf.getClass(
+SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key,
+classOf[ParquetOutputCommitter],
+classOf[ParquetOutputCommitter])
+
+if (conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) == null) {
+  logInfo("Using default output committer for Parquet: " +
+classOf[ParquetOutputCommitter].getCanonicalName)
+} else {
+  logInfo("Using user defined output committer for Parquet: " + 
committerClass.getCanonicalName)
+}
+
+conf.setClass(
+  SQLConf.OUTPUT_COMMITTER_CLASS.key,
+  committerClass,
+  classOf[ParquetOutputCommitter])
+
+// We're not really using `ParquetOutputFormat[Row]` for writing data 
here, because we override
+// it in `ParquetOutputWriter` to support appending and dynamic 
partitioning.  The reason why
+// we set it here is to setup the output committer class to 
`ParquetOutputCommitter`, which is
+// bundled with `ParquetOutputFormat[Row]`.
+job.setOutputFormatClass(classOf[ParquetOutputFormat[Row]])
+
+ParquetOutputFormat.setWriteSupportClass(job, 
classOf[CatalystWriteSupport])
+
+// We want to clear this temporary metadata from saving into Parquet file.
+// This metadata is only useful for detecting optional columns when 
pushdowning filters.
+val dataSchemaToWrite = 
StructType.removeMetadata(StructType.metadataKeyForOptionalFi

[2/4] spark git commit: [SPARK-15543][SQL] Rename DefaultSources to make them more self-describing

2016-05-25 Thread rxin
http://git-wip-us.apache.org/repos/asf/spark/blob/b3ee53b8/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/DefaultSource.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/DefaultSource.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/DefaultSource.scala
deleted file mode 100644
index f091615..000
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/DefaultSource.scala
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.execution.datasources.text
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileStatus, Path}
-import org.apache.hadoop.io.{NullWritable, Text}
-import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
-
-import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.UnsafeRow
-import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, 
UnsafeRowWriter}
-import org.apache.spark.sql.execution.datasources._
-import org.apache.spark.sql.sources._
-import org.apache.spark.sql.types.{StringType, StructType}
-import org.apache.spark.util.SerializableConfiguration
-
-/**
- * A data source for reading text files.
- */
-class DefaultSource extends FileFormat with DataSourceRegister {
-
-  override def shortName(): String = "text"
-
-  private def verifySchema(schema: StructType): Unit = {
-if (schema.size != 1) {
-  throw new AnalysisException(
-s"Text data source supports only a single column, and you have 
${schema.size} columns.")
-}
-val tpe = schema(0).dataType
-if (tpe != StringType) {
-  throw new AnalysisException(
-s"Text data source supports only a string column, but you have 
${tpe.simpleString}.")
-}
-  }
-
-  override def inferSchema(
-  sparkSession: SparkSession,
-  options: Map[String, String],
-  files: Seq[FileStatus]): Option[StructType] = Some(new 
StructType().add("value", StringType))
-
-  override def prepareWrite(
-  sparkSession: SparkSession,
-  job: Job,
-  options: Map[String, String],
-  dataSchema: StructType): OutputWriterFactory = {
-verifySchema(dataSchema)
-
-val conf = job.getConfiguration
-val compressionCodec = 
options.get("compression").map(CompressionCodecs.getCodecClassName)
-compressionCodec.foreach { codec =>
-  CompressionCodecs.setCodecConfiguration(conf, codec)
-}
-
-new OutputWriterFactory {
-  override def newInstance(
-  path: String,
-  bucketId: Option[Int],
-  dataSchema: StructType,
-  context: TaskAttemptContext): OutputWriter = {
-if (bucketId.isDefined) {
-  throw new AnalysisException("Text doesn't support bucketing")
-}
-new TextOutputWriter(path, dataSchema, context)
-  }
-}
-  }
-
-  override def buildReader(
-  sparkSession: SparkSession,
-  dataSchema: StructType,
-  partitionSchema: StructType,
-  requiredSchema: StructType,
-  filters: Seq[Filter],
-  options: Map[String, String],
-  hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
-val broadcastedHadoopConf =
-  sparkSession.sparkContext.broadcast(new 
SerializableConfiguration(hadoopConf))
-
-(file: PartitionedFile) => {
-  val unsafeRow = new UnsafeRow(1)
-  val bufferHolder = new BufferHolder(unsafeRow)
-  val unsafeRowWriter = new UnsafeRowWriter(bufferHolder, 1)
-
-  new HadoopFileLinesReader(file, broadcastedHadoopConf.value.value).map { 
line =>
-// Writes to an UnsafeRow directly
-bufferHolder.reset()
-unsafeRowWriter.write(0, line.getBytes, 0, line.getLength)
-unsafeRow.setTotalSize(bufferHolder.totalSize())
-unsafeRow
-  }
-}
-  }
-}
-
-class TextOutputWriter(path: String, dataSchema: StructType, context: 
TaskAttemptContext)
-  extends OutputWrit

[1/4] spark git commit: [SPARK-15543][SQL] Rename DefaultSources to make them more self-describing

2016-05-25 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 bcad1d13f -> b3ee53b84


http://git-wip-us.apache.org/repos/asf/spark/blob/b3ee53b8/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
--
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
deleted file mode 100644
index 38f50c1..000
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
+++ /dev/null
@@ -1,371 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.hive.orc
-
-import java.net.URI
-import java.util.Properties
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileStatus, Path}
-import org.apache.hadoop.hive.conf.HiveConf.ConfVars
-import org.apache.hadoop.hive.ql.io.orc._
-import 
org.apache.hadoop.hive.serde2.objectinspector.{SettableStructObjectInspector, 
StructObjectInspector}
-import org.apache.hadoop.hive.serde2.typeinfo.{StructTypeInfo, TypeInfoUtils}
-import org.apache.hadoop.io.{NullWritable, Writable}
-import org.apache.hadoop.mapred.{InputFormat => MapRedInputFormat, JobConf, 
OutputFormat => MapRedOutputFormat, RecordWriter, Reporter}
-import org.apache.hadoop.mapreduce._
-import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
-
-import org.apache.spark.internal.Logging
-import org.apache.spark.rdd.{HadoopRDD, RDD}
-import org.apache.spark.sql.{Row, SparkSession}
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.execution.datasources._
-import org.apache.spark.sql.hive.{HiveInspectors, HiveShim}
-import org.apache.spark.sql.sources.{Filter, _}
-import org.apache.spark.sql.types.StructType
-import org.apache.spark.util.SerializableConfiguration
-
-private[sql] class DefaultSource
-  extends FileFormat with DataSourceRegister with Serializable {
-
-  override def shortName(): String = "orc"
-
-  override def toString: String = "ORC"
-
-  override def inferSchema(
-  sparkSession: SparkSession,
-  options: Map[String, String],
-  files: Seq[FileStatus]): Option[StructType] = {
-OrcFileOperator.readSchema(
-  files.map(_.getPath.toUri.toString),
-  Some(sparkSession.sessionState.newHadoopConf())
-)
-  }
-
-  override def prepareWrite(
-  sparkSession: SparkSession,
-  job: Job,
-  options: Map[String, String],
-  dataSchema: StructType): OutputWriterFactory = {
-val orcOptions = new OrcOptions(options)
-
-val configuration = job.getConfiguration
-
-configuration.set(OrcRelation.ORC_COMPRESSION, orcOptions.compressionCodec)
-configuration match {
-  case conf: JobConf =>
-conf.setOutputFormat(classOf[OrcOutputFormat])
-  case conf =>
-conf.setClass(
-  "mapred.output.format.class",
-  classOf[OrcOutputFormat],
-  classOf[MapRedOutputFormat[_, _]])
-}
-
-new OutputWriterFactory {
-  override def newInstance(
-  path: String,
-  bucketId: Option[Int],
-  dataSchema: StructType,
-  context: TaskAttemptContext): OutputWriter = {
-new OrcOutputWriter(path, bucketId, dataSchema, context)
-  }
-}
-  }
-
-  override def buildReader(
-  sparkSession: SparkSession,
-  dataSchema: StructType,
-  partitionSchema: StructType,
-  requiredSchema: StructType,
-  filters: Seq[Filter],
-  options: Map[String, String],
-  hadoopConf: Configuration): (PartitionedFile) => Iterator[InternalRow] = 
{
-if (sparkSession.sessionState.conf.orcFilterPushDown) {
-  // Sets pushed predicates
-  OrcFilters.createFilter(requiredSchema, filters.toArray).foreach { f =>
-hadoopConf.set(OrcTableScan.SARG_PUSHDOWN, f.toKryo)
-hadoopConf.setBoolean(ConfVars.HIVEOPTINDEXFILTER.varname, true)
-  }
-}
-
-val broadcastedHadoopConf =
-  sparkSession.sparkContext.broadcast(new 
SerializableConfiguration(hadoopConf))
-
-(file: PartitionedFile) => {
-  val conf = broadcastedHadoopConf.value.value

[3/4] spark git commit: [SPARK-15543][SQL] Rename DefaultSources to make them more self-describing

2016-05-25 Thread rxin
http://git-wip-us.apache.org/repos/asf/spark/blob/361ebc28/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
new file mode 100644
index 000..b47d41e
--- /dev/null
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
@@ -0,0 +1,923 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources.parquet
+
+import java.net.URI
+import java.util.logging.{Logger => JLogger}
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable
+import scala.util.{Failure, Try}
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileStatus, Path}
+import org.apache.hadoop.mapreduce._
+import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
+import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
+import org.apache.parquet.{Log => ApacheParquetLog}
+import org.apache.parquet.filter2.compat.FilterCompat
+import org.apache.parquet.filter2.predicate.FilterApi
+import org.apache.parquet.hadoop._
+import org.apache.parquet.hadoop.util.ContextUtil
+import org.apache.parquet.schema.MessageType
+import org.slf4j.bridge.SLF4JBridgeHandler
+
+import org.apache.spark.SparkException
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql._
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions._
+import 
org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
+import org.apache.spark.sql.catalyst.parser.LegacyTypeStringParser
+import org.apache.spark.sql.execution.datasources._
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.sources._
+import org.apache.spark.sql.types._
+import org.apache.spark.util.SerializableConfiguration
+
+private[sql] class ParquetFileFormat
+  extends FileFormat
+  with DataSourceRegister
+  with Logging
+  with Serializable {
+
+  override def shortName(): String = "parquet"
+
+  override def toString: String = "ParquetFormat"
+
+  override def hashCode(): Int = getClass.hashCode()
+
+  override def equals(other: Any): Boolean = 
other.isInstanceOf[ParquetFileFormat]
+
+  override def prepareWrite(
+  sparkSession: SparkSession,
+  job: Job,
+  options: Map[String, String],
+  dataSchema: StructType): OutputWriterFactory = {
+
+val parquetOptions = new ParquetOptions(options, 
sparkSession.sessionState.conf)
+
+val conf = ContextUtil.getConfiguration(job)
+
+val committerClass =
+  conf.getClass(
+SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key,
+classOf[ParquetOutputCommitter],
+classOf[ParquetOutputCommitter])
+
+if (conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) == null) {
+  logInfo("Using default output committer for Parquet: " +
+classOf[ParquetOutputCommitter].getCanonicalName)
+} else {
+  logInfo("Using user defined output committer for Parquet: " + 
committerClass.getCanonicalName)
+}
+
+conf.setClass(
+  SQLConf.OUTPUT_COMMITTER_CLASS.key,
+  committerClass,
+  classOf[ParquetOutputCommitter])
+
+// We're not really using `ParquetOutputFormat[Row]` for writing data 
here, because we override
+// it in `ParquetOutputWriter` to support appending and dynamic 
partitioning.  The reason why
+// we set it here is to setup the output committer class to 
`ParquetOutputCommitter`, which is
+// bundled with `ParquetOutputFormat[Row]`.
+job.setOutputFormatClass(classOf[ParquetOutputFormat[Row]])
+
+ParquetOutputFormat.setWriteSupportClass(job, 
classOf[CatalystWriteSupport])
+
+// We want to clear this temporary metadata from saving into Parquet file.
+// This metadata is only useful for detecting optional columns when 
pushdowning filters.
+val dataSchemaToWrite = 
StructType.removeMetadata(StructType.metadataKeyForOptionalFi

[1/4] spark git commit: [SPARK-15543][SQL] Rename DefaultSources to make them more self-describing

2016-05-25 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/master dfc9fc02c -> 361ebc282


http://git-wip-us.apache.org/repos/asf/spark/blob/361ebc28/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
--
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
deleted file mode 100644
index 38f50c1..000
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
+++ /dev/null
@@ -1,371 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.hive.orc
-
-import java.net.URI
-import java.util.Properties
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileStatus, Path}
-import org.apache.hadoop.hive.conf.HiveConf.ConfVars
-import org.apache.hadoop.hive.ql.io.orc._
-import org.apache.hadoop.hive.serde2.objectinspector.{SettableStructObjectInspector, StructObjectInspector}
-import org.apache.hadoop.hive.serde2.typeinfo.{StructTypeInfo, TypeInfoUtils}
-import org.apache.hadoop.io.{NullWritable, Writable}
-import org.apache.hadoop.mapred.{InputFormat => MapRedInputFormat, JobConf, OutputFormat => MapRedOutputFormat, RecordWriter, Reporter}
-import org.apache.hadoop.mapreduce._
-import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
-
-import org.apache.spark.internal.Logging
-import org.apache.spark.rdd.{HadoopRDD, RDD}
-import org.apache.spark.sql.{Row, SparkSession}
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.execution.datasources._
-import org.apache.spark.sql.hive.{HiveInspectors, HiveShim}
-import org.apache.spark.sql.sources.{Filter, _}
-import org.apache.spark.sql.types.StructType
-import org.apache.spark.util.SerializableConfiguration
-
-private[sql] class DefaultSource
-  extends FileFormat with DataSourceRegister with Serializable {
-
-  override def shortName(): String = "orc"
-
-  override def toString: String = "ORC"
-
-  override def inferSchema(
-  sparkSession: SparkSession,
-  options: Map[String, String],
-  files: Seq[FileStatus]): Option[StructType] = {
-OrcFileOperator.readSchema(
-  files.map(_.getPath.toUri.toString),
-  Some(sparkSession.sessionState.newHadoopConf())
-)
-  }
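For context, inferSchema above backs schema discovery when ORC data is loaded without an explicit schema. A minimal read-side sketch, assuming a SparkSession named `spark` and a hypothetical path:

    // The schema is read from the ORC files themselves via OrcFileOperator.readSchema.
    val orcDf = spark.read.orc("/tmp/example/orc-input")
    orcDf.printSchema()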
-
-  override def prepareWrite(
-  sparkSession: SparkSession,
-  job: Job,
-  options: Map[String, String],
-  dataSchema: StructType): OutputWriterFactory = {
-val orcOptions = new OrcOptions(options)
-
-val configuration = job.getConfiguration
-
-configuration.set(OrcRelation.ORC_COMPRESSION, orcOptions.compressionCodec)
-configuration match {
-  case conf: JobConf =>
-conf.setOutputFormat(classOf[OrcOutputFormat])
-  case conf =>
-conf.setClass(
-  "mapred.output.format.class",
-  classOf[OrcOutputFormat],
-  classOf[MapRedOutputFormat[_, _]])
-}
-
-new OutputWriterFactory {
-  override def newInstance(
-  path: String,
-  bucketId: Option[Int],
-  dataSchema: StructType,
-  context: TaskAttemptContext): OutputWriter = {
-new OrcOutputWriter(path, bucketId, dataSchema, context)
-  }
-}
-  }
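The OrcOptions handling in prepareWrite above maps the user-facing compression option onto OrcRelation.ORC_COMPRESSION. A minimal write-side sketch, assuming an existing DataFrame `df` and a hypothetical output path:

    // "compression" is interpreted by OrcOptions; "snappy" is one of the supported codecs.
    df.write
      .format("orc")
      .option("compression", "snappy")
      .save("/tmp/example/orc-output")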
-
-  override def buildReader(
-  sparkSession: SparkSession,
-  dataSchema: StructType,
-  partitionSchema: StructType,
-  requiredSchema: StructType,
-  filters: Seq[Filter],
-  options: Map[String, String],
-  hadoopConf: Configuration): (PartitionedFile) => Iterator[InternalRow] = {
-if (sparkSession.sessionState.conf.orcFilterPushDown) {
-  // Sets pushed predicates
-  OrcFilters.createFilter(requiredSchema, filters.toArray).foreach { f =>
-hadoopConf.set(OrcTableScan.SARG_PUSHDOWN, f.toKryo)
-hadoopConf.setBoolean(ConfVars.HIVEOPTINDEXFILTER.varname, true)
-  }
-}
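The block above only builds and installs a SearchArgument when spark.sql.orc.filterPushdown is enabled. A minimal sketch of exercising that path, assuming a SparkSession named `spark` and a hypothetical input path:

    // Turn on ORC predicate pushdown so eligible filters reach the ORC reader.
    spark.conf.set("spark.sql.orc.filterPushdown", "true")
    val filtered = spark.read.orc("/tmp/example/orc-input").filter("id > 10")
    filtered.count()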
-
-val broadcastedHadoopConf =
-  sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf))
-
-(file: PartitionedFile) => {
-  val conf = broadcastedHadoopConf.value.value
-
-