spark git commit: [SPARK-9814] [SQL] EqualNotNull not passing to data sources
Repository: spark Updated Branches: refs/heads/branch-1.5 e9d1eab92 -> eead87ef2 [SPARK-9814] [SQL] EqualNotNull not passing to data sources Author: hyukjinkwon gurwls...@gmail.com Author: 권혁진 gurwls...@gmail.com Closes #8096 from HyukjinKwon/master. (cherry picked from commit 00c02728a6c6c4282c389ca90641dd78dd5e3d32) Signed-off-by: Reynold Xin r...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eead87ef Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eead87ef Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eead87ef Branch: refs/heads/branch-1.5 Commit: eead87ef2459a8c1d5257ea0b22526a76ddf1f69 Parents: e9d1eab Author: hyukjinkwon gurwls...@gmail.com Authored: Tue Aug 11 14:04:09 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Tue Aug 11 14:04:23 2015 -0700 -- .../sql/execution/datasources/DataSourceStrategy.scala | 5 + .../main/scala/org/apache/spark/sql/sources/filters.scala | 9 + .../org/apache/spark/sql/sources/FilteredScanSuite.scala| 1 + 3 files changed, 15 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/eead87ef/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 78a4acd..2a4c40d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -349,6 +349,11 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { case expressions.EqualTo(Literal(v, _), a: Attribute) => Some(sources.EqualTo(a.name, v)) + case expressions.EqualNullSafe(a: Attribute, Literal(v, _)) => +Some(sources.EqualNullSafe(a.name, v)) + case expressions.EqualNullSafe(Literal(v, _), a: Attribute) => +Some(sources.EqualNullSafe(a.name, v)) + case expressions.GreaterThan(a: Attribute, Literal(v, _)) => Some(sources.GreaterThan(a.name, v)) case expressions.GreaterThan(Literal(v, _), a: Attribute) => http://git-wip-us.apache.org/repos/asf/spark/blob/eead87ef/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala index 4d942e4..3780cbb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala @@ -37,6 +37,15 @@ abstract class Filter case class EqualTo(attribute: String, value: Any) extends Filter /** + * Performs equality comparison, similar to [[EqualTo]]. However, this differs from [[EqualTo]] + * in that it returns `true` (rather than NULL) if both inputs are NULL, and `false` + * (rather than NULL) if one of the inputs is NULL and the other is not NULL. + * + * @since 1.5.0 + */ +case class EqualNullSafe(attribute: String, value: Any) extends Filter + /** * A filter that evaluates to `true` iff the attribute evaluates to a value * greater than `value`.
* http://git-wip-us.apache.org/repos/asf/spark/blob/eead87ef/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala index 81b3a0f..5ef3657 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala @@ -56,6 +56,7 @@ case class SimpleFilteredScan(from: Int, to: Int)(@transient val sqlContext: SQL // Predicate test on integer column def translateFilterOnA(filter: Filter): Int => Boolean = filter match { case EqualTo(a, v) => (a: Int) => a == v + case EqualNullSafe(a, v) => (a: Int) => a == v case LessThan(a, v: Int) => (a: Int) => a < v case LessThanOrEqual(a, v: Int) => (a: Int) => a <= v case GreaterThan(a, v: Int) => (a: Int) => a > v
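To make the new filter's semantics concrete, here is a minimal sketch (not part of the patch; the row representation and helper are hypothetical) of how a data source could evaluate `EqualNullSafe` next to `EqualTo`, matching SQL's null-safe `<=>` operator:

```scala
import org.apache.spark.sql.sources.{EqualNullSafe, EqualTo, Filter}

// Hypothetical row representation: column name -> value, with null for SQL NULL.
def evaluate(filter: Filter, row: Map[String, Any]): Boolean = filter match {
  case EqualTo(attr, value) =>
    // Plain equality: a NULL on either side never matches.
    val v = row.getOrElse(attr, null)
    v != null && value != null && v == value
  case EqualNullSafe(attr, value) =>
    // Null-safe equality: NULL <=> NULL is true, NULL <=> x is false.
    val v = row.getOrElse(attr, null)
    if (v == null || value == null) v == null && value == null else v == value
  case _ =>
    true // conservatively keep rows for filters this sketch does not handle
}
```

With the strategy change above, a predicate such as `df.filter($"a" <=> lit(1))` should now reach the source as `sources.EqualNullSafe("a", 1)` rather than being dropped from the pushdown.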
spark git commit: [SPARK-9824] [CORE] Fix the issue that InternalAccumulator leaks WeakReference
Repository: spark Updated Branches: refs/heads/master 00c02728a -> f16bc68df [SPARK-9824] [CORE] Fix the issue that InternalAccumulator leaks WeakReference `InternalAccumulator.create` doesn't call `registerAccumulatorForCleanup` to register itself with ContextCleaner, so `WeakReference`s for these accumulators in `Accumulators.originals` won't be removed. This PR added `registerAccumulatorForCleanup` for internal accumulators to avoid the memory leak. Author: zsxwing zsxw...@gmail.com Closes #8108 from zsxwing/internal-accumulators-leak. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f16bc68d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f16bc68d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f16bc68d Branch: refs/heads/master Commit: f16bc68dfb25c7b746ae031a57840ace9bafa87f Parents: 00c0272 Author: zsxwing zsxw...@gmail.com Authored: Tue Aug 11 14:06:23 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Tue Aug 11 14:06:23 2015 -0700 -- .../scala/org/apache/spark/Accumulators.scala | 22 .../org/apache/spark/scheduler/Stage.scala | 2 +- .../org/apache/spark/AccumulatorSuite.scala | 3 ++- 3 files changed, 16 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f16bc68d/core/src/main/scala/org/apache/spark/Accumulators.scala -- diff --git a/core/src/main/scala/org/apache/spark/Accumulators.scala b/core/src/main/scala/org/apache/spark/Accumulators.scala index 064246d..c39c866 100644 --- a/core/src/main/scala/org/apache/spark/Accumulators.scala +++ b/core/src/main/scala/org/apache/spark/Accumulators.scala @@ -382,14 +382,18 @@ private[spark] object InternalAccumulator { * add to the same set of accumulators. We do this to report the distribution of accumulator * values across all tasks within each stage. */ - def create(): Seq[Accumulator[Long]] = { -Seq( - // Execution memory refers to the memory used by internal data structures created - // during shuffles, aggregations and joins. The value of this accumulator should be - // approximately the sum of the peak sizes across all such data structures created - // in this task. For SQL jobs, this only tracks all unsafe operators and ExternalSort. - new Accumulator( -0L, AccumulatorParam.LongAccumulatorParam, Some(PEAK_EXECUTION_MEMORY), internal = true) -) ++ maybeTestAccumulator.toSeq + def create(sc: SparkContext): Seq[Accumulator[Long]] = { +val internalAccumulators = Seq( +// Execution memory refers to the memory used by internal data structures created +// during shuffles, aggregations and joins. The value of this accumulator should be +// approximately the sum of the peak sizes across all such data structures created +// in this task. For SQL jobs, this only tracks all unsafe operators and ExternalSort.
+new Accumulator( + 0L, AccumulatorParam.LongAccumulatorParam, Some(PEAK_EXECUTION_MEMORY), internal = true) + ) ++ maybeTestAccumulator.toSeq +internalAccumulators.foreach { accumulator => + sc.cleaner.foreach(_.registerAccumulatorForCleanup(accumulator)) +} +internalAccumulators } } http://git-wip-us.apache.org/repos/asf/spark/blob/f16bc68d/core/src/main/scala/org/apache/spark/scheduler/Stage.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala index de05ee2..1cf0685 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala @@ -81,7 +81,7 @@ private[spark] abstract class Stage( * accumulators here again will override partial values from the finished tasks. */ def resetInternalAccumulators(): Unit = { -_internalAccumulators = InternalAccumulator.create() +_internalAccumulators = InternalAccumulator.create(rdd.sparkContext) } /** http://git-wip-us.apache.org/repos/asf/spark/blob/f16bc68d/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala b/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala index 48f5495..0eb2293 100644 --- a/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala @@ -160,7 +160,8 @@ class AccumulatorSuite extends SparkFunSuite with Matchers with LocalSparkContex } test("internal accumulators in TaskContext") { -
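The leak pattern being fixed is worth spelling out: a global map of `WeakReference`s only shrinks if something triggers removal once a referent dies. A simplified sketch of the idea (hypothetical names, not Spark's actual internals):

```scala
import java.lang.ref.WeakReference
import scala.collection.mutable

// Global registry in the spirit of Accumulators.originals: the map holds
// WeakReferences so it does not itself keep accumulators alive.
object Registry {
  val originals = mutable.Map[Long, WeakReference[AnyRef]]()
}

// Stand-in for ContextCleaner: tracks objects and removes their registry
// entries after they are garbage collected.
class Cleaner {
  def registerForCleanup(id: Long, obj: AnyRef): Unit = {
    // In Spark, this enqueues a reference; a cleaning thread later calls
    // Accumulators.remove(id) once the object becomes unreachable.
  }
}

def create(id: Long, cleaner: Option[Cleaner]): AnyRef = {
  val accum = new Object
  Registry.originals(id) = new WeakReference(accum)
  // The fix, in spirit: without this call, the WeakReference entry is never
  // pruned, so the map grows by one dead slot per internal accumulator.
  cleaner.foreach(_.registerForCleanup(id, accum))
  accum
}
```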
spark git commit: Closes #1290 Closes #4934
Repository: spark Updated Branches: refs/heads/master f16bc68df -> 423cdfd83 Closes #1290 Closes #4934 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/423cdfd8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/423cdfd8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/423cdfd8 Branch: refs/heads/master Commit: 423cdfd83d7fd02a4f8cf3e714db913fd3f9ca09 Parents: f16bc68 Author: Xiangrui Meng m...@databricks.com Authored: Tue Aug 11 14:08:09 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 11 14:08:09 2015 -0700
spark git commit: [SPARK-9788] [MLLIB] Fix LDA Binary Compatibility
Repository: spark Updated Branches: refs/heads/branch-1.5 cdf781db6 -> 2273e7432 [SPARK-9788] [MLLIB] Fix LDA Binary Compatibility 1. Add "asymmetricDocConcentration" and revert docConcentration changes. If the (internal) doc concentration vector is a single value, "getDocConcentration" returns it. If it is a constant vector, getDocConcentration returns the first item, and fails otherwise. 2. Give `LDAModel.gammaShape` a default value in `LDAModel` concrete class constructors. jkbradley Author: Feynman Liang fli...@databricks.com Closes #8077 from feynmanliang/SPARK-9788 and squashes the following commits: 6b07bc8 [Feynman Liang] Code review changes 9d6a71e [Feynman Liang] Add asymmetricAlpha alias bf4e685 [Feynman Liang] Asymmetric docConcentration 4cab972 [Feynman Liang] Default gammaShape (cherry picked from commit be3e27164133025db860781bd5cdd3ca233edd21) Signed-off-by: Joseph K. Bradley jos...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2273e743 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2273e743 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2273e743 Branch: refs/heads/branch-1.5 Commit: 2273e7432ec218ba163a94f86307ad11904a1dee Parents: cdf781d Author: Feynman Liang fli...@databricks.com Authored: Tue Aug 11 14:21:53 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Tue Aug 11 14:22:02 2015 -0700 -- .../org/apache/spark/mllib/clustering/LDA.scala | 27 --- .../spark/mllib/clustering/LDAModel.scala | 11 .../spark/mllib/clustering/LDAOptimizer.scala | 28 ++-- .../spark/mllib/clustering/LDASuite.scala | 4 +-- 4 files changed, 46 insertions(+), 24 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2273e743/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala index ab124e6..0fc9b1a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala @@ -79,7 +79,24 @@ class LDA private ( * * This is the parameter to a Dirichlet distribution. */ - def getDocConcentration: Vector = this.docConcentration + def getAsymmetricDocConcentration: Vector = this.docConcentration + + /** + * Concentration parameter (commonly named alpha) for the prior placed on documents' + * distributions over topics (theta). + * + * This method assumes the Dirichlet distribution is symmetric and can be described by a single + * [[Double]] parameter. It should fail if docConcentration is asymmetric. + */ + def getDocConcentration: Double = { +val parameter = docConcentration(0) +if (docConcentration.size == 1) { + parameter +} else { + require(docConcentration.toArray.forall(_ == parameter)) + parameter +} + } /** * Concentration parameter (commonly named alpha) for the prior placed on documents' @@ -106,18 +123,22 @@ class LDA private ( * [[https://github.com/Blei-Lab/onlineldavb]]. */ def setDocConcentration(docConcentration: Vector): this.type = { +require(docConcentration.size > 0, "docConcentration must have > 0 elements") this.docConcentration = docConcentration this } - /** Replicates Double to create a symmetric prior */ + /** Replicates a [[Double]] docConcentration to create a symmetric prior.
*/ def setDocConcentration(docConcentration: Double): this.type = { this.docConcentration = Vectors.dense(docConcentration) this } + /** Alias for [[getAsymmetricDocConcentration]] */ + def getAsymmetricAlpha: Vector = getAsymmetricDocConcentration + /** Alias for [[getDocConcentration]] */ - def getAlpha: Vector = getDocConcentration + def getAlpha: Double = getDocConcentration /** Alias for [[setDocConcentration()]] */ def setAlpha(alpha: Vector): this.type = setDocConcentration(alpha) http://git-wip-us.apache.org/repos/asf/spark/blob/2273e743/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 33babda..5dc637e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -27,7 +27,6 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import
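In API terms, the change splits the getter by prior shape; a short usage sketch based only on the methods visible in the diff above:

```scala
import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.Vectors

val lda = new LDA().setK(3)

// Symmetric prior: a single Double is replicated across topics, and the
// restored Double-returning getter reads it back.
lda.setDocConcentration(0.5)
val alpha: Double = lda.getDocConcentration // 0.5

// Asymmetric prior: pass a full Vector and read it back via the new getter.
lda.setDocConcentration(Vectors.dense(0.2, 0.3, 0.5))
val alphaVec = lda.getAsymmetricDocConcentration
// getDocConcentration would now fail its require() because the values differ.
```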
spark git commit: [SPARK-9788] [MLLIB] Fix LDA Binary Compatibility
Repository: spark Updated Branches: refs/heads/master 423cdfd83 -> be3e27164 [SPARK-9788] [MLLIB] Fix LDA Binary Compatibility 1. Add "asymmetricDocConcentration" and revert docConcentration changes. If the (internal) doc concentration vector is a single value, "getDocConcentration" returns it. If it is a constant vector, getDocConcentration returns the first item, and fails otherwise. 2. Give `LDAModel.gammaShape` a default value in `LDAModel` concrete class constructors. jkbradley Author: Feynman Liang fli...@databricks.com Closes #8077 from feynmanliang/SPARK-9788 and squashes the following commits: 6b07bc8 [Feynman Liang] Code review changes 9d6a71e [Feynman Liang] Add asymmetricAlpha alias bf4e685 [Feynman Liang] Asymmetric docConcentration 4cab972 [Feynman Liang] Default gammaShape Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/be3e2716 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/be3e2716 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/be3e2716 Branch: refs/heads/master Commit: be3e27164133025db860781bd5cdd3ca233edd21 Parents: 423cdfd Author: Feynman Liang fli...@databricks.com Authored: Tue Aug 11 14:21:53 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Tue Aug 11 14:21:53 2015 -0700 -- .../org/apache/spark/mllib/clustering/LDA.scala | 27 --- .../spark/mllib/clustering/LDAModel.scala | 11 .../spark/mllib/clustering/LDAOptimizer.scala | 28 ++-- .../spark/mllib/clustering/LDASuite.scala | 4 +-- 4 files changed, 46 insertions(+), 24 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/be3e2716/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala index ab124e6..0fc9b1a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala @@ -79,7 +79,24 @@ class LDA private ( * * This is the parameter to a Dirichlet distribution. */ - def getDocConcentration: Vector = this.docConcentration + def getAsymmetricDocConcentration: Vector = this.docConcentration + + /** + * Concentration parameter (commonly named alpha) for the prior placed on documents' + * distributions over topics (theta). + * + * This method assumes the Dirichlet distribution is symmetric and can be described by a single + * [[Double]] parameter. It should fail if docConcentration is asymmetric. + */ + def getDocConcentration: Double = { +val parameter = docConcentration(0) +if (docConcentration.size == 1) { + parameter +} else { + require(docConcentration.toArray.forall(_ == parameter)) + parameter +} + } /** * Concentration parameter (commonly named alpha) for the prior placed on documents' @@ -106,18 +123,22 @@ class LDA private ( * [[https://github.com/Blei-Lab/onlineldavb]]. */ def setDocConcentration(docConcentration: Vector): this.type = { +require(docConcentration.size > 0, "docConcentration must have > 0 elements") this.docConcentration = docConcentration this } - /** Replicates Double to create a symmetric prior */ + /** Replicates a [[Double]] docConcentration to create a symmetric prior.
*/ def setDocConcentration(docConcentration: Double): this.type = { this.docConcentration = Vectors.dense(docConcentration) this } + /** Alias for [[getAsymmetricDocConcentration]] */ + def getAsymmetricAlpha: Vector = getAsymmetricDocConcentration + /** Alias for [[getDocConcentration]] */ - def getAlpha: Vector = getDocConcentration + def getAlpha: Double = getDocConcentration /** Alias for [[setDocConcentration()]] */ def setAlpha(alpha: Vector): this.type = setDocConcentration(alpha) http://git-wip-us.apache.org/repos/asf/spark/blob/be3e2716/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 33babda..5dc637e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -27,7 +27,6 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaPairRDD -import org.apache.spark.broadcast.Broadcast import
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.5.0-snapshot-20150811 [created] e9329ef6a
[1/2] spark git commit: Preparing Spark release v1.5.0-snapshot-20150811
Repository: spark Updated Branches: refs/heads/branch-1.5 ef961ed48 -> 725e5c7a4 Preparing Spark release v1.5.0-snapshot-20150811 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e9329ef6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e9329ef6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e9329ef6 Branch: refs/heads/branch-1.5 Commit: e9329ef6a48d141446777c64f58467827ee5faaa Parents: ef961ed Author: Patrick Wendell pwend...@gmail.com Authored: Tue Aug 11 14:32:37 2015 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Tue Aug 11 14:32:37 2015 -0700 -- assembly/pom.xml| 2 +- bagel/pom.xml | 2 +- core/pom.xml| 2 +- examples/pom.xml| 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt-assembly/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml| 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl-assembly/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml| 2 +- pom.xml | 2 +- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml| 2 +- 33 files changed, 33 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e9329ef6/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index e9c6d26..3ef7d6f 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0-SNAPSHOT</version> +<version>1.5.0</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/e9329ef6/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index ed5c37e..684e07b 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0-SNAPSHOT</version> +<version>1.5.0</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/e9329ef6/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index 0e53a79..bb25652 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0-SNAPSHOT</version> +<version>1.5.0</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/e9329ef6/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index e6884b0..9ef1eda 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0-SNAPSHOT</version> +<version>1.5.0</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/e9329ef6/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 1318959..6377c3e 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.10</artifactId> -<version>1.5.0-SNAPSHOT</version> +<version>1.5.0</version> <relativePath>../../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/e9329ef6/external/flume-sink/pom.xml
[2/2] spark git commit: Preparing development version 1.5.0-SNAPSHOT
Preparing development version 1.5.0-SNAPSHOT Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/725e5c7a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/725e5c7a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/725e5c7a Branch: refs/heads/branch-1.5 Commit: 725e5c7a44f08deda23d2a15617557a354f21dc9 Parents: e9329ef Author: Patrick Wendell pwend...@gmail.com Authored: Tue Aug 11 14:32:43 2015 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Tue Aug 11 14:32:43 2015 -0700 -- assembly/pom.xml| 2 +- bagel/pom.xml | 2 +- core/pom.xml| 2 +- examples/pom.xml| 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt-assembly/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml| 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl-assembly/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml| 2 +- pom.xml | 2 +- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml| 2 +- 33 files changed, 33 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/725e5c7a/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 3ef7d6f..e9c6d26 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/725e5c7a/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index 684e07b..ed5c37e 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/725e5c7a/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index bb25652..0e53a79 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/725e5c7a/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index 9ef1eda..e6884b0 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/725e5c7a/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 6377c3e..1318959 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/725e5c7a/external/flume-sink/pom.xml -- diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index
spark git commit: [SPARK-7726] Add import so Scaladoc doesn't fail.
Repository: spark Updated Branches: refs/heads/master 520ad44b1 -> 2a3be4ddf [SPARK-7726] Add import so Scaladoc doesn't fail. This is another import needed so Scala 2.11 doc generation doesn't fail. See SPARK-7726 for more detail. I tested this locally and the 2.11 install goes from failing to succeeding with this patch. Author: Patrick Wendell patr...@databricks.com Closes #8095 from pwendell/scaladoc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2a3be4dd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2a3be4dd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2a3be4dd Branch: refs/heads/master Commit: 2a3be4ddf9d9527353f07ea0ab204ce17dbcba9a Parents: 520ad44 Author: Patrick Wendell patr...@databricks.com Authored: Tue Aug 11 14:02:23 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Tue Aug 11 14:02:23 2015 -0700 -- .../spark/network/shuffle/protocol/mesos/RegisterDriver.java | 3 +++ 1 file changed, 3 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2a3be4dd/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java -- diff --git a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java index 1c28fc1..94a61d6 100644 --- a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java +++ b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java @@ -23,6 +23,9 @@ import io.netty.buffer.ByteBuf; import org.apache.spark.network.protocol.Encoders; import org.apache.spark.network.shuffle.protocol.BlockTransferMessage; +// Needed by ScalaDoc. See SPARK-7726 +import static org.apache.spark.network.shuffle.protocol.BlockTransferMessage.Type; + /** * A message sent from the driver to register with the MesosExternalShuffleService. */
spark git commit: [SPARK-7726] Add import so Scaladoc doesn't fail.
Repository: spark Updated Branches: refs/heads/branch-1.5 811d23f1c -> e9d1eab92 [SPARK-7726] Add import so Scaladoc doesn't fail. This is another import needed so Scala 2.11 doc generation doesn't fail. See SPARK-7726 for more detail. I tested this locally and the 2.11 install goes from failing to succeeding with this patch. Author: Patrick Wendell patr...@databricks.com Closes #8095 from pwendell/scaladoc. (cherry picked from commit 2a3be4ddf9d9527353f07ea0ab204ce17dbcba9a) Signed-off-by: Reynold Xin r...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e9d1eab9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e9d1eab9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e9d1eab9 Branch: refs/heads/branch-1.5 Commit: e9d1eab925df5510085928eb34a43b4a15eb01a2 Parents: 811d23f Author: Patrick Wendell patr...@databricks.com Authored: Tue Aug 11 14:02:23 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Tue Aug 11 14:02:46 2015 -0700 -- .../spark/network/shuffle/protocol/mesos/RegisterDriver.java | 3 +++ 1 file changed, 3 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e9d1eab9/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java -- diff --git a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java index 1c28fc1..94a61d6 100644 --- a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java +++ b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java @@ -23,6 +23,9 @@ import io.netty.buffer.ByteBuf; import org.apache.spark.network.protocol.Encoders; import org.apache.spark.network.shuffle.protocol.BlockTransferMessage; +// Needed by ScalaDoc. See SPARK-7726 +import static org.apache.spark.network.shuffle.protocol.BlockTransferMessage.Type; + /** * A message sent from the driver to register with the MesosExternalShuffleService. */
spark git commit: [SPARK-9814] [SQL] EqualNotNull not passing to data sources
Repository: spark Updated Branches: refs/heads/master 2a3be4ddf -> 00c02728a [SPARK-9814] [SQL] EqualNotNull not passing to data sources Author: hyukjinkwon gurwls...@gmail.com Author: 권혁진 gurwls...@gmail.com Closes #8096 from HyukjinKwon/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/00c02728 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/00c02728 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/00c02728 Branch: refs/heads/master Commit: 00c02728a6c6c4282c389ca90641dd78dd5e3d32 Parents: 2a3be4d Author: hyukjinkwon gurwls...@gmail.com Authored: Tue Aug 11 14:04:09 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Tue Aug 11 14:04:09 2015 -0700 -- .../sql/execution/datasources/DataSourceStrategy.scala | 5 + .../main/scala/org/apache/spark/sql/sources/filters.scala | 9 + .../org/apache/spark/sql/sources/FilteredScanSuite.scala| 1 + 3 files changed, 15 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/00c02728/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 78a4acd..2a4c40d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -349,6 +349,11 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { case expressions.EqualTo(Literal(v, _), a: Attribute) => Some(sources.EqualTo(a.name, v)) + case expressions.EqualNullSafe(a: Attribute, Literal(v, _)) => +Some(sources.EqualNullSafe(a.name, v)) + case expressions.EqualNullSafe(Literal(v, _), a: Attribute) => +Some(sources.EqualNullSafe(a.name, v)) + case expressions.GreaterThan(a: Attribute, Literal(v, _)) => Some(sources.GreaterThan(a.name, v)) case expressions.GreaterThan(Literal(v, _), a: Attribute) => http://git-wip-us.apache.org/repos/asf/spark/blob/00c02728/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala index 4d942e4..3780cbb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala @@ -37,6 +37,15 @@ abstract class Filter case class EqualTo(attribute: String, value: Any) extends Filter /** + * Performs equality comparison, similar to [[EqualTo]]. However, this differs from [[EqualTo]] + * in that it returns `true` (rather than NULL) if both inputs are NULL, and `false` + * (rather than NULL) if one of the inputs is NULL and the other is not NULL. + * + * @since 1.5.0 + */ +case class EqualNullSafe(attribute: String, value: Any) extends Filter + /** * A filter that evaluates to `true` iff the attribute evaluates to a value * greater than `value`.
* http://git-wip-us.apache.org/repos/asf/spark/blob/00c02728/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala index 81b3a0f..5ef3657 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala @@ -56,6 +56,7 @@ case class SimpleFilteredScan(from: Int, to: Int)(@transient val sqlContext: SQL // Predicate test on integer column def translateFilterOnA(filter: Filter): Int => Boolean = filter match { case EqualTo(a, v) => (a: Int) => a == v + case EqualNullSafe(a, v) => (a: Int) => a == v case LessThan(a, v: Int) => (a: Int) => a < v case LessThanOrEqual(a, v: Int) => (a: Int) => a <= v case GreaterThan(a, v: Int) => (a: Int) => a > v
spark git commit: [SPARK-8925] [MLLIB] Add @since tags to mllib.util
Repository: spark Updated Branches: refs/heads/branch-1.5 2273e7432 -> ef961ed48 [SPARK-8925] [MLLIB] Add @since tags to mllib.util Went thru the history of changes the file MLUtils.scala and picked up the version that the change went in. Author: Sudhakar Thota sudhakarth...@yahoo.com Author: Sudhakar Thota sudhakarth...@sudhakars-mbp-2.usca.ibm.com Closes #7436 from sthota2014/SPARK-8925_thotas. (cherry picked from commit 017b5de07ef6cff249e984a2ab781c520249ac76) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ef961ed4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ef961ed4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ef961ed4 Branch: refs/heads/branch-1.5 Commit: ef961ed48a4f45447f0e0ad256b040c7ab2d78d9 Parents: 2273e74 Author: Sudhakar Thota sudhakarth...@yahoo.com Authored: Tue Aug 11 14:31:51 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 11 14:32:01 2015 -0700 -- .../org/apache/spark/mllib/util/MLUtils.scala | 22 +++- 1 file changed, 21 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ef961ed4/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 7c5cfa7..26eb84a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -64,6 +64,7 @@ object MLUtils { *feature dimensions. * @param minPartitions min number of partitions * @return labeled data stored as an RDD[LabeledPoint] + * @since 1.0.0 */ def loadLibSVMFile( sc: SparkContext, @@ -113,7 +114,10 @@ object MLUtils { } // Convenient methods for `loadLibSVMFile`. - + + /** + * @since 1.0.0 + */ @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0") def loadLibSVMFile( sc: SparkContext, @@ -126,6 +130,7 @@ object MLUtils { /** * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], with the default number of * partitions. + * @since 1.0.0 */ def loadLibSVMFile( sc: SparkContext, @@ -133,6 +138,9 @@ numFeatures: Int): RDD[LabeledPoint] = loadLibSVMFile(sc, path, numFeatures, sc.defaultMinPartitions) + /** + * @since 1.0.0 + */ @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0") def loadLibSVMFile( sc: SparkContext, @@ -141,6 +149,9 @@ numFeatures: Int): RDD[LabeledPoint] = loadLibSVMFile(sc, path, numFeatures) + /** + * @since 1.0.0 + */ @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0") def loadLibSVMFile( sc: SparkContext, @@ -151,6 +162,7 @@ /** * Loads binary labeled data in the LIBSVM format into an RDD[LabeledPoint], with number of * features determined automatically and the default number of partitions.
+ * @since 1.0.0 */ def loadLibSVMFile(sc: SparkContext, path: String): RDD[LabeledPoint] = loadLibSVMFile(sc, path, -1) @@ -181,12 +193,14 @@ object MLUtils { * @param path file or directory path in any Hadoop-supported file system URI * @param minPartitions min number of partitions * @return vectors stored as an RDD[Vector] + * @since 1.1.0 */ def loadVectors(sc: SparkContext, path: String, minPartitions: Int): RDD[Vector] = sc.textFile(path, minPartitions).map(Vectors.parse) /** * Loads vectors saved using `RDD[Vector].saveAsTextFile` with the default number of partitions. + * @since 1.1.0 */ def loadVectors(sc: SparkContext, path: String): RDD[Vector] = sc.textFile(path, sc.defaultMinPartitions).map(Vectors.parse) @@ -197,6 +211,7 @@ object MLUtils { * @param path file or directory path in any Hadoop-supported file system URI * @param minPartitions min number of partitions * @return labeled points stored as an RDD[LabeledPoint] + * @since 1.1.0 */ def loadLabeledPoints(sc: SparkContext, path: String, minPartitions: Int): RDD[LabeledPoint] = sc.textFile(path, minPartitions).map(LabeledPoint.parse) @@ -204,6 +219,7 @@ object MLUtils { /** * Loads labeled points saved using `RDD[LabeledPoint].saveAsTextFile` with the default number of * partitions. + * @since 1.1.0 */ def loadLabeledPoints(sc: SparkContext, dir: String): RDD[LabeledPoint] = loadLabeledPoints(sc, dir, sc.defaultMinPartitions) @@ -220,6 +236,7 @@
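For reference, a brief usage sketch of the loaders being annotated (the SparkContext setup and data paths here are hypothetical):

```scala
import org.apache.spark.SparkContext
import org.apache.spark.mllib.util.MLUtils

val sc = new SparkContext("local[2]", "mlutils-since-tags")

// @since 1.0.0: LIBSVM loader; the feature count is inferred when it is
// not supplied (the variant shown in the diff passes -1 internally).
val labeled = MLUtils.loadLibSVMFile(sc, "data/sample_libsvm_data.txt")

// @since 1.1.0: reload vectors written with RDD[Vector].saveAsTextFile.
val vectors = MLUtils.loadVectors(sc, "data/vectors")

// @since 1.1.0: reload points written with RDD[LabeledPoint].saveAsTextFile.
val points = MLUtils.loadLabeledPoints(sc, "data/labeled-points")
```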
spark git commit: [SPARK-8925] [MLLIB] Add @since tags to mllib.util
Repository: spark Updated Branches: refs/heads/master be3e27164 -> 017b5de07 [SPARK-8925] [MLLIB] Add @since tags to mllib.util Went thru the history of changes the file MLUtils.scala and picked up the version that the change went in. Author: Sudhakar Thota sudhakarth...@yahoo.com Author: Sudhakar Thota sudhakarth...@sudhakars-mbp-2.usca.ibm.com Closes #7436 from sthota2014/SPARK-8925_thotas. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/017b5de0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/017b5de0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/017b5de0 Branch: refs/heads/master Commit: 017b5de07ef6cff249e984a2ab781c520249ac76 Parents: be3e271 Author: Sudhakar Thota sudhakarth...@yahoo.com Authored: Tue Aug 11 14:31:51 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 11 14:31:51 2015 -0700 -- .../org/apache/spark/mllib/util/MLUtils.scala | 22 +++- 1 file changed, 21 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/017b5de0/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 7c5cfa7..26eb84a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -64,6 +64,7 @@ object MLUtils { *feature dimensions. * @param minPartitions min number of partitions * @return labeled data stored as an RDD[LabeledPoint] + * @since 1.0.0 */ def loadLibSVMFile( sc: SparkContext, @@ -113,7 +114,10 @@ object MLUtils { } // Convenient methods for `loadLibSVMFile`. - + + /** + * @since 1.0.0 + */ @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0") def loadLibSVMFile( sc: SparkContext, @@ -126,6 +130,7 @@ object MLUtils { /** * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], with the default number of * partitions. + * @since 1.0.0 */ def loadLibSVMFile( sc: SparkContext, @@ -133,6 +138,9 @@ numFeatures: Int): RDD[LabeledPoint] = loadLibSVMFile(sc, path, numFeatures, sc.defaultMinPartitions) + /** + * @since 1.0.0 + */ @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0") def loadLibSVMFile( sc: SparkContext, @@ -141,6 +149,9 @@ numFeatures: Int): RDD[LabeledPoint] = loadLibSVMFile(sc, path, numFeatures) + /** + * @since 1.0.0 + */ @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0") def loadLibSVMFile( sc: SparkContext, @@ -151,6 +162,7 @@ /** * Loads binary labeled data in the LIBSVM format into an RDD[LabeledPoint], with number of * features determined automatically and the default number of partitions. + * @since 1.0.0 */ def loadLibSVMFile(sc: SparkContext, path: String): RDD[LabeledPoint] = loadLibSVMFile(sc, path, -1) @@ -181,12 +193,14 @@ object MLUtils { * @param path file or directory path in any Hadoop-supported file system URI * @param minPartitions min number of partitions * @return vectors stored as an RDD[Vector] + * @since 1.1.0 */ def loadVectors(sc: SparkContext, path: String, minPartitions: Int): RDD[Vector] = sc.textFile(path, minPartitions).map(Vectors.parse) /** * Loads vectors saved using `RDD[Vector].saveAsTextFile` with the default number of partitions.
+ * @since 1.1.0 */ def loadVectors(sc: SparkContext, path: String): RDD[Vector] = sc.textFile(path, sc.defaultMinPartitions).map(Vectors.parse) @@ -197,6 +211,7 @@ object MLUtils { * @param path file or directory path in any Hadoop-supported file system URI * @param minPartitions min number of partitions * @return labeled points stored as an RDD[LabeledPoint] + * @since 1.1.0 */ def loadLabeledPoints(sc: SparkContext, path: String, minPartitions: Int): RDD[LabeledPoint] = sc.textFile(path, minPartitions).map(LabeledPoint.parse) @@ -204,6 +219,7 @@ object MLUtils { /** * Loads labeled points saved using `RDD[LabeledPoint].saveAsTextFile` with the default number of * partitions. + * @since 1.1.0 */ def loadLabeledPoints(sc: SparkContext, dir: String): RDD[LabeledPoint] = loadLabeledPoints(sc, dir, sc.defaultMinPartitions) @@ -220,6 +236,7 @@ object MLUtils { * * @deprecated Should use [[org.apache.spark.rdd.RDD#saveAsTextFile]] for saving and *
spark git commit: [SPARK-9649] Fix flaky test MasterSuite again - disable REST
Repository: spark Updated Branches: refs/heads/branch-1.5 ec7a4b9b0 -> 0119edf52 [SPARK-9649] Fix flaky test MasterSuite again - disable REST The REST server is not actually used in most tests and so we can disable it. It is a source of flakiness because it tries to bind to a specific port in vain. There was also some code that avoided the shuffle service in tests. This is actually not necessary because the shuffle service is already off by default. Author: Andrew Or and...@databricks.com Closes #8084 from andrewor14/fix-master-suite-again. (cherry picked from commit ca8f70e9d473d2c81866f3c330cc6545c33bdac7) Signed-off-by: Josh Rosen joshro...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0119edf5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0119edf5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0119edf5 Branch: refs/heads/branch-1.5 Commit: 0119edf52885c6c798cd00bf545f5b0b1f6910af Parents: ec7a4b9 Author: Andrew Or and...@databricks.com Authored: Tue Aug 11 20:46:58 2015 -0700 Committer: Josh Rosen joshro...@databricks.com Committed: Tue Aug 11 20:47:19 2015 -0700 -- pom.xml | 1 + project/SparkBuild.scala | 1 + 2 files changed, 2 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0119edf5/pom.xml -- diff --git a/pom.xml b/pom.xml index 8942836..cfd7d32 100644 --- a/pom.xml +++ b/pom.xml @@ -1895,6 +1895,7 @@ <java.io.tmpdir>${project.build.directory}/tmp</java.io.tmpdir> <spark.test.home>${spark.test.home}</spark.test.home> <spark.testing>1</spark.testing> + <spark.master.rest.enabled>false</spark.master.rest.enabled> <spark.ui.enabled>false</spark.ui.enabled> <spark.ui.showConsoleProgress>false</spark.ui.showConsoleProgress> <spark.driver.allowMultipleContexts>true</spark.driver.allowMultipleContexts> http://git-wip-us.apache.org/repos/asf/spark/blob/0119edf5/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index cad7067..74f815f 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -546,6 +546,7 @@ object TestSettings { javaOptions in Test += "-Dspark.test.home=" + sparkHome, javaOptions in Test += "-Dspark.testing=1", javaOptions in Test += "-Dspark.port.maxRetries=100", +javaOptions in Test += "-Dspark.master.rest.enabled=false", javaOptions in Test += "-Dspark.ui.enabled=false", javaOptions in Test += "-Dspark.ui.showConsoleProgress=false", javaOptions in Test += "-Dspark.driver.allowMultipleContexts=true",
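The same switch can also be applied per-context rather than build-wide; a small sketch using the property names from the diff (these are read by the standalone Master and the UI at startup):

```scala
import org.apache.spark.{SparkConf, SparkContext}

// Mirrors what the pom.xml / SparkBuild.scala system properties above do,
// scoped to one test: keep the REST server and UI from binding ports.
val conf = new SparkConf()
  .setMaster("local[2]")
  .setAppName("master-suite-style-test")
  .set("spark.master.rest.enabled", "false")
  .set("spark.ui.enabled", "false")
val sc = new SparkContext(conf)
```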
spark git commit: [SPARK-9640] [STREAMING] [TEST] Do not run Python Kinesis tests when the Kinesis assembly JAR has not been generated
Repository: spark Updated Branches: refs/heads/branch-1.5 f9beef998 -> c7f009040 [SPARK-9640] [STREAMING] [TEST] Do not run Python Kinesis tests when the Kinesis assembly JAR has not been generated Author: Tathagata Das tathagata.das1...@gmail.com Closes #7961 from tdas/SPARK-9640 and squashes the following commits: 974ce19 [Tathagata Das] Undo changes related to SPARK-9727 004ae26 [Tathagata Das] style fixes 9bbb97d [Tathagata Das] Minor style fies e6a677e [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9640 ca90719 [Tathagata Das] Removed extra line ba9cfc7 [Tathagata Das] Improved kinesis test selection logic 88d59bd [Tathagata Das] updated test modules 871fcc8 [Tathagata Das] Fixed SparkBuild 94be631 [Tathagata Das] Fixed style b858196 [Tathagata Das] Fixed conditions and few other things based on PR comments. e292e64 [Tathagata Das] Added filters for Kinesis python tests (cherry picked from commit 0f90d6055e5bea9ceb1d454db84f4aa1d59b284d) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c7f00904 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c7f00904 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c7f00904 Branch: refs/heads/branch-1.5 Commit: c7f0090409c2a94a43404271730beded421a0f2f Parents: f9beef9 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Mon Aug 10 23:41:53 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 10 23:42:44 2015 -0700 -- python/pyspark/streaming/tests.py | 56 ++ 1 file changed, 44 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c7f00904/python/pyspark/streaming/tests.py -- diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 66ae334..f0ed415 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -971,8 +971,10 @@ class KinesisStreamTests(PySparkStreamingTestCase): awsAccessKey, awsSecretKey) def test_kinesis_stream(self): -if os.environ.get('ENABLE_KINESIS_TESTS') != '1': -print("Skip test_kinesis_stream") +if not are_kinesis_tests_enabled: +sys.stderr.write( +"Skipped test_kinesis_stream (enable by setting environment variable %s=1" +% kinesis_test_environ_var) return import random @@ -1013,6 +1015,7 @@ class KinesisStreamTests(PySparkStreamingTestCase): traceback.print_exc() raise finally: +self.ssc.stop(False) kinesisTestUtils.deleteStream() kinesisTestUtils.deleteDynamoDBTable(kinesisAppName) @@ -1027,7 +1030,7 @@ def search_kafka_assembly_jar(): ("Failed to find Spark Streaming kafka assembly jar in %s. " % kafka_assembly_dir) + "You need to build Spark with " "'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or " -"'build/mvn package' before running this test") +"'build/mvn package' before running this test.") elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Kafka assembly JARs in %s; please remove all but one") % kafka_assembly_dir) @@ -1045,7 +1048,7 @@ def search_flume_assembly_jar(): ("Failed to find Spark Streaming Flume assembly jar in %s. " % flume_assembly_dir) + "You need to build Spark with " "'build/sbt assembly/assembly streaming-flume-assembly/assembly' or " -"'build/mvn package' before running this test") +"'build/mvn package' before running this test.")
elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Flume assembly JARs in %s; please remove all but one") % flume_assembly_dir) @@ -1095,11 +1098,7 @@ def search_kinesis_asl_assembly_jar(): os.path.join(kinesis_asl_assembly_dir, "target/scala-*/spark-streaming-kinesis-asl-assembly-*.jar")) if not jars: -raise Exception( -("Failed to find Spark Streaming Kinesis ASL assembly jar in %s. " % - kinesis_asl_assembly_dir) + "You need to build Spark with " -"'build/sbt -Pkinesis-asl assembly/assembly streaming-kinesis-asl-assembly/assembly' " -"or 'build/mvn -Pkinesis-asl package' before running this test") +return None elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Kinesis ASL assembly JARs in %s; please remove all but one") % kinesis_asl_assembly_dir) @@ -1107,6 +1106,10 @@
spark git commit: [SPARK-9640] [STREAMING] [TEST] Do not run Python Kinesis tests when the Kinesis assembly JAR has not been generated
Repository: spark Updated Branches: refs/heads/master 91e9389f3 -> 0f90d6055 [SPARK-9640] [STREAMING] [TEST] Do not run Python Kinesis tests when the Kinesis assembly JAR has not been generated Author: Tathagata Das tathagata.das1...@gmail.com Closes #7961 from tdas/SPARK-9640 and squashes the following commits: 974ce19 [Tathagata Das] Undo changes related to SPARK-9727 004ae26 [Tathagata Das] style fixes 9bbb97d [Tathagata Das] Minor style fies e6a677e [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9640 ca90719 [Tathagata Das] Removed extra line ba9cfc7 [Tathagata Das] Improved kinesis test selection logic 88d59bd [Tathagata Das] updated test modules 871fcc8 [Tathagata Das] Fixed SparkBuild 94be631 [Tathagata Das] Fixed style b858196 [Tathagata Das] Fixed conditions and few other things based on PR comments. e292e64 [Tathagata Das] Added filters for Kinesis python tests Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0f90d605 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0f90d605 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0f90d605 Branch: refs/heads/master Commit: 0f90d6055e5bea9ceb1d454db84f4aa1d59b284d Parents: 91e9389 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Mon Aug 10 23:41:53 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 10 23:41:53 2015 -0700 -- python/pyspark/streaming/tests.py | 56 ++ 1 file changed, 44 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0f90d605/python/pyspark/streaming/tests.py -- diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 66ae334..f0ed415 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -971,8 +971,10 @@ class KinesisStreamTests(PySparkStreamingTestCase): awsAccessKey, awsSecretKey) def test_kinesis_stream(self): -if os.environ.get('ENABLE_KINESIS_TESTS') != '1': -print("Skip test_kinesis_stream") +if not are_kinesis_tests_enabled: +sys.stderr.write( +"Skipped test_kinesis_stream (enable by setting environment variable %s=1" +% kinesis_test_environ_var) return import random @@ -1013,6 +1015,7 @@ class KinesisStreamTests(PySparkStreamingTestCase): traceback.print_exc() raise finally: +self.ssc.stop(False) kinesisTestUtils.deleteStream() kinesisTestUtils.deleteDynamoDBTable(kinesisAppName) @@ -1027,7 +1030,7 @@ def search_kafka_assembly_jar(): ("Failed to find Spark Streaming kafka assembly jar in %s. " % kafka_assembly_dir) + "You need to build Spark with " "'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or " -"'build/mvn package' before running this test") +"'build/mvn package' before running this test.") elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Kafka assembly JARs in %s; please remove all but one") % kafka_assembly_dir) @@ -1045,7 +1048,7 @@ def search_flume_assembly_jar(): ("Failed to find Spark Streaming Flume assembly jar in %s. " % flume_assembly_dir) + "You need to build Spark with " "'build/sbt assembly/assembly streaming-flume-assembly/assembly' or " -"'build/mvn package' before running this test") +"'build/mvn package' before running this test.")
elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Flume assembly JARs in %s; please remove all but one") % flume_assembly_dir) @@ -1095,11 +1098,7 @@ def search_kinesis_asl_assembly_jar(): os.path.join(kinesis_asl_assembly_dir, "target/scala-*/spark-streaming-kinesis-asl-assembly-*.jar")) if not jars: -raise Exception( -("Failed to find Spark Streaming Kinesis ASL assembly jar in %s. " % - kinesis_asl_assembly_dir) + "You need to build Spark with " -"'build/sbt -Pkinesis-asl assembly/assembly streaming-kinesis-asl-assembly/assembly' " -"or 'build/mvn -Pkinesis-asl package' before running this test") +return None elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Kinesis ASL assembly JARs in %s; please remove all but one") % kinesis_asl_assembly_dir) @@ -1107,6 +1106,10 @@ return jars[0] +# Must be same as the variable and condition defined in
spark git commit: [SPARK-9727] [STREAMING] [BUILD] Updated streaming kinesis SBT project name to be more consistent
Repository: spark Updated Branches: refs/heads/master 55752d883 -> 600031ebe [SPARK-9727] [STREAMING] [BUILD] Updated streaming kinesis SBT project name to be more consistent Author: Tathagata Das tathagata.das1...@gmail.com Closes #8092 from tdas/SPARK-9727 and squashes the following commits: b1b01fd [Tathagata Das] Updated streaming kinesis project name Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/600031eb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/600031eb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/600031eb Branch: refs/heads/master Commit: 600031ebe27473d8fffe6ea436c2149223b82896 Parents: 55752d8 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Tue Aug 11 02:41:03 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 11 02:41:03 2015 -0700 -- dev/sparktestsupport/modules.py | 4 ++-- extras/kinesis-asl/pom.xml | 2 +- project/SparkBuild.scala| 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/600031eb/dev/sparktestsupport/modules.py -- diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index d82c0cc..346452f 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -134,7 +134,7 @@ streaming = Module( # files in streaming_kinesis_asl are changed, so that if Kinesis experiences an outage, we don't # fail other PRs. streaming_kinesis_asl = Module( -name="kinesis-asl", +name="streaming-kinesis-asl", dependencies=[], source_file_regexes=[ "extras/kinesis-asl/", @@ -147,7 +147,7 @@ streaming_kinesis_asl = Module( "ENABLE_KINESIS_TESTS": "1" }, sbt_test_goals=[ -"kinesis-asl/test", +"streaming-kinesis-asl/test", ] ) http://git-wip-us.apache.org/repos/asf/spark/blob/600031eb/extras/kinesis-asl/pom.xml -- diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml index c242e7a..521b53e 100644 --- a/extras/kinesis-asl/pom.xml +++ b/extras/kinesis-asl/pom.xml @@ -31,7 +31,7 @@ <name>Spark Kinesis Integration</name> <properties> -<sbt.project.name>kinesis-asl</sbt.project.name> +<sbt.project.name>streaming-kinesis-asl</sbt.project.name> </properties> <dependencies> http://git-wip-us.apache.org/repos/asf/spark/blob/600031eb/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 41a85fa..cad7067 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -42,8 +42,8 @@ object BuildCommons { "streaming-zeromq", "launcher", "unsafe").map(ProjectRef(buildLocation, _)) val optionallyEnabledProjects@Seq(yarn, yarnStable, java8Tests, sparkGangliaLgpl, -sparkKinesisAsl) = Seq("yarn", "yarn-stable", "java8-tests", "ganglia-lgpl", -"kinesis-asl").map(ProjectRef(buildLocation, _)) +streamingKinesisAsl) = Seq("yarn", "yarn-stable", "java8-tests", "ganglia-lgpl", +"streaming-kinesis-asl").map(ProjectRef(buildLocation, _)) val assemblyProjects@Seq(assembly, examples, networkYarn, streamingFlumeAssembly, streamingKafkaAssembly, streamingMqttAssembly, streamingKinesisAslAssembly) = Seq("assembly", "examples", "network-yarn", "streaming-flume-assembly", "streaming-kafka-assembly", "streaming-mqtt-assembly", "streaming-kinesis-asl-assembly")
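A quick usage note: after this rename, the SBT goal matches the other streaming connectors, so the Kinesis tests would be invoked with something like:

```
build/sbt -Pkinesis-asl streaming-kinesis-asl/test
```

(previously `kinesis-asl/test`, as the `dev/sparktestsupport/modules.py` diff shows).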
spark git commit: [SPARK-9727] [STREAMING] [BUILD] Updated streaming kinesis SBT project name to be more consistent
Repository: spark Updated Branches: refs/heads/branch-1.5 c7f009040 -> ebbd3b616

[SPARK-9727] [STREAMING] [BUILD] Updated streaming kinesis SBT project name to be more consistent

Author: Tathagata Das tathagata.das1...@gmail.com

Closes #8092 from tdas/SPARK-9727 and squashes the following commits: b1b01fd [Tathagata Das] Updated streaming kinesis project name

(cherry picked from commit 600031ebe27473d8fffe6ea436c2149223b82896) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ebbd3b61 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ebbd3b61 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ebbd3b61 Branch: refs/heads/branch-1.5 Commit: ebbd3b616bf49701c2466bde5193241f69cf3e30 Parents: c7f0090 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Tue Aug 11 02:41:03 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 11 02:41:25 2015 -0700

dev/sparktestsupport/modules.py | 4 ++--
extras/kinesis-asl/pom.xml | 2 +-
project/SparkBuild.scala | 4 ++--
3 files changed, 5 insertions(+), 5 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/ebbd3b61/dev/sparktestsupport/modules.py

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index d82c0cc..346452f 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -134,7 +134,7 @@ streaming = Module(
 # files in streaming_kinesis_asl are changed, so that if Kinesis experiences an outage, we don't
 # fail other PRs.
 streaming_kinesis_asl = Module(
-    name="kinesis-asl",
+    name="streaming-kinesis-asl",
     dependencies=[],
     source_file_regexes=[
         "extras/kinesis-asl/",
@@ -147,7 +147,7 @@ streaming_kinesis_asl = Module(
         "ENABLE_KINESIS_TESTS": 1
     },
     sbt_test_goals=[
-        "kinesis-asl/test",
+        "streaming-kinesis-asl/test",
     ]
 )

http://git-wip-us.apache.org/repos/asf/spark/blob/ebbd3b61/extras/kinesis-asl/pom.xml

diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml
index c242e7a..521b53e 100644
--- a/extras/kinesis-asl/pom.xml
+++ b/extras/kinesis-asl/pom.xml
@@ -31,7 +31,7 @@
   <name>Spark Kinesis Integration</name>
   <properties>
-    <sbt.project.name>kinesis-asl</sbt.project.name>
+    <sbt.project.name>streaming-kinesis-asl</sbt.project.name>
   </properties>
   <dependencies>

http://git-wip-us.apache.org/repos/asf/spark/blob/ebbd3b61/project/SparkBuild.scala

diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 41a85fa..cad7067 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -42,8 +42,8 @@ object BuildCommons {
     "streaming-zeromq", "launcher", "unsafe").map(ProjectRef(buildLocation, _))

   val optionallyEnabledProjects@Seq(yarn, yarnStable, java8Tests, sparkGangliaLgpl,
-    sparkKinesisAsl) = Seq("yarn", "yarn-stable", "java8-tests", "ganglia-lgpl",
-    "kinesis-asl").map(ProjectRef(buildLocation, _))
+    streamingKinesisAsl) = Seq("yarn", "yarn-stable", "java8-tests", "ganglia-lgpl",
+    "streaming-kinesis-asl").map(ProjectRef(buildLocation, _))

   val assemblyProjects@Seq(assembly, examples, networkYarn, streamingFlumeAssembly,
     streamingKafkaAssembly, streamingMqttAssembly, streamingKinesisAslAssembly) =
     Seq("assembly", "examples", "network-yarn", "streaming-flume-assembly",
       "streaming-kafka-assembly", "streaming-mqtt-assembly", "streaming-kinesis-asl-assembly")
spark git commit: [SPARK-9810] [BUILD] Remove individual commit messages from the squash commit message
Repository: spark Updated Branches: refs/heads/master 0f90d6055 -> 55752d883

[SPARK-9810] [BUILD] Remove individual commit messages from the squash commit message

For more information, please see the JIRA ticket and the associated dev list discussion. https://issues.apache.org/jira/browse/SPARK-9810 http://apache-spark-developers-list.1001551.n3.nabble.com/discuss-Removing-individual-commit-messages-from-the-squash-commit-message-td13295.html

Author: Reynold Xin r...@databricks.com

Closes #8091 from rxin/SPARK-9810.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/55752d88 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/55752d88 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/55752d88 Branch: refs/heads/master Commit: 55752d88321925da815823f968128832de6fdbbb Parents: 0f90d60 Author: Reynold Xin r...@databricks.com Authored: Tue Aug 11 01:08:30 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Tue Aug 11 01:08:30 2015 -0700

dev/merge_spark_pr.py | 6 +-
1 file changed, 1 insertion(+), 5 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/55752d88/dev/merge_spark_pr.py

diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py
index ad4b766..b9bdec3 100755
--- a/dev/merge_spark_pr.py
+++ b/dev/merge_spark_pr.py
@@ -159,11 +159,7 @@ def merge_pr(pr_num, target_ref, title, body, pr_repo_desc):
         merge_message_flags += ["-m", message]

     # The string "Closes #%s" string is required for GitHub to correctly close the PR
-    merge_message_flags += [
-        "-m",
-        "Closes #%s from %s and squashes the following commits:" % (pr_num, pr_repo_desc)]
-    for c in commits:
-        merge_message_flags += ["-m", c]
+    merge_message_flags += ["-m", "Closes #%s from %s." % (pr_num, pr_repo_desc)]

     run_cmd(['git', 'commit', '--author="%s"' % primary_author] + merge_message_flags)
[2/2] spark git commit: [SPARK-9815] Rename PlatformDependent.UNSAFE -> Platform.
[SPARK-9815] Rename PlatformDependent.UNSAFE -> Platform.

PlatformDependent.UNSAFE is way too verbose.

Author: Reynold Xin r...@databricks.com

Closes #8094 from rxin/SPARK-9815 and squashes the following commits: 229b603 [Reynold Xin] [SPARK-9815] Rename PlatformDependent.UNSAFE -> Platform.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d378396f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d378396f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d378396f Branch: refs/heads/master Commit: d378396f86f625f006738d87fe5dbc2ff8fd913d Parents: 600031e Author: Reynold Xin r...@databricks.com Authored: Tue Aug 11 08:41:06 2015 -0700 Committer: Davies Liu davies@gmail.com Committed: Tue Aug 11 08:41:06 2015 -0700

.../serializer/DummySerializerInstance.java | 6 +-
.../unsafe/UnsafeShuffleExternalSorter.java | 22 +--
.../shuffle/unsafe/UnsafeShuffleWriter.java | 4 +-
.../spark/unsafe/map/BytesToBytesMap.java | 20 +-
.../unsafe/sort/PrefixComparators.java | 5 +-
.../unsafe/sort/UnsafeExternalSorter.java | 22 +--
.../unsafe/sort/UnsafeInMemorySorter.java | 4 +-
.../unsafe/sort/UnsafeSorterSpillReader.java | 4 +-
.../unsafe/sort/UnsafeSorterSpillWriter.java | 6 +-
.../UnsafeShuffleInMemorySorterSuite.java | 20 +-
.../map/AbstractBytesToBytesMapSuite.java | 94 +-
.../unsafe/sort/UnsafeExternalSorterSuite.java | 20 +-
.../unsafe/sort/UnsafeInMemorySorterSuite.java | 20 +-
.../catalyst/expressions/UnsafeArrayData.java | 51 ++---
.../sql/catalyst/expressions/UnsafeReaders.java | 8 +-
.../sql/catalyst/expressions/UnsafeRow.java | 108 +--
.../catalyst/expressions/UnsafeRowWriters.java | 41 ++--
.../sql/catalyst/expressions/UnsafeWriters.java | 43 ++---
.../sql/execution/UnsafeExternalRowSorter.java | 4 +-
.../expressions/codegen/CodeGenerator.scala | 4 +-
.../codegen/GenerateUnsafeProjection.scala | 32 ++--
.../codegen/GenerateUnsafeRowJoiner.scala | 16 +-
.../catalyst/expressions/stringOperations.scala | 4 +-
.../GenerateUnsafeRowJoinerBitsetSuite.scala | 4 +-
.../UnsafeFixedWidthAggregationMap.java | 4 +-
.../sql/execution/UnsafeKVExternalSorter.java | 4 +-
.../sql/execution/UnsafeRowSerializer.scala | 6 +-
.../sql/execution/joins/HashedRelation.scala | 13 +-
.../org/apache/spark/sql/UnsafeRowSuite.scala | 4 +-
.../java/org/apache/spark/unsafe/Platform.java | 173 +
.../apache/spark/unsafe/PlatformDependent.java | 187 ---
.../spark/unsafe/array/ByteArrayMethods.java | 14 +-
.../apache/spark/unsafe/array/LongArray.java | 6 +-
.../spark/unsafe/bitset/BitSetMethods.java | 19 +-
.../spark/unsafe/hash/Murmur3_x86_32.java | 4 +-
.../apache/spark/unsafe/memory/MemoryBlock.java | 4 +-
.../unsafe/memory/UnsafeMemoryAllocator.java | 6 +-
.../apache/spark/unsafe/types/ByteArray.java | 10 +-
.../apache/spark/unsafe/types/UTF8String.java | 30 ++-
.../spark/unsafe/hash/Murmur3_x86_32Suite.java | 14 +-
40 files changed, 466 insertions(+), 594 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/d378396f/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java

diff --git a/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java b/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java
index 0399abc..0e58bb4 100644
--- a/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java
+++ b/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java
@@ -25,7 +25,7 @@
 import java.nio.ByteBuffer;

 import scala.reflect.ClassTag;

 import
org.apache.spark.annotation.Private; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; /** * Unfortunately, we need a serializer instance in order to construct a DiskBlockObjectWriter. @@ -49,7 +49,7 @@ public final class DummySerializerInstance extends SerializerInstance { try { s.flush(); } catch (IOException e) { - PlatformDependent.throwException(e); + Platform.throwException(e); } } @@ -64,7 +64,7 @@ public final class DummySerializerInstance extends SerializerInstance { try { s.close(); } catch (IOException e) { - PlatformDependent.throwException(e); + Platform.throwException(e); } } }; http://git-wip-us.apache.org/repos/asf/spark/blob/d378396f/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java
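The rename is mechanical, but the class surface is worth a glance. A minimal Scala sketch of the renamed API, using only members visible at the call sites in this patch plus the matching put accessor (method availability is assumed from those call sites, not from the full Platform.java listing):

  import org.apache.spark.unsafe.Platform

  // Write a long into an on-heap byte array at its base offset, then read it back.
  val buf = new Array[Byte](8)
  Platform.putLong(buf, Platform.BYTE_ARRAY_OFFSET, 42L)
  assert(Platform.getLong(buf, Platform.BYTE_ARRAY_OFFSET) == 42L)

  // Rethrow a checked exception without declaring it, as DummySerializerInstance does above.
  def closeOrRethrow(s: java.io.Closeable): Unit =
    try s.close() catch { case e: java.io.IOException => Platform.throwException(e) }

Either way, the call sites read far better than the old PlatformDependent.UNSAFE.getLong(...) style did.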
[2/2] spark git commit: [SPARK-9815] Rename PlatformDependent.UNSAFE -> Platform.
[SPARK-9815] Rename PlatformDependent.UNSAFE -> Platform.

PlatformDependent.UNSAFE is way too verbose.

Author: Reynold Xin r...@databricks.com

Closes #8094 from rxin/SPARK-9815 and squashes the following commits: 229b603 [Reynold Xin] [SPARK-9815] Rename PlatformDependent.UNSAFE -> Platform.

(cherry picked from commit d378396f86f625f006738d87fe5dbc2ff8fd913d) Signed-off-by: Davies Liu davies@gmail.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/84ba990f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/84ba990f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/84ba990f Branch: refs/heads/branch-1.5 Commit: 84ba990f2e27ef6d05716307ebe9644c7efffee9 Parents: ebbd3b6 Author: Reynold Xin r...@databricks.com Authored: Tue Aug 11 08:41:06 2015 -0700 Committer: Davies Liu davies@gmail.com Committed: Tue Aug 11 08:41:28 2015 -0700

.../serializer/DummySerializerInstance.java | 6 +-
.../unsafe/UnsafeShuffleExternalSorter.java | 22 +--
.../shuffle/unsafe/UnsafeShuffleWriter.java | 4 +-
.../spark/unsafe/map/BytesToBytesMap.java | 20 +-
.../unsafe/sort/PrefixComparators.java | 5 +-
.../unsafe/sort/UnsafeExternalSorter.java | 22 +--
.../unsafe/sort/UnsafeInMemorySorter.java | 4 +-
.../unsafe/sort/UnsafeSorterSpillReader.java | 4 +-
.../unsafe/sort/UnsafeSorterSpillWriter.java | 6 +-
.../UnsafeShuffleInMemorySorterSuite.java | 20 +-
.../map/AbstractBytesToBytesMapSuite.java | 94 +-
.../unsafe/sort/UnsafeExternalSorterSuite.java | 20 +-
.../unsafe/sort/UnsafeInMemorySorterSuite.java | 20 +-
.../catalyst/expressions/UnsafeArrayData.java | 51 ++---
.../sql/catalyst/expressions/UnsafeReaders.java | 8 +-
.../sql/catalyst/expressions/UnsafeRow.java | 108 +--
.../catalyst/expressions/UnsafeRowWriters.java | 41 ++--
.../sql/catalyst/expressions/UnsafeWriters.java | 43 ++---
.../sql/execution/UnsafeExternalRowSorter.java | 4 +-
.../expressions/codegen/CodeGenerator.scala | 4 +-
.../codegen/GenerateUnsafeProjection.scala | 32 ++--
.../codegen/GenerateUnsafeRowJoiner.scala | 16 +-
.../catalyst/expressions/stringOperations.scala | 4 +-
.../GenerateUnsafeRowJoinerBitsetSuite.scala | 4 +-
.../UnsafeFixedWidthAggregationMap.java | 4 +-
.../sql/execution/UnsafeKVExternalSorter.java | 4 +-
.../sql/execution/UnsafeRowSerializer.scala | 6 +-
.../sql/execution/joins/HashedRelation.scala | 13 +-
.../org/apache/spark/sql/UnsafeRowSuite.scala | 4 +-
.../java/org/apache/spark/unsafe/Platform.java | 173 +
.../apache/spark/unsafe/PlatformDependent.java | 187 ---
.../spark/unsafe/array/ByteArrayMethods.java | 14 +-
.../apache/spark/unsafe/array/LongArray.java | 6 +-
.../spark/unsafe/bitset/BitSetMethods.java | 19 +-
.../spark/unsafe/hash/Murmur3_x86_32.java | 4 +-
.../apache/spark/unsafe/memory/MemoryBlock.java | 4 +-
.../unsafe/memory/UnsafeMemoryAllocator.java | 6 +-
.../apache/spark/unsafe/types/ByteArray.java | 10 +-
.../apache/spark/unsafe/types/UTF8String.java | 30 ++-
.../spark/unsafe/hash/Murmur3_x86_32Suite.java | 14 +-
40 files changed, 466 insertions(+), 594 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/84ba990f/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java

diff --git a/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java b/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java
index 0399abc..0e58bb4 100644
--- a/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java
+++
b/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java @@ -25,7 +25,7 @@ import java.nio.ByteBuffer; import scala.reflect.ClassTag; import org.apache.spark.annotation.Private; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; /** * Unfortunately, we need a serializer instance in order to construct a DiskBlockObjectWriter. @@ -49,7 +49,7 @@ public final class DummySerializerInstance extends SerializerInstance { try { s.flush(); } catch (IOException e) { - PlatformDependent.throwException(e); + Platform.throwException(e); } } @@ -64,7 +64,7 @@ public final class DummySerializerInstance extends SerializerInstance { try { s.close(); } catch (IOException e) { - PlatformDependent.throwException(e); + Platform.throwException(e); } } };
spark git commit: [SPARK-9785] [SQL] HashPartitioning compatibility should consider expression ordering
Repository: spark Updated Branches: refs/heads/master d378396f8 -> dfe347d2c

[SPARK-9785] [SQL] HashPartitioning compatibility should consider expression ordering

HashPartitioning compatibility is currently defined w.r.t. the _set_ of expressions, but the ordering of those expressions matters when computing hash codes; this could lead to incorrect answers if we mistakenly avoided a shuffle based on the assumption that HashPartitionings with the same expressions in different orders will produce equivalent row hashcodes. The first commit adds a regression test which illustrates this problem. The fix for this is simple: make `HashPartitioning.compatibleWith` and `HashPartitioning.guarantees` sensitive to the expression ordering (i.e. do not perform set comparison).

Author: Josh Rosen joshro...@databricks.com

Closes #8074 from JoshRosen/hashpartitioning-compatiblewith-fixes and squashes the following commits: b61412f [Josh Rosen] Demonstrate that I haven't cheated in my fix 0b4d7d9 [Josh Rosen] Update so that clusteringSet is only used in satisfies(). dc9c9d7 [Josh Rosen] Add failing regression test for SPARK-9785

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dfe347d2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dfe347d2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dfe347d2 Branch: refs/heads/master Commit: dfe347d2cae3eb05d7539aaf72db3d309e711213 Parents: d378396 Author: Josh Rosen joshro...@databricks.com Authored: Tue Aug 11 08:52:15 2015 -0700 Committer: Yin Huai yh...@databricks.com Committed: Tue Aug 11 08:52:15 2015 -0700

.../catalyst/plans/physical/partitioning.scala | 15 ++
.../spark/sql/catalyst/PartitioningSuite.scala | 55
2 files changed, 60 insertions(+), 10 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/dfe347d2/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
index 5a89a90..5ac3f1f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
@@ -216,26 +216,23 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int)
   override def nullable: Boolean = false
   override def dataType: DataType = IntegerType

-  lazy val clusteringSet = expressions.toSet
-
   override def satisfies(required: Distribution): Boolean = required match {
     case UnspecifiedDistribution => true
     case ClusteredDistribution(requiredClustering) =>
-      clusteringSet.subsetOf(requiredClustering.toSet)
+      expressions.toSet.subsetOf(requiredClustering.toSet)
     case _ => false
   }

   override def compatibleWith(other: Partitioning): Boolean = other match {
-    case o: HashPartitioning =>
-      this.clusteringSet == o.clusteringSet && this.numPartitions == o.numPartitions
+    case o: HashPartitioning => this == o
     case _ => false
   }

   override def guarantees(other: Partitioning): Boolean = other match {
-    case o: HashPartitioning =>
-      this.clusteringSet == o.clusteringSet && this.numPartitions == o.numPartitions
+    case o: HashPartitioning => this == o
     case _ => false
   }
+
 }

 /**
@@ -257,15 +254,13 @@ case class RangePartitioning(ordering: Seq[SortOrder], numPartitions: Int)
   override def nullable: Boolean = false
   override def dataType: DataType = IntegerType

-  private[this] lazy val clusteringSet = ordering.map(_.child).toSet
-
   override def satisfies(required: Distribution): Boolean = required match {
     case UnspecifiedDistribution => true
     case OrderedDistribution(requiredOrdering) =>
       val minSize = Seq(requiredOrdering.size, ordering.size).min
       requiredOrdering.take(minSize) == ordering.take(minSize)
     case ClusteredDistribution(requiredClustering) =>
-      clusteringSet.subsetOf(requiredClustering.toSet)
+      ordering.map(_.child).toSet.subsetOf(requiredClustering.toSet)
     case _ => false
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/dfe347d2/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala
new file mode 100644
index 000..5b802cc
--- /dev/null
+++
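Why the ordering matters: a row's partition is derived by folding a hash over the partitioning expressions in sequence, so two HashPartitionings over the same set of expressions, in different orders, can route identical rows to different partitions. A standalone Scala sketch of the effect (illustrative hash only, not the actual Catalyst code):

  // Order-sensitive combination, the same shape as hashing one column after another.
  def comboHash(cols: Seq[Any]): Int =
    cols.foldLeft(42)((h, c) => h * 31 + c.hashCode)

  val byAB = comboHash(Seq("a", 1))  // HashPartitioning ordered as (a, b)
  val byBA = comboHash(Seq(1, "a"))  // HashPartitioning ordered as (b, a)
  assert(byAB != byBA)  // same set of values, different hash, different partition

Treating the two partitionings as compatible would therefore skip a shuffle that is actually required, which is exactly the incorrect-answer scenario described above.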
[1/2] spark git commit: [SPARK-9815] Rename PlatformDependent.UNSAFE -> Platform.
Repository: spark Updated Branches: refs/heads/master 600031ebe -> d378396f8

http://git-wip-us.apache.org/repos/asf/spark/blob/d378396f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 7bd..134f1aa 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -1013,7 +1013,7 @@ case class Decode(bin: Expression, charset: Expression)
       try {
         ${ev.primitive} = UTF8String.fromString(new String($bytes, $charset.toString()));
       } catch (java.io.UnsupportedEncodingException e) {
-        org.apache.spark.unsafe.PlatformDependent.throwException(e);
+        org.apache.spark.unsafe.Platform.throwException(e);
       }
     """)
   }
@@ -1043,7 +1043,7 @@ case class Encode(value: Expression, charset: Expression)
       try {
         ${ev.primitive} = $string.toString().getBytes($charset.toString());
       } catch (java.io.UnsupportedEncodingException e) {
-        org.apache.spark.unsafe.PlatformDependent.throwException(e);
+        org.apache.spark.unsafe.Platform.throwException(e);
       }""")
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/d378396f/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala
index aff1bee..796d600 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala
@@ -22,7 +22,7 @@
 import scala.util.Random

 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.PlatformDependent
+import org.apache.spark.unsafe.Platform

 /**
  * A test suite for the bitset portion of the row concatenation.
@@ -96,7 +96,7 @@ class GenerateUnsafeRowJoinerBitsetSuite extends SparkFunSuite {
     // This way we can test the joiner when the input UnsafeRows are not the entire arrays.
val offset = numFields * 8 val buf = new Array[Byte](sizeInBytes + offset) -row.pointTo(buf, PlatformDependent.BYTE_ARRAY_OFFSET + offset, numFields, sizeInBytes) +row.pointTo(buf, Platform.BYTE_ARRAY_OFFSET + offset, numFields, sizeInBytes) row } http://git-wip-us.apache.org/repos/asf/spark/blob/d378396f/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java -- diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java index 00218f2..5cce41d 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.expressions.UnsafeRow; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; import org.apache.spark.unsafe.KVIterator; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.map.BytesToBytesMap; import org.apache.spark.unsafe.memory.MemoryLocation; import org.apache.spark.unsafe.memory.TaskMemoryManager; @@ -138,7 +138,7 @@ public final class UnsafeFixedWidthAggregationMap { unsafeGroupingKeyRow.getBaseOffset(), unsafeGroupingKeyRow.getSizeInBytes(), emptyAggregationBuffer, -PlatformDependent.BYTE_ARRAY_OFFSET, +Platform.BYTE_ARRAY_OFFSET, emptyAggregationBuffer.length ); if (!putSucceeded) { http://git-wip-us.apache.org/repos/asf/spark/blob/d378396f/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java -- diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java
[1/2] spark git commit: [SPARK-9815] Rename PlatformDependent.UNSAFE -> Platform.
Repository: spark Updated Branches: refs/heads/branch-1.5 ebbd3b616 -> 84ba990f2

http://git-wip-us.apache.org/repos/asf/spark/blob/84ba990f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 7bd..134f1aa 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -1013,7 +1013,7 @@ case class Decode(bin: Expression, charset: Expression)
       try {
         ${ev.primitive} = UTF8String.fromString(new String($bytes, $charset.toString()));
       } catch (java.io.UnsupportedEncodingException e) {
-        org.apache.spark.unsafe.PlatformDependent.throwException(e);
+        org.apache.spark.unsafe.Platform.throwException(e);
       }
     """)
   }
@@ -1043,7 +1043,7 @@ case class Encode(value: Expression, charset: Expression)
       try {
         ${ev.primitive} = $string.toString().getBytes($charset.toString());
       } catch (java.io.UnsupportedEncodingException e) {
-        org.apache.spark.unsafe.PlatformDependent.throwException(e);
+        org.apache.spark.unsafe.Platform.throwException(e);
       }""")
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/84ba990f/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala
index aff1bee..796d600 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala
@@ -22,7 +22,7 @@
 import scala.util.Random

 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.PlatformDependent
+import org.apache.spark.unsafe.Platform

 /**
  * A test suite for the bitset portion of the row concatenation.
@@ -96,7 +96,7 @@ class GenerateUnsafeRowJoinerBitsetSuite extends SparkFunSuite {
     // This way we can test the joiner when the input UnsafeRows are not the entire arrays.
val offset = numFields * 8 val buf = new Array[Byte](sizeInBytes + offset) -row.pointTo(buf, PlatformDependent.BYTE_ARRAY_OFFSET + offset, numFields, sizeInBytes) +row.pointTo(buf, Platform.BYTE_ARRAY_OFFSET + offset, numFields, sizeInBytes) row } http://git-wip-us.apache.org/repos/asf/spark/blob/84ba990f/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java -- diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java index 00218f2..5cce41d 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.expressions.UnsafeRow; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; import org.apache.spark.unsafe.KVIterator; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.map.BytesToBytesMap; import org.apache.spark.unsafe.memory.MemoryLocation; import org.apache.spark.unsafe.memory.TaskMemoryManager; @@ -138,7 +138,7 @@ public final class UnsafeFixedWidthAggregationMap { unsafeGroupingKeyRow.getBaseOffset(), unsafeGroupingKeyRow.getSizeInBytes(), emptyAggregationBuffer, -PlatformDependent.BYTE_ARRAY_OFFSET, +Platform.BYTE_ARRAY_OFFSET, emptyAggregationBuffer.length ); if (!putSucceeded) { http://git-wip-us.apache.org/repos/asf/spark/blob/84ba990f/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java -- diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java
spark git commit: [SPARK-8345] [ML] Add an SQL node as a feature transformer
Repository: spark Updated Branches: refs/heads/master bce72797f -> 8cad854ef

[SPARK-8345] [ML] Add an SQL node as a feature transformer

Implements the transforms which are defined by SQL statement. Currently we only support SQL syntax like 'SELECT ... FROM __THIS__' where '__THIS__' represents the underlying table of the input dataset.

Author: Yanbo Liang yblia...@gmail.com

Closes #7465 from yanboliang/spark-8345 and squashes the following commits: b403fcb [Yanbo Liang] address comments 0d4bb15 [Yanbo Liang] a better transformSchema() implementation 51eb9e7 [Yanbo Liang] Add an SQL node as a feature transformer

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8cad854e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8cad854e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8cad854e Branch: refs/heads/master Commit: 8cad854ef6a2066de5adffcca6b79a205ccfd5f3 Parents: bce7279 Author: Yanbo Liang yblia...@gmail.com Authored: Tue Aug 11 11:01:59 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 11 11:01:59 2015 -0700

.../spark/ml/feature/SQLTransformer.scala | 72
.../spark/ml/feature/SQLTransformerSuite.scala | 44
2 files changed, 116 insertions(+)

http://git-wip-us.apache.org/repos/asf/spark/blob/8cad854e/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
new file mode 100644
index 000..95e4305
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.SparkContext
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.ml.param.{ParamMap, Param}
+import org.apache.spark.ml.Transformer
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.sql.{SQLContext, DataFrame, Row}
+import org.apache.spark.sql.types.StructType
+
+/**
+ * :: Experimental ::
+ * Implements the transforms which are defined by SQL statement.
+ * Currently we only support SQL syntax like 'SELECT ... FROM __THIS__'
+ * where '__THIS__' represents the underlying table of the input dataset.
+ */
+@Experimental
+class SQLTransformer (override val uid: String) extends Transformer {
+
+  def this() = this(Identifiable.randomUID("sql"))
+
+  /**
+   * SQL statement parameter. The statement is provided in string form.
+   * @group param
+   */
+  final val statement: Param[String] = new Param[String](this, "statement", "SQL statement")
+
+  /** @group setParam */
+  def setStatement(value: String): this.type = set(statement, value)
+
+  /** @group getParam */
+  def getStatement: String = $(statement)
+
+  private val tableIdentifier: String = "__THIS__"
+
+  override def transform(dataset: DataFrame): DataFrame = {
+    val tableName = Identifiable.randomUID(uid)
+    dataset.registerTempTable(tableName)
+    val realStatement = $(statement).replace(tableIdentifier, tableName)
+    val outputDF = dataset.sqlContext.sql(realStatement)
+    outputDF
+  }
+
+  override def transformSchema(schema: StructType): StructType = {
+    val sc = SparkContext.getOrCreate()
+    val sqlContext = SQLContext.getOrCreate(sc)
+    val dummyRDD = sc.parallelize(Seq(Row.empty))
+    val dummyDF = sqlContext.createDataFrame(dummyRDD, schema)
+    dummyDF.registerTempTable(tableIdentifier)
+    val outputSchema = sqlContext.sql($(statement)).schema
+    outputSchema
+  }
+
+  override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra)
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/8cad854e/mllib/src/test/scala/org/apache/spark/ml/feature/SQLTransformerSuite.scala

diff --git
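A usage sketch of the new transformer (the input DataFrame `df` and its columns `v1`, `v2` are hypothetical):

  import org.apache.spark.ml.feature.SQLTransformer

  // df is assumed to be an existing DataFrame with numeric columns v1 and v2.
  val sqlTrans = new SQLTransformer()
    .setStatement("SELECT *, (v1 + v2) AS v3 FROM __THIS__")
  val transformed = sqlTrans.transform(df)  // same rows, plus the derived column v3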
spark git commit: [SPARK-9572] [STREAMING] [PYSPARK] Added StreamingContext.getActiveOrCreate() in Python
Repository: spark Updated Branches: refs/heads/master dbd778d84 -> 5b8bb1b21

[SPARK-9572] [STREAMING] [PYSPARK] Added StreamingContext.getActiveOrCreate() in Python

Author: Tathagata Das tathagata.das1...@gmail.com

Closes #8080 from tdas/SPARK-9572 and squashes the following commits: 64a231d [Tathagata Das] Fix based on comments 741a0d0 [Tathagata Das] Fixed style f4f094c [Tathagata Das] Tweaked test 9afcdbe [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9572 e21488d [Tathagata Das] Minor update 1a371d9 [Tathagata Das] Addressed comments. 60479da [Tathagata Das] Fixed indent 9c2da9c [Tathagata Das] Fixed bugs b5bd32c [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9572 b55b348 [Tathagata Das] Removed prints 5781728 [Tathagata Das] Fix style issues b711214 [Tathagata Das] Reverted run-tests.py 643b59d [Tathagata Das] Revert unnecessary change 150e58c [Tathagata Das] Added StreamingContext.getActiveOrCreate() in Python

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5b8bb1b2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5b8bb1b2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5b8bb1b2 Branch: refs/heads/master Commit: 5b8bb1b213b8738f563fcd00747604410fbb3087 Parents: dbd778d Author: Tathagata Das tathagata.das1...@gmail.com Authored: Tue Aug 11 12:02:28 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 11 12:02:28 2015 -0700

python/pyspark/streaming/context.py | 57 -
python/pyspark/streaming/tests.py | 133 ---
python/run-tests.py | 2 +-
3 files changed, 177 insertions(+), 15 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/5b8bb1b2/python/pyspark/streaming/context.py

diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py
index ac5ba69..e3ba70e 100644
--- a/python/pyspark/streaming/context.py
+++ b/python/pyspark/streaming/context.py
@@ -86,6 +86,9 @@ class StreamingContext(object):

     _transformerSerializer = None

+    # Reference to a currently active StreamingContext
+    _activeContext = None
+
     def __init__(self, sparkContext, batchDuration=None, jssc=None):
         """
         Create a new StreamingContext.
@@ -142,10 +145,10 @@
         Either recreate a StreamingContext from checkpoint data or create a new StreamingContext.
         If checkpoint data exists in the provided `checkpointPath`, then StreamingContext will be
         recreated from the checkpoint data. If the data does not exist, then the provided setupFunc
-        will be used to create a JavaStreamingContext.
+        will be used to create a new context.

-        @param checkpointPath: Checkpoint directory used in an earlier JavaStreamingContext program
-        @param setupFunc: Function to create a new JavaStreamingContext and setup DStreams
+        @param checkpointPath: Checkpoint directory used in an earlier streaming program
+        @param setupFunc: Function to create a new context and setup DStreams
         """
         # TODO: support checkpoint in HDFS
         if not os.path.exists(checkpointPath) or not os.listdir(checkpointPath):
@@ -170,6 +173,52 @@
         cls._transformerSerializer.ctx = sc
         return StreamingContext(sc, None, jssc)

+    @classmethod
+    def getActive(cls):
+        """
+        Return either the currently active StreamingContext (i.e., if there is a context started
+        but not stopped) or None.
+        """
+        activePythonContext = cls._activeContext
+        if activePythonContext is not None:
+            # Verify that the current running Java StreamingContext is active and is the same one
+            # backing the supposedly active Python context
+            activePythonContextJavaId = activePythonContext._jssc.ssc().hashCode()
+            activeJvmContextOption = activePythonContext._jvm.StreamingContext.getActive()
+
+            if activeJvmContextOption.isEmpty():
+                cls._activeContext = None
+            elif activeJvmContextOption.get().hashCode() != activePythonContextJavaId:
+                cls._activeContext = None
+                raise Exception("JVM's active JavaStreamingContext is not the JavaStreamingContext "
+                                "backing the action Python StreamingContext. This is unexpected.")
+        return cls._activeContext
+
+    @classmethod
+    def getActiveOrCreate(cls, checkpointPath, setupFunc):
+        """
+        Either return the active StreamingContext (i.e. currently started but not stopped),
+        or
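The new Python method mirrors the corresponding Scala API, so the intended usage pattern can be sketched in Scala (the checkpoint path and batch interval are made up):

  import org.apache.spark.SparkConf
  import org.apache.spark.streaming.{Seconds, StreamingContext}

  def createContext(): StreamingContext = {
    val ssc = new StreamingContext(new SparkConf().setAppName("example"), Seconds(1))
    // ... define DStreams here ...
    ssc.checkpoint("/tmp/checkpoint")  // hypothetical checkpoint directory
    ssc
  }

  // Active context if one is running; else recreate from the checkpoint; else createContext().
  val ssc = StreamingContext.getActiveOrCreate("/tmp/checkpoint", createContext _)
  ssc.start()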
spark git commit: [SPARK-9572] [STREAMING] [PYSPARK] Added StreamingContext.getActiveOrCreate() in Python
Repository: spark Updated Branches: refs/heads/branch-1.5 b077f36ea -> 71460b889

[SPARK-9572] [STREAMING] [PYSPARK] Added StreamingContext.getActiveOrCreate() in Python

Author: Tathagata Das tathagata.das1...@gmail.com

Closes #8080 from tdas/SPARK-9572 and squashes the following commits: 64a231d [Tathagata Das] Fix based on comments 741a0d0 [Tathagata Das] Fixed style f4f094c [Tathagata Das] Tweaked test 9afcdbe [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9572 e21488d [Tathagata Das] Minor update 1a371d9 [Tathagata Das] Addressed comments. 60479da [Tathagata Das] Fixed indent 9c2da9c [Tathagata Das] Fixed bugs b5bd32c [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9572 b55b348 [Tathagata Das] Removed prints 5781728 [Tathagata Das] Fix style issues b711214 [Tathagata Das] Reverted run-tests.py 643b59d [Tathagata Das] Revert unnecessary change 150e58c [Tathagata Das] Added StreamingContext.getActiveOrCreate() in Python

(cherry picked from commit 5b8bb1b213b8738f563fcd00747604410fbb3087) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71460b88 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71460b88 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71460b88 Branch: refs/heads/branch-1.5 Commit: 71460b889b4fd7345706a84e26132c216625df95 Parents: b077f36 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Tue Aug 11 12:02:28 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 11 12:02:44 2015 -0700

python/pyspark/streaming/context.py | 57 -
python/pyspark/streaming/tests.py | 133 ---
python/run-tests.py | 2 +-
3 files changed, 177 insertions(+), 15 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/71460b88/python/pyspark/streaming/context.py

diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py
index ac5ba69..e3ba70e 100644
--- a/python/pyspark/streaming/context.py
+++ b/python/pyspark/streaming/context.py
@@ -86,6 +86,9 @@ class StreamingContext(object):

     _transformerSerializer = None

+    # Reference to a currently active StreamingContext
+    _activeContext = None
+
     def __init__(self, sparkContext, batchDuration=None, jssc=None):
         """
         Create a new StreamingContext.
@@ -142,10 +145,10 @@
         Either recreate a StreamingContext from checkpoint data or create a new StreamingContext.
         If checkpoint data exists in the provided `checkpointPath`, then StreamingContext will be
         recreated from the checkpoint data. If the data does not exist, then the provided setupFunc
-        will be used to create a JavaStreamingContext.
+        will be used to create a new context.

-        @param checkpointPath: Checkpoint directory used in an earlier JavaStreamingContext program
-        @param setupFunc: Function to create a new JavaStreamingContext and setup DStreams
+        @param checkpointPath: Checkpoint directory used in an earlier streaming program
+        @param setupFunc: Function to create a new context and setup DStreams
         """
         # TODO: support checkpoint in HDFS
         if not os.path.exists(checkpointPath) or not os.listdir(checkpointPath):
@@ -170,6 +173,52 @@
         cls._transformerSerializer.ctx = sc
         return StreamingContext(sc, None, jssc)

+    @classmethod
+    def getActive(cls):
+        """
+        Return either the currently active StreamingContext (i.e., if there is a context started
+        but not stopped) or None.
+        """
+        activePythonContext = cls._activeContext
+        if activePythonContext is not None:
+            # Verify that the current running Java StreamingContext is active and is the same one
+            # backing the supposedly active Python context
+            activePythonContextJavaId = activePythonContext._jssc.ssc().hashCode()
+            activeJvmContextOption = activePythonContext._jvm.StreamingContext.getActive()
+
+            if activeJvmContextOption.isEmpty():
+                cls._activeContext = None
+            elif activeJvmContextOption.get().hashCode() != activePythonContextJavaId:
+                cls._activeContext = None
+                raise Exception("JVM's active JavaStreamingContext is not the JavaStreamingContext "
+                                "backing the action Python StreamingContext. This is unexpected.")
+        return cls._activeContext
+
+    @classmethod
+    def getActiveOrCreate(cls,
spark git commit: [SPARK-8764] [ML] string indexer should take option to handle unseen values
Repository: spark Updated Branches: refs/heads/master 8cad854ef -> dbd778d84

[SPARK-8764] [ML] string indexer should take option to handle unseen values

As a precursor to adding a public constructor, add an option to handle unseen values by skipping rather than throwing an exception (the default remains throwing an exception).

Author: Holden Karau hol...@pigscanfly.ca

Closes #7266 from holdenk/SPARK-8764-string-indexer-should-take-option-to-handle-unseen-values and squashes the following commits: 38a4de9 [Holden Karau] fix long line 045bf22 [Holden Karau] Add a second b entry so b gets 0 for sure 81dd312 [Holden Karau] Update the docs for handleInvalid param to be more descriptive 7f37f6e [Holden Karau] remove extra space (scala style) 414e249 [Holden Karau] And switch to using handleInvalid instead of skipInvalid 1e53f9b [Holden Karau] update the param (codegen side) 7a22215 [Holden Karau] fix typo 100a39b [Holden Karau] Merge in master aa5b093 [Holden Karau] Since we filter we should never go down this code path if getSkipInvalid is true 75ffa69 [Holden Karau] Remove extra newline d69ef5e [Holden Karau] Add a test b5734be [Holden Karau] Add support for unseen labels afecd4e [Holden Karau] Add a param to skip invalid entries.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dbd778d8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dbd778d8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dbd778d8 Branch: refs/heads/master Commit: dbd778d84d094ca142bc08c351478595b280bc2a Parents: 8cad854 Author: Holden Karau hol...@pigscanfly.ca Authored: Tue Aug 11 11:33:36 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Tue Aug 11 11:33:36 2015 -0700

.../apache/spark/ml/feature/StringIndexer.scala | 26 +---
.../ml/param/shared/SharedParamsCodeGen.scala | 4 +++
.../spark/ml/param/shared/sharedParams.scala | 15 +
.../spark/ml/feature/StringIndexerSuite.scala | 32
4 files changed, 73 insertions(+), 4 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/dbd778d8/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
index ebfa972..e4485eb 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
@@ -33,7 +33,8 @@ import org.apache.spark.util.collection.OpenHashMap
 /**
  * Base trait for [[StringIndexer]] and [[StringIndexerModel]].
  */
-private[feature] trait StringIndexerBase extends Params with HasInputCol with HasOutputCol {
+private[feature] trait StringIndexerBase extends Params with HasInputCol with HasOutputCol
+    with HasHandleInvalid {

   /** Validates and transforms the input schema. */
   protected def validateAndTransformSchema(schema: StructType): StructType = {
@@ -66,12 +67,15 @@ class StringIndexer(override val uid: String) extends Estimator[StringIndexerMod
   def this() = this(Identifiable.randomUID("strIdx"))

   /** @group setParam */
+  def setHandleInvalid(value: String): this.type = set(handleInvalid, value)
+  setDefault(handleInvalid, "error")
+
+  /** @group setParam */
   def setInputCol(value: String): this.type = set(inputCol, value)

   /** @group setParam */
   def setOutputCol(value: String): this.type = set(outputCol, value)

-  // TODO: handle unseen labels
   override def fit(dataset: DataFrame): StringIndexerModel = {
     val counts = dataset.select(col($(inputCol)).cast(StringType))
@@ -112,6 +116,10 @@ class StringIndexerModel private[ml] (
   }

   /** @group setParam */
+  def setHandleInvalid(value: String): this.type = set(handleInvalid, value)
+  setDefault(handleInvalid, "error")
+
+  /** @group setParam */
   def setInputCol(value: String): this.type = set(inputCol, value)

   /** @group setParam */
@@ -128,14 +136,24 @@ class StringIndexerModel private[ml] (
       if (labelToIndex.contains(label)) {
         labelToIndex(label)
       } else {
-        // TODO: handle unseen labels
         throw new SparkException(s"Unseen label: $label.")
       }
     }
+
     val outputColName = $(outputCol)
     val metadata = NominalAttribute.defaultAttr
       .withName(outputColName).withValues(labels).toMetadata()
-    dataset.select(col("*"),
+    // If we are skipping invalid records, filter them out.
+    val filteredDataset = (getHandleInvalid) match {
+      case "skip" => {
+        val filterer = udf { label: String =>
+          labelToIndex.contains(label)
+        }
+
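A usage sketch with hypothetical column names and DataFrames: leaving `handleInvalid` at its default of "error" keeps the old fail-fast behavior, while "skip" silently drops rows whose label was never seen during fit():

  import org.apache.spark.ml.feature.StringIndexer

  val indexer = new StringIndexer()
    .setInputCol("category")
    .setOutputCol("categoryIndex")
    .setHandleInvalid("skip")  // default: "error"

  val model = indexer.fit(trainingDF)    // trainingDF and testDF are assumed to exist
  val indexed = model.transform(testDF)  // rows with labels unseen at fit time are filtered out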
[2/2] spark git commit: [SPARK-9646] [SQL] Add metrics for all join and aggregate operators
[SPARK-9646] [SQL] Add metrics for all join and aggregate operators

This PR added metrics for all join and aggregate operators. However, I found the metrics may be confusing in the following two cases: 1. The iterator is not totally consumed and the metric values will be less. 2. Recreating the iterators will make metric values look bigger than the size of the input source, such as `CartesianProduct`.

Author: zsxwing zsxw...@gmail.com

Closes #8060 from zsxwing/sql-metrics and squashes the following commits: 40f3fc1 [zsxwing] Mark LongSQLMetric private[metric] to avoid using incorrectly and leak memory b1b9071 [zsxwing] Merge branch 'master' into sql-metrics 4bef25a [zsxwing] Add metrics for SortMergeOuterJoin 95ccfc6 [zsxwing] Merge branch 'master' into sql-metrics 67cb4dd [zsxwing] Add metrics for Project and TungstenProject; remove metrics from PhysicalRDD and LocalTableScan 0eb47d4 [zsxwing] Merge branch 'master' into sql-metrics dd9d932 [zsxwing] Avoid creating new Iterators 589ea26 [zsxwing] Add metrics for all join and aggregate operators

(cherry picked from commit 5831294a7a8fa2524133c5d718cbc8187d2b0620) Signed-off-by: Yin Huai yh...@databricks.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/767ee188 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/767ee188 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/767ee188 Branch: refs/heads/branch-1.5 Commit: 767ee1884b8ecba3afa8ed19a562626361d54f50 Parents: 71460b8 Author: zsxwing zsxw...@gmail.com Authored: Tue Aug 11 12:39:13 2015 -0700 Committer: Yin Huai yh...@databricks.com Committed: Tue Aug 11 12:39:39 2015 -0700

.../apache/spark/sql/execution/Aggregate.scala | 11 +
.../spark/sql/execution/ExistingRDD.scala | 2 -
.../spark/sql/execution/LocalTableScan.scala | 2 -
.../apache/spark/sql/execution/SparkPlan.scala | 25 +-
.../aggregate/SortBasedAggregate.scala | 12 +-
.../SortBasedAggregationIterator.scala | 18 +-
.../execution/aggregate/TungstenAggregate.scala | 12 +-
.../aggregate/TungstenAggregationIterator.scala | 11 +-
.../spark/sql/execution/basicOperators.scala | 36 +-
.../sql/execution/joins/BroadcastHashJoin.scala | 30 +-
.../joins/BroadcastHashOuterJoin.scala | 40 +-
.../joins/BroadcastLeftSemiJoinHash.scala | 24 +-
.../joins/BroadcastNestedLoopJoin.scala | 27 +-
.../sql/execution/joins/CartesianProduct.scala | 25 +-
.../spark/sql/execution/joins/HashJoin.scala | 7 +-
.../sql/execution/joins/HashOuterJoin.scala | 30 +-
.../sql/execution/joins/HashSemiJoin.scala | 23 +-
.../sql/execution/joins/HashedRelation.scala | 8 +-
.../sql/execution/joins/LeftSemiJoinBNL.scala | 19 +-
.../sql/execution/joins/LeftSemiJoinHash.scala | 18 +-
.../sql/execution/joins/ShuffledHashJoin.scala | 16 +-
.../execution/joins/ShuffledHashOuterJoin.scala | 29 +-
.../sql/execution/joins/SortMergeJoin.scala | 21 +-
.../execution/joins/SortMergeOuterJoin.scala | 38 +-
.../spark/sql/execution/metric/SQLMetrics.scala | 6 +
.../execution/joins/HashedRelationSuite.scala | 14 +-
.../sql/execution/metric/SQLMetricsSuite.scala | 450 ++-
27 files changed, 847 insertions(+), 107 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/767ee188/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
index e8c6a0f..f3b6a3a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
@@ -25,6 +25,7 @@
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.errors._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical._
+import org.apache.spark.sql.execution.metric.SQLMetrics

 /**
  * :: DeveloperApi ::
@@ -45,6 +46,10 @@ case class Aggregate(
     child: SparkPlan)
   extends UnaryNode {

+  override private[sql] lazy val metrics = Map(
+    "numInputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of input rows"),
+    "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows"))
+
   override def requiredChildDistribution: List[Distribution] = {
     if (partial) {
       UnspecifiedDistribution :: Nil
@@ -121,12 +126,15 @@ case class Aggregate(
   }

   protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") {
+    val numInputRows = longMetric("numInputRows")
+    val numOutputRows =
[2/2] spark git commit: [SPARK-9646] [SQL] Add metrics for all join and aggregate operators
[SPARK-9646] [SQL] Add metrics for all join and aggregate operators

This PR added metrics for all join and aggregate operators. However, I found the metrics may be confusing in the following two cases: 1. The iterator is not totally consumed and the metric values will be less. 2. Recreating the iterators will make metric values look bigger than the size of the input source, such as `CartesianProduct`.

Author: zsxwing zsxw...@gmail.com

Closes #8060 from zsxwing/sql-metrics and squashes the following commits: 40f3fc1 [zsxwing] Mark LongSQLMetric private[metric] to avoid using incorrectly and leak memory b1b9071 [zsxwing] Merge branch 'master' into sql-metrics 4bef25a [zsxwing] Add metrics for SortMergeOuterJoin 95ccfc6 [zsxwing] Merge branch 'master' into sql-metrics 67cb4dd [zsxwing] Add metrics for Project and TungstenProject; remove metrics from PhysicalRDD and LocalTableScan 0eb47d4 [zsxwing] Merge branch 'master' into sql-metrics dd9d932 [zsxwing] Avoid creating new Iterators 589ea26 [zsxwing] Add metrics for all join and aggregate operators

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5831294a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5831294a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5831294a Branch: refs/heads/master Commit: 5831294a7a8fa2524133c5d718cbc8187d2b0620 Parents: 5b8bb1b Author: zsxwing zsxw...@gmail.com Authored: Tue Aug 11 12:39:13 2015 -0700 Committer: Yin Huai yh...@databricks.com Committed: Tue Aug 11 12:39:13 2015 -0700

.../apache/spark/sql/execution/Aggregate.scala | 11 +
.../spark/sql/execution/ExistingRDD.scala | 2 -
.../spark/sql/execution/LocalTableScan.scala | 2 -
.../apache/spark/sql/execution/SparkPlan.scala | 25 +-
.../aggregate/SortBasedAggregate.scala | 12 +-
.../SortBasedAggregationIterator.scala | 18 +-
.../execution/aggregate/TungstenAggregate.scala | 12 +-
.../aggregate/TungstenAggregationIterator.scala | 11 +-
.../spark/sql/execution/basicOperators.scala | 36 +-
.../sql/execution/joins/BroadcastHashJoin.scala | 30 +-
.../joins/BroadcastHashOuterJoin.scala | 40 +-
.../joins/BroadcastLeftSemiJoinHash.scala | 24 +-
.../joins/BroadcastNestedLoopJoin.scala | 27 +-
.../sql/execution/joins/CartesianProduct.scala | 25 +-
.../spark/sql/execution/joins/HashJoin.scala | 7 +-
.../sql/execution/joins/HashOuterJoin.scala | 30 +-
.../sql/execution/joins/HashSemiJoin.scala | 23 +-
.../sql/execution/joins/HashedRelation.scala | 8 +-
.../sql/execution/joins/LeftSemiJoinBNL.scala | 19 +-
.../sql/execution/joins/LeftSemiJoinHash.scala | 18 +-
.../sql/execution/joins/ShuffledHashJoin.scala | 16 +-
.../execution/joins/ShuffledHashOuterJoin.scala | 29 +-
.../sql/execution/joins/SortMergeJoin.scala | 21 +-
.../execution/joins/SortMergeOuterJoin.scala | 38 +-
.../spark/sql/execution/metric/SQLMetrics.scala | 6 +
.../execution/joins/HashedRelationSuite.scala | 14 +-
.../sql/execution/metric/SQLMetricsSuite.scala | 450 ++-
27 files changed, 847 insertions(+), 107 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/5831294a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
index e8c6a0f..f3b6a3a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
@@ -25,6 +25,7 @@
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.errors._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical._
+import org.apache.spark.sql.execution.metric.SQLMetrics

 /**
  * :: DeveloperApi ::
@@ -45,6 +46,10 @@ case class Aggregate(
     child: SparkPlan)
   extends UnaryNode {

+  override private[sql] lazy val metrics = Map(
+    "numInputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of input rows"),
+    "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows"))
+
   override def requiredChildDistribution: List[Distribution] = {
     if (partial) {
       UnspecifiedDistribution :: Nil
@@ -121,12 +126,15 @@ case class Aggregate(
   }

   protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") {
+    val numInputRows = longMetric("numInputRows")
+    val numOutputRows = longMetric("numOutputRows")
     if (groupingExpressions.isEmpty) {
       child.execute().mapPartitions { iter =>
         val buffer =
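The two caveats in the description are inherent to counting rows inside iterators rather than anything specific to this patch. A minimal standalone Scala sketch of both effects (the SQLMetrics machinery is not involved):

  var numOutputRows = 0L
  def counted[T](it: Iterator[T]): Iterator[T] = it.map { r => numOutputRows += 1; r }

  // Caveat 1: a downstream operator stops early (e.g. a limit), so the metric undercounts.
  counted(Iterator(1, 2, 3, 4)).take(2).foreach(_ => ())
  assert(numOutputRows == 2)  // four rows existed, only two were pulled

  // Caveat 2: the iterator is recreated per probe (e.g. CartesianProduct), so it overcounts.
  numOutputRows = 0L
  val rows = List(1, 2)
  (1 to 3).foreach(_ => counted(rows.iterator).foreach(_ => ()))
  assert(numOutputRows == 6)  // a two-row input counted three times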
[1/2] spark git commit: [SPARK-9646] [SQL] Add metrics for all join and aggregate operators
Repository: spark Updated Branches: refs/heads/branch-1.5 71460b889 - 767ee1884 http://git-wip-us.apache.org/repos/asf/spark/blob/767ee188/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index 953284c..7383d3f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -25,15 +25,24 @@ import com.esotericsoftware.reflectasm.shaded.org.objectweb.asm._ import com.esotericsoftware.reflectasm.shaded.org.objectweb.asm.Opcodes._ import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.test.TestSQLContext +import org.apache.spark.sql._ +import org.apache.spark.sql.execution.ui.SparkPlanGraph +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.test.{SQLTestUtils, TestSQLContext} import org.apache.spark.util.Utils +class SQLMetricsSuite extends SparkFunSuite with SQLTestUtils { -class SQLMetricsSuite extends SparkFunSuite { + override val sqlContext = TestSQLContext + + import sqlContext.implicits._ test(LongSQLMetric should not box Long) { val l = SQLMetrics.createLongMetric(TestSQLContext.sparkContext, long) -val f = () = { l += 1L } +val f = () = { + l += 1L + l.add(1L) +} BoxingFinder.getClassReader(f.getClass).foreach { cl = val boxingFinder = new BoxingFinder() cl.accept(boxingFinder, 0) @@ -51,6 +60,441 @@ class SQLMetricsSuite extends SparkFunSuite { assert(boxingFinder.boxingInvokes.nonEmpty, Found find boxing in this test) } } + + /** + * Call `df.collect()` and verify if the collected metrics are same as expectedMetrics. + * + * @param df `DataFrame` to run + * @param expectedNumOfJobs number of jobs that will run + * @param expectedMetrics the expected metrics. The format is + *`nodeId - (operatorName, metric name - metric value)`. + */ + private def testSparkPlanMetrics( + df: DataFrame, + expectedNumOfJobs: Int, + expectedMetrics: Map[Long, (String, Map[String, Any])]): Unit = { +val previousExecutionIds = TestSQLContext.listener.executionIdToData.keySet +df.collect() +TestSQLContext.sparkContext.listenerBus.waitUntilEmpty(1) +val executionIds = TestSQLContext.listener.executionIdToData.keySet.diff(previousExecutionIds) +assert(executionIds.size === 1) +val executionId = executionIds.head +val jobs = TestSQLContext.listener.getExecution(executionId).get.jobs +// Use = because there is a race condition that we may miss some jobs +// TODO Change it to = once we fix the race condition that missing the JobStarted event. +assert(jobs.size = expectedNumOfJobs) +if (jobs.size == expectedNumOfJobs) { + // If we can track all jobs, check the metric values + val metricValues = TestSQLContext.listener.getExecutionMetrics(executionId) + val actualMetrics = SparkPlanGraph(df.queryExecution.executedPlan).nodes.filter { node = +expectedMetrics.contains(node.id) + }.map { node = +val nodeMetrics = node.metrics.map { metric = + val metricValue = metricValues(metric.accumulatorId) + (metric.name, metricValue) +}.toMap +(node.id, node.name - nodeMetrics) + }.toMap + assert(expectedMetrics === actualMetrics) +} else { + // TODO Remove this else once we fix the race condition that missing the JobStarted event. + // Since we cannot track all jobs, the metric values could be wrong and we should not check + // them. 
+ logWarning("Due to a race condition, we miss some jobs and cannot verify the metric values") +} + } + + test("Project metrics") { +withSQLConf( + SQLConf.UNSAFE_ENABLED.key -> "false", + SQLConf.CODEGEN_ENABLED.key -> "false", + SQLConf.TUNGSTEN_ENABLED.key -> "false") { + // Assume the execution plan is + // PhysicalRDD(nodeId = 1) -> Project(nodeId = 0) + val df = TestData.person.select('name) + testSparkPlanMetrics(df, 1, Map( +0L -> ("Project", Map( + "number of rows" -> 2L))) + ) +} + } + + test("TungstenProject metrics") { +withSQLConf( + SQLConf.UNSAFE_ENABLED.key -> "true", + SQLConf.CODEGEN_ENABLED.key -> "true", + SQLConf.TUNGSTEN_ENABLED.key -> "true") { + // Assume the execution plan is + // PhysicalRDD(nodeId = 1) -> TungstenProject(nodeId = 0) + val df = TestData.person.select('name) + testSparkPlanMetrics(df, 1, Map( +0L -> ("TungstenProject", Map( + "number of rows" -> 2L))) + ) +} + } + + test("Filter metrics") { +// Assume the execution plan is +//
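For readers following the helper above: each metrics test in this suite builds a small DataFrame, records the expected plan shape in a comment, and hands testSparkPlanMetrics a map keyed by plan-node id. A minimal sketch of one more test in the same style — the operator name, node id, job count, and metric names below are illustrative assumptions, not values taken from this patch:

  test("Join metrics (illustrative sketch)") {
    // Hypothetical plan: two scans feeding a join node assumed to have id 0.
    val df1 = Seq((1, "a"), (2, "b")).toDF("key", "value")
    val df2 = Seq((1, "x"), (3, "y")).toDF("key", "value2")
    val joined = df1.join(df2, "key")
    // Format: nodeId -> (operatorName, metric name -> metric value).
    // Exactly one key matches, so we assume a single output row here.
    testSparkPlanMetrics(joined, 1, Map(
      0L -> ("ShuffledHashJoin", Map(
        "number of output rows" -> 1L)))
    )
  }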
spark git commit: [SPARK-9750] [MLLIB] Improve equals on SparseMatrix and DenseMatrix
Repository: spark Updated Branches: refs/heads/master 5831294a7 -> 520ad44b1 [SPARK-9750] [MLLIB] Improve equals on SparseMatrix and DenseMatrix Adds unit test for `equals` on `mllib.linalg.Matrix` class and `equals` to both `SparseMatrix` and `DenseMatrix`. Supports equality testing between `SparseMatrix` and `DenseMatrix`. mengxr Author: Feynman Liang fli...@databricks.com Closes #8042 from feynmanliang/SPARK-9750 and squashes the following commits: bb70d5e [Feynman Liang] Breeze compare for dense matrices as well, in case other is sparse ab6f3c8 [Feynman Liang] Sparse matrix compare for equals 22782df [Feynman Liang] Add equality based on matrix semantics, not representation 78f9426 [Feynman Liang] Add casts 43d28fa [Feynman Liang] Fix failing test 6416fa0 [Feynman Liang] Add failing sparse matrix equals tests Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/520ad44b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/520ad44b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/520ad44b Branch: refs/heads/master Commit: 520ad44b17f72e6465bf990f64b4e289f8a83447 Parents: 5831294 Author: Feynman Liang fli...@databricks.com Authored: Tue Aug 11 12:49:47 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Tue Aug 11 12:49:47 2015 -0700 -- .../org/apache/spark/mllib/linalg/Matrices.scala | 8 ++-- .../apache/spark/mllib/linalg/MatricesSuite.scala | 18 ++ 2 files changed, 24 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/520ad44b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 1c85834..1139ce3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -257,8 +257,7 @@ class DenseMatrix( this(numRows, numCols, values, false) override def equals(o: Any): Boolean = o match { -case m: DenseMatrix => - m.numRows == numRows && m.numCols == numCols && Arrays.equals(toArray, m.toArray) +case m: Matrix => toBreeze == m.toBreeze case _ => false } @@ -519,6 +518,11 @@ class SparseMatrix( rowIndices: Array[Int], values: Array[Double]) = this(numRows, numCols, colPtrs, rowIndices, values, false) + override def equals(o: Any): Boolean = o match { +case m: Matrix => toBreeze == m.toBreeze +case _ => false + } + private[mllib] def toBreeze: BM[Double] = { if (!isTransposed) { new BSM[Double](values, numRows, numCols, colPtrs, rowIndices) http://git-wip-us.apache.org/repos/asf/spark/blob/520ad44b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala index a270ba2..bfd6d54 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala @@ -74,6 +74,24 @@ class MatricesSuite extends SparkFunSuite { } } + test("equals") { +val dm1 = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)) +assert(dm1 === dm1) +assert(dm1 !== dm1.transpose) + +val dm2 = Matrices.dense(2, 2, Array(0.0, 2.0, 1.0, 3.0)) +assert(dm1 === dm2.transpose) + +val sm1 = dm1.asInstanceOf[DenseMatrix].toSparse +assert(sm1 === sm1) +assert(sm1 === dm1) +assert(sm1 !== sm1.transpose) + +val sm2 =
dm2.asInstanceOf[DenseMatrix].toSparse +assert(sm1 === sm2.transpose) +assert(sm1 === dm2.transpose) + } + test("matrix copies are deep copies") { val m = 3 val n = 2 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
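The practical upshot of the change above: equality is now decided by the Breeze view of the matrix contents, so a SparseMatrix and a DenseMatrix holding the same values compare equal regardless of storage layout. A small usage sketch (not part of the patch):

import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices}

object MatrixEqualsDemo {
  def main(args: Array[String]): Unit = {
    // 2x2 matrix in column-major order: first column (0, 1), second column (2, 3).
    val dense = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0))
    val sparse = dense.asInstanceOf[DenseMatrix].toSparse
    println(dense == sparse)          // true: same semantics, different representation
    println(dense == dense.transpose) // false: transposition changes the semantics
  }
}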
spark git commit: [SPARK-9750] [MLLIB] Improve equals on SparseMatrix and DenseMatrix
Repository: spark Updated Branches: refs/heads/branch-1.5 767ee1884 -> 811d23f1c [SPARK-9750] [MLLIB] Improve equals on SparseMatrix and DenseMatrix Adds unit test for `equals` on `mllib.linalg.Matrix` class and `equals` to both `SparseMatrix` and `DenseMatrix`. Supports equality testing between `SparseMatrix` and `DenseMatrix`. mengxr Author: Feynman Liang fli...@databricks.com Closes #8042 from feynmanliang/SPARK-9750 and squashes the following commits: bb70d5e [Feynman Liang] Breeze compare for dense matrices as well, in case other is sparse ab6f3c8 [Feynman Liang] Sparse matrix compare for equals 22782df [Feynman Liang] Add equality based on matrix semantics, not representation 78f9426 [Feynman Liang] Add casts 43d28fa [Feynman Liang] Fix failing test 6416fa0 [Feynman Liang] Add failing sparse matrix equals tests (cherry picked from commit 520ad44b17f72e6465bf990f64b4e289f8a83447) Signed-off-by: Joseph K. Bradley jos...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/811d23f1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/811d23f1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/811d23f1 Branch: refs/heads/branch-1.5 Commit: 811d23f1c27e7f461f0d37d058c07885fb0e0750 Parents: 767ee18 Author: Feynman Liang fli...@databricks.com Authored: Tue Aug 11 12:49:47 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Tue Aug 11 12:49:56 2015 -0700 -- .../org/apache/spark/mllib/linalg/Matrices.scala | 8 ++-- .../apache/spark/mllib/linalg/MatricesSuite.scala | 18 ++ 2 files changed, 24 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/811d23f1/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 1c85834..1139ce3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -257,8 +257,7 @@ class DenseMatrix( this(numRows, numCols, values, false) override def equals(o: Any): Boolean = o match { -case m: DenseMatrix => - m.numRows == numRows && m.numCols == numCols && Arrays.equals(toArray, m.toArray) +case m: Matrix => toBreeze == m.toBreeze case _ => false } @@ -519,6 +518,11 @@ class SparseMatrix( rowIndices: Array[Int], values: Array[Double]) = this(numRows, numCols, colPtrs, rowIndices, values, false) + override def equals(o: Any): Boolean = o match { +case m: Matrix => toBreeze == m.toBreeze +case _ => false + } + private[mllib] def toBreeze: BM[Double] = { if (!isTransposed) { new BSM[Double](values, numRows, numCols, colPtrs, rowIndices) http://git-wip-us.apache.org/repos/asf/spark/blob/811d23f1/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala index a270ba2..bfd6d54 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala @@ -74,6 +74,24 @@ class MatricesSuite extends SparkFunSuite { } } + test("equals") { +val dm1 = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)) +assert(dm1 === dm1) +assert(dm1 !== dm1.transpose) + +val dm2 = Matrices.dense(2, 2, Array(0.0, 2.0, 1.0, 3.0)) +assert(dm1 === dm2.transpose) + +val sm1 =
dm1.asInstanceOf[DenseMatrix].toSparse +assert(sm1 === sm1) +assert(sm1 === dm1) +assert(sm1 !== sm1.transpose) + +val sm2 = dm2.asInstanceOf[DenseMatrix].toSparse +assert(sm1 === sm2.transpose) +assert(sm1 === dm2.transpose) + } + test("matrix copies are deep copies") { val m = 3 val n = 2 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [HOTFIX] Fix style error caused by 017b5de
Repository: spark Updated Branches: refs/heads/master 017b5de07 -> 736af95bd [HOTFIX] Fix style error caused by 017b5de Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/736af95b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/736af95b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/736af95b Branch: refs/heads/master Commit: 736af95bd0c41723d455246b634a0fb68b38a7c7 Parents: 017b5de Author: Andrew Or and...@databricks.com Authored: Tue Aug 11 14:52:52 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Tue Aug 11 14:52:52 2015 -0700 -- mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/736af95b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 26eb84a..11ed231 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -114,7 +114,7 @@ object MLUtils { } // Convenient methods for `loadLibSVMFile`. - + /** * @since 1.0.0 */ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [HOTFIX] Fix style error caused by ef961ed48a4f45447f0e0ad256b040c7ab2d78d9
Repository: spark Updated Branches: refs/heads/branch-1.5 725e5c7a4 -> 1067c7369 [HOTFIX] Fix style error caused by ef961ed48a4f45447f0e0ad256b040c7ab2d78d9 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1067c736 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1067c736 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1067c736 Branch: refs/heads/branch-1.5 Commit: 1067c73693c52facddfb5e425e9caaf7a1cb364b Parents: 725e5c7 Author: Andrew Or and...@databricks.com Authored: Tue Aug 11 14:52:52 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Tue Aug 11 14:57:23 2015 -0700 -- mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1067c736/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 26eb84a..11ed231 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -114,7 +114,7 @@ object MLUtils { } // Convenient methods for `loadLibSVMFile`. - + /** * @since 1.0.0 */ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9831] [SQL] fix serialization with empty broadcast
Repository: spark Updated Branches: refs/heads/master 74a293f45 -> c3e9a120e [SPARK-9831] [SQL] fix serialization with empty broadcast Author: Davies Liu dav...@databricks.com Closes #8117 from davies/fix_serialization and squashes the following commits: d21ac71 [Davies Liu] fix serialization with empty broadcast Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c3e9a120 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c3e9a120 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c3e9a120 Branch: refs/heads/master Commit: c3e9a120e33159fb45cd99f3a55fc5cf16cd7c6c Parents: 74a293f Author: Davies Liu dav...@databricks.com Authored: Tue Aug 11 22:45:18 2015 -0700 Committer: Davies Liu davies@gmail.com Committed: Tue Aug 11 22:45:18 2015 -0700 -- .../spark/sql/execution/joins/HashedRelation.scala | 2 +- .../sql/execution/joins/HashedRelationSuite.scala | 17 + 2 files changed, 18 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c3e9a120/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index c1bc794..076afe6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -299,7 +299,7 @@ private[joins] final class UnsafeHashedRelation( binaryMap = new BytesToBytesMap( taskMemoryManager, shuffleMemoryManager, - nKeys * 2, // reduce hash collision + (nKeys * 1.5 + 1).toInt, // reduce hash collision pageSizeBytes) var i = 0 http://git-wip-us.apache.org/repos/asf/spark/blob/c3e9a120/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala index a1fa2c3..c635b2d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala @@ -103,4 +103,21 @@ class HashedRelationSuite extends SparkFunSuite { assert(hashed2.get(unsafeData(2)) === data2) assert(numDataRows.value.value === data.length) } + + test("test serialization empty hash map") { +val os = new ByteArrayOutputStream() +val out = new ObjectOutputStream(os) +val hashed = new UnsafeHashedRelation( + new java.util.HashMap[UnsafeRow, CompactBuffer[UnsafeRow]]) +hashed.writeExternal(out) +out.flush() +val in = new ObjectInputStream(new ByteArrayInputStream(os.toByteArray)) +val hashed2 = new UnsafeHashedRelation() +hashed2.readExternal(in) + +val schema = StructType(StructField("a", IntegerType, true) :: Nil) +val toUnsafe = UnsafeProjection.create(schema) +val row = toUnsafe(InternalRow(0)) +assert(hashed2.get(row) === null) + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
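The one-line change above matters most for the empty case: deserializing an empty relation gives nKeys == 0, so the old initial capacity nKeys * 2 asked BytesToBytesMap for a zero-sized map. The new expression always yields at least 1 while still oversizing relative to nKeys. A standalone sketch of just that arithmetic (plain Scala, mirroring the patched expression):

object InitialCapacityCheck {
  // The patched capacity expression from UnsafeHashedRelation's deserialization path.
  def initialCapacity(nKeys: Int): Int = (nKeys * 1.5 + 1).toInt

  def main(args: Array[String]): Unit = {
    println(initialCapacity(0)) // 1: the old nKeys * 2 gave 0 here and broke empty broadcasts
    println(initialCapacity(4)) // 7: still leaves headroom to reduce hash collisions
  }
}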
spark git commit: [SPARK-9831] [SQL] fix serialization with empty broadcast
Repository: spark Updated Branches: refs/heads/branch-1.5 890c75bc2 -> 7024f3eac [SPARK-9831] [SQL] fix serialization with empty broadcast Author: Davies Liu dav...@databricks.com Closes #8117 from davies/fix_serialization and squashes the following commits: d21ac71 [Davies Liu] fix serialization with empty broadcast (cherry picked from commit c3e9a120e33159fb45cd99f3a55fc5cf16cd7c6c) Signed-off-by: Davies Liu davies@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7024f3ea Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7024f3ea Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7024f3ea Branch: refs/heads/branch-1.5 Commit: 7024f3eac7b5133ff7a75171509a09ca0c367f5e Parents: 890c75b Author: Davies Liu dav...@databricks.com Authored: Tue Aug 11 22:45:18 2015 -0700 Committer: Davies Liu davies@gmail.com Committed: Tue Aug 11 22:45:41 2015 -0700 -- .../spark/sql/execution/joins/HashedRelation.scala | 2 +- .../sql/execution/joins/HashedRelationSuite.scala | 17 + 2 files changed, 18 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7024f3ea/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index c1bc794..076afe6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -299,7 +299,7 @@ private[joins] final class UnsafeHashedRelation( binaryMap = new BytesToBytesMap( taskMemoryManager, shuffleMemoryManager, - nKeys * 2, // reduce hash collision + (nKeys * 1.5 + 1).toInt, // reduce hash collision pageSizeBytes) var i = 0 http://git-wip-us.apache.org/repos/asf/spark/blob/7024f3ea/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala index a1fa2c3..c635b2d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala @@ -103,4 +103,21 @@ class HashedRelationSuite extends SparkFunSuite { assert(hashed2.get(unsafeData(2)) === data2) assert(numDataRows.value.value === data.length) } + + test("test serialization empty hash map") { +val os = new ByteArrayOutputStream() +val out = new ObjectOutputStream(os) +val hashed = new UnsafeHashedRelation( + new java.util.HashMap[UnsafeRow, CompactBuffer[UnsafeRow]]) +hashed.writeExternal(out) +out.flush() +val in = new ObjectInputStream(new ByteArrayInputStream(os.toByteArray)) +val hashed2 = new UnsafeHashedRelation() +hashed2.readExternal(in) + +val schema = StructType(StructField("a", IntegerType, true) :: Nil) +val toUnsafe = UnsafeProjection.create(schema) +val row = toUnsafe(InternalRow(0)) +assert(hashed2.get(row) === null) + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9854] [SQL] RuleExecutor.timeMap should be thread-safe
Repository: spark Updated Branches: refs/heads/master c3e9a120e -> b1581ac28 [SPARK-9854] [SQL] RuleExecutor.timeMap should be thread-safe `RuleExecutor.timeMap` is currently a non-thread-safe mutable HashMap; this can lead to infinite loops if multiple threads are concurrently modifying the map. I believe that this is responsible for some hangs that I've observed in HiveQuerySuite. This patch addresses this by using a Guava `AtomicLongMap`. Author: Josh Rosen joshro...@databricks.com Closes #8120 from JoshRosen/rule-executor-time-map-fix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b1581ac2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b1581ac2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b1581ac2 Branch: refs/heads/master Commit: b1581ac28840a4d2209ef8bb5c9f8700b4c1b286 Parents: c3e9a12 Author: Josh Rosen joshro...@databricks.com Authored: Tue Aug 11 22:46:59 2015 -0700 Committer: Josh Rosen joshro...@databricks.com Committed: Tue Aug 11 22:46:59 2015 -0700 -- .../spark/sql/catalyst/rules/RuleExecutor.scala | 15 +-- 1 file changed, 9 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b1581ac2/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala index 8b82451..f80d2a9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala @@ -17,22 +17,25 @@ package org.apache.spark.sql.catalyst.rules +import scala.collection.JavaConverters._ + +import com.google.common.util.concurrent.AtomicLongMap + import org.apache.spark.Logging import org.apache.spark.sql.catalyst.trees.TreeNode import org.apache.spark.sql.catalyst.util.sideBySide -import scala.collection.mutable - object RuleExecutor { - protected val timeMap = new mutable.HashMap[String, Long].withDefault(_ => 0) + protected val timeMap = AtomicLongMap.create[String]() /** Resets statistics about time spent running specific rules */ def resetTime(): Unit = timeMap.clear() /** Dump statistics about time spent running specific rules. */ def dumpTimeSpent(): String = { -val maxSize = timeMap.keys.map(_.toString.length).max -timeMap.toSeq.sortBy(_._2).reverseMap { case (k, v) => +val map = timeMap.asMap().asScala +val maxSize = map.keys.map(_.toString.length).max +map.toSeq.sortBy(_._2).reverseMap { case (k, v) => s"${k.padTo(maxSize, " ").mkString} $v" }.mkString("\n") } @@ -79,7 +82,7 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { val startTime = System.nanoTime() val result = rule(plan) val runTime = System.nanoTime() - startTime -RuleExecutor.timeMap(rule.ruleName) = RuleExecutor.timeMap(rule.ruleName) + runTime +RuleExecutor.timeMap.addAndGet(rule.ruleName, runTime) if (!result.fastEquals(plan)) { logTrace( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
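The reason the Guava map fixes the hang is that addAndGet is atomic per key, so concurrent rule executions can accumulate timings without external locking. A minimal standalone sketch of the pattern (assuming only Guava on the classpath, which Spark already ships):

import com.google.common.util.concurrent.AtomicLongMap

object TimingMapDemo {
  private val timeMap = AtomicLongMap.create[String]()

  def main(args: Array[String]): Unit = {
    // Eight threads updating the same key concurrently; no synchronized blocks needed.
    val threads = (1 to 8).map { _ =>
      new Thread(new Runnable {
        override def run(): Unit = { timeMap.addAndGet("SomeRule", 10L) }
      })
    }
    threads.foreach(_.start())
    threads.foreach(_.join())
    println(timeMap.get("SomeRule")) // 80
  }
}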
[2/2] spark git commit: Preparing development version 1.5.0-SNAPSHOT
Preparing development version 1.5.0-SNAPSHOT Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b7497e3a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b7497e3a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b7497e3a Branch: refs/heads/branch-1.5 Commit: b7497e3a27205cdc5a7069eaeba3fd03d9e55332 Parents: 158b2ea Author: Patrick Wendell pwend...@gmail.com Authored: Tue Aug 11 18:07:34 2015 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Tue Aug 11 18:07:34 2015 -0700 -- assembly/pom.xml| 2 +- bagel/pom.xml | 2 +- core/pom.xml| 2 +- examples/pom.xml| 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt-assembly/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml| 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl-assembly/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml| 2 +- pom.xml | 2 +- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml| 2 +- 33 files changed, 33 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b7497e3a/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 3ef7d6f..e9c6d26 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/b7497e3a/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index 684e07b..ed5c37e 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/b7497e3a/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index bb25652..0e53a79 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/b7497e3a/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index 9ef1eda..e6884b0 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/b7497e3a/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 6377c3e..1318959 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/b7497e3a/external/flume-sink/pom.xml -- diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index
spark git commit: [SPARK-9849] [SQL] DirectParquetOutputCommitter qualified name should be backward compatible
Repository: spark Updated Branches: refs/heads/master 5a5bbc299 -> afa757c98 [SPARK-9849] [SQL] DirectParquetOutputCommitter qualified name should be backward compatible DirectParquetOutputCommitter was moved in SPARK-9763. However, users can explicitly set the class as a config option, so we must be able to resolve the old committer qualified name. Author: Reynold Xin r...@databricks.com Closes #8114 from rxin/SPARK-9849. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/afa757c9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/afa757c9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/afa757c9 Branch: refs/heads/master Commit: afa757c98c537965007cad4c61c436887f3ac6a6 Parents: 5a5bbc2 Author: Reynold Xin r...@databricks.com Authored: Tue Aug 11 18:08:49 2015 -0700 Committer: Yin Huai yh...@databricks.com Committed: Tue Aug 11 18:08:49 2015 -0700 -- .../datasources/parquet/ParquetRelation.scala | 7 + .../datasources/parquet/ParquetIOSuite.scala| 27 +++- 2 files changed, 33 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/afa757c9/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala index 4086a13..c71c69b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala @@ -209,6 +209,13 @@ private[sql] class ParquetRelation( override def prepareJobForWrite(job: Job): OutputWriterFactory = { val conf = ContextUtil.getConfiguration(job) +// SPARK-9849 DirectParquetOutputCommitter qualified name should be backward compatible +val committerClassname = conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) +if (committerClassname == "org.apache.spark.sql.parquet.DirectParquetOutputCommitter") { + conf.set(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key, +classOf[DirectParquetOutputCommitter].getCanonicalName) +} + val committerClass = conf.getClass( SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key, http://git-wip-us.apache.org/repos/asf/spark/blob/afa757c9/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index ee925af..cb16634 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -390,7 +390,32 @@ class ParquetIOSuite extends QueryTest with ParquetTest { } } - test("SPARK-8121: spark.sql.parquet.output.committer.class shouldn't be overriden") { + test("SPARK-9849 DirectParquetOutputCommitter qualified name should be backward compatible") { +val clonedConf = new Configuration(configuration) + +// Write to a parquet file and let it fail. +// _temporary should be missing if direct output committer works.
+try { + configuration.set("spark.sql.parquet.output.committer.class", +"org.apache.spark.sql.parquet.DirectParquetOutputCommitter") + sqlContext.udf.register("div0", (x: Int) => x / 0) + withTempPath { dir => +intercept[org.apache.spark.SparkException] { + sqlContext.sql("select div0(1)").write.parquet(dir.getCanonicalPath) +} +val path = new Path(dir.getCanonicalPath, "_temporary") +val fs = path.getFileSystem(configuration) +assert(!fs.exists(path)) + } +} finally { + // Hadoop 1 doesn't have `Configuration.unset` + configuration.clear() + clonedConf.foreach(entry => configuration.set(entry.getKey, entry.getValue)) +} + } + + + test("SPARK-8121: spark.sql.parquet.output.committer.class shouldn't be overridden") { withTempPath { dir => val clonedConf = new Configuration(configuration) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
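The shim above amounts to a one-entry alias table: if a user's configuration still names the pre-SPARK-9763 location of the committer, the value is rewritten to the class's new home before it is instantiated. A hedged standalone sketch of the idea — the helper name is illustrative, not the patch's API, and the new package is assumed from ParquetRelation.scala's own location:

object CommitterNameCompat {
  // The single deprecated-to-current alias the patch handles.
  private val aliases = Map(
    "org.apache.spark.sql.parquet.DirectParquetOutputCommitter" ->
      "org.apache.spark.sql.execution.datasources.parquet.DirectParquetOutputCommitter")

  // Resolve a possibly-deprecated committer class name to its current one.
  def resolve(className: String): String = aliases.getOrElse(className, className)

  def main(args: Array[String]): Unit = {
    println(resolve("org.apache.spark.sql.parquet.DirectParquetOutputCommitter"))
    println(resolve("com.example.CustomCommitter")) // unknown names pass through unchanged
  }
}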
spark git commit: [SPARK-9849] [SQL] DirectParquetOutputCommitter qualified name should be backward compatible
Repository: spark Updated Branches: refs/heads/branch-1.5 b7497e3a2 -> ec7a4b9b0 [SPARK-9849] [SQL] DirectParquetOutputCommitter qualified name should be backward compatible DirectParquetOutputCommitter was moved in SPARK-9763. However, users can explicitly set the class as a config option, so we must be able to resolve the old committer qualified name. Author: Reynold Xin r...@databricks.com Closes #8114 from rxin/SPARK-9849. (cherry picked from commit afa757c98c537965007cad4c61c436887f3ac6a6) Signed-off-by: Yin Huai yh...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ec7a4b9b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ec7a4b9b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ec7a4b9b Branch: refs/heads/branch-1.5 Commit: ec7a4b9b0b1183965e086f724877d69bccbdbcbe Parents: b7497e3 Author: Reynold Xin r...@databricks.com Authored: Tue Aug 11 18:08:49 2015 -0700 Committer: Yin Huai yh...@databricks.com Committed: Tue Aug 11 18:09:05 2015 -0700 -- .../datasources/parquet/ParquetRelation.scala | 7 + .../datasources/parquet/ParquetIOSuite.scala| 27 +++- 2 files changed, 33 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ec7a4b9b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala index 4086a13..c71c69b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala @@ -209,6 +209,13 @@ private[sql] class ParquetRelation( override def prepareJobForWrite(job: Job): OutputWriterFactory = { val conf = ContextUtil.getConfiguration(job) +// SPARK-9849 DirectParquetOutputCommitter qualified name should be backward compatible +val committerClassname = conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) +if (committerClassname == "org.apache.spark.sql.parquet.DirectParquetOutputCommitter") { + conf.set(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key, +classOf[DirectParquetOutputCommitter].getCanonicalName) +} + val committerClass = conf.getClass( SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key, http://git-wip-us.apache.org/repos/asf/spark/blob/ec7a4b9b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index ee925af..cb16634 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -390,7 +390,32 @@ class ParquetIOSuite extends QueryTest with ParquetTest { } } - test("SPARK-8121: spark.sql.parquet.output.committer.class shouldn't be overriden") { + test("SPARK-9849 DirectParquetOutputCommitter qualified name should be backward compatible") { +val clonedConf = new Configuration(configuration) + +// Write to a parquet file and let it fail. +// _temporary should be missing if direct output committer works.
+try { + configuration.set("spark.sql.parquet.output.committer.class", +"org.apache.spark.sql.parquet.DirectParquetOutputCommitter") + sqlContext.udf.register("div0", (x: Int) => x / 0) + withTempPath { dir => +intercept[org.apache.spark.SparkException] { + sqlContext.sql("select div0(1)").write.parquet(dir.getCanonicalPath) +} +val path = new Path(dir.getCanonicalPath, "_temporary") +val fs = path.getFileSystem(configuration) +assert(!fs.exists(path)) + } +} finally { + // Hadoop 1 doesn't have `Configuration.unset` + configuration.clear() + clonedConf.foreach(entry => configuration.set(entry.getKey, entry.getValue)) +} + } + + + test("SPARK-8121: spark.sql.parquet.output.committer.class shouldn't be overridden") { withTempPath { dir => val clonedConf = new Configuration(configuration) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9713] [ML] Document SparkR MLlib glm() integration in Spark 1.5
Repository: spark Updated Branches: refs/heads/branch-1.5 6ea33f5bf -> 890c75bc2 [SPARK-9713] [ML] Document SparkR MLlib glm() integration in Spark 1.5 This documents the use of R model formulae in the SparkR guide. Also fixes some bugs in the R api doc. mengxr Author: Eric Liang e...@databricks.com Closes #8085 from ericl/docs. (cherry picked from commit 74a293f4537c6982345166f8883538f81d850872) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/890c75bc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/890c75bc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/890c75bc Branch: refs/heads/branch-1.5 Commit: 890c75bc2c2e1405c00485a98c034342122b639f Parents: 6ea33f5 Author: Eric Liang e...@databricks.com Authored: Tue Aug 11 21:26:03 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 11 21:26:12 2015 -0700 -- R/pkg/R/generics.R | 4 ++-- R/pkg/R/mllib.R| 8 docs/sparkr.md | 37 - 3 files changed, 42 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/890c75bc/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index c43b947..379a78b 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -535,8 +535,8 @@ setGeneric("showDF", function(x,...) { standardGeneric("showDF") }) #' @export setGeneric("summarize", function(x,...) { standardGeneric("summarize") }) -##' rdname summary -##' @export +#' @rdname summary +#' @export setGeneric("summary", function(x, ...) { standardGeneric("summary") }) # @rdname tojson http://git-wip-us.apache.org/repos/asf/spark/blob/890c75bc/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index b524d1f..cea3d76 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -56,10 +56,10 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFram #' #' Makes predictions from a model produced by glm(), similarly to R's predict(). #' -#' @param model A fitted MLlib model +#' @param object A fitted MLlib model #' @param newData DataFrame for testing #' @return DataFrame containing predicted values -#' @rdname glm +#' @rdname predict #' @export #' @examples #'\dontrun{ @@ -76,10 +76,10 @@ setMethod("predict", signature(object = "PipelineModel"), #' #' Returns the summary of a model produced by glm(), similarly to R's summary(). #' -#' @param model A fitted MLlib model +#' @param x A fitted MLlib model #' @return a list with a 'coefficient' component, which is the matrix of coefficients. See #' summary.glm for more information. -#' @rdname glm +#' @rdname summary #' @export #' @examples #'\dontrun{ http://git-wip-us.apache.org/repos/asf/spark/blob/890c75bc/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index 4385a4e..7139d16 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -11,7 +11,8 @@ title: SparkR (R on Spark) SparkR is an R package that provides a light-weight frontend to use Apache Spark from R. In Spark {{site.SPARK_VERSION}}, SparkR provides a distributed data frame implementation that supports operations like selection, filtering, aggregation etc. (similar to R data frames, -[dplyr](https://github.com/hadley/dplyr)) but on large datasets. +[dplyr](https://github.com/hadley/dplyr)) but on large datasets. SparkR also supports distributed +machine learning using MLlib.
# SparkR DataFrames @@ -230,3 +231,37 @@ head(teenagers) {% endhighlight %} </div> + +# Machine Learning + +SparkR allows the fitting of generalized linear models over DataFrames using the [glm()](api/R/glm.html) function. Under the hood, SparkR uses MLlib to train a model of the specified family. Currently the gaussian and binomial families are supported. We support a subset of the available R formula operators for model fitting, including '~', '.', '+', and '-'. The example below shows how to build a gaussian GLM model using SparkR. + +<div data-lang="r" markdown="1"> +{% highlight r %} +# Create the DataFrame +df <- createDataFrame(sqlContext, iris) + +# Fit a linear model over the dataset. +model <- glm(Sepal_Length ~ Sepal_Width + Species, data = df, family = "gaussian") + +# Model coefficients are returned in a similar format to R's native glm(). +summary(model) +##$coefficients +##Estimate +##(Intercept)2.2513930 +##Sepal_Width0.8035609 +##Species_versicolor 1.4587432 +##Species_virginica 1.9468169 + +# Make predictions based on the model. +predictions <- predict(model, newData = df)
spark git commit: [SPARK-9713] [ML] Document SparkR MLlib glm() integration in Spark 1.5
Repository: spark Updated Branches: refs/heads/master 3ef0f3292 -> 74a293f45 [SPARK-9713] [ML] Document SparkR MLlib glm() integration in Spark 1.5 This documents the use of R model formulae in the SparkR guide. Also fixes some bugs in the R api doc. mengxr Author: Eric Liang e...@databricks.com Closes #8085 from ericl/docs. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/74a293f4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/74a293f4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/74a293f4 Branch: refs/heads/master Commit: 74a293f4537c6982345166f8883538f81d850872 Parents: 3ef0f32 Author: Eric Liang e...@databricks.com Authored: Tue Aug 11 21:26:03 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 11 21:26:03 2015 -0700 -- R/pkg/R/generics.R | 4 ++-- R/pkg/R/mllib.R| 8 docs/sparkr.md | 37 - 3 files changed, 42 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/74a293f4/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index c43b947..379a78b 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -535,8 +535,8 @@ setGeneric("showDF", function(x,...) { standardGeneric("showDF") }) #' @export setGeneric("summarize", function(x,...) { standardGeneric("summarize") }) -##' rdname summary -##' @export +#' @rdname summary +#' @export setGeneric("summary", function(x, ...) { standardGeneric("summary") }) # @rdname tojson http://git-wip-us.apache.org/repos/asf/spark/blob/74a293f4/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index b524d1f..cea3d76 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -56,10 +56,10 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFram #' #' Makes predictions from a model produced by glm(), similarly to R's predict(). #' -#' @param model A fitted MLlib model +#' @param object A fitted MLlib model #' @param newData DataFrame for testing #' @return DataFrame containing predicted values -#' @rdname glm +#' @rdname predict #' @export #' @examples #'\dontrun{ @@ -76,10 +76,10 @@ setMethod("predict", signature(object = "PipelineModel"), #' #' Returns the summary of a model produced by glm(), similarly to R's summary(). #' -#' @param model A fitted MLlib model +#' @param x A fitted MLlib model #' @return a list with a 'coefficient' component, which is the matrix of coefficients. See #' summary.glm for more information. -#' @rdname glm +#' @rdname summary #' @export #' @examples #'\dontrun{ http://git-wip-us.apache.org/repos/asf/spark/blob/74a293f4/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index 4385a4e..7139d16 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -11,7 +11,8 @@ title: SparkR (R on Spark) SparkR is an R package that provides a light-weight frontend to use Apache Spark from R. In Spark {{site.SPARK_VERSION}}, SparkR provides a distributed data frame implementation that supports operations like selection, filtering, aggregation etc. (similar to R data frames, -[dplyr](https://github.com/hadley/dplyr)) but on large datasets. +[dplyr](https://github.com/hadley/dplyr)) but on large datasets. SparkR also supports distributed +machine learning using MLlib. # SparkR DataFrames @@ -230,3 +231,37 @@ head(teenagers) {% endhighlight %} </div> + +# Machine Learning + +SparkR allows the fitting of generalized linear models over DataFrames using the [glm()](api/R/glm.html) function. Under the hood, SparkR uses MLlib to train a model of the specified family.
Currently the gaussian and binomial families are supported. We support a subset of the available R formula operators for model fitting, including '~', '.', '+', and '-'. The example below shows how to build a gaussian GLM model using SparkR. + +<div data-lang="r" markdown="1"> +{% highlight r %} +# Create the DataFrame +df <- createDataFrame(sqlContext, iris) + +# Fit a linear model over the dataset. +model <- glm(Sepal_Length ~ Sepal_Width + Species, data = df, family = "gaussian") + +# Model coefficients are returned in a similar format to R's native glm(). +summary(model) +##$coefficients +##Estimate +##(Intercept)2.2513930 +##Sepal_Width0.8035609 +##Species_versicolor 1.4587432 +##Species_virginica 1.9468169 + +# Make predictions based on the model. +predictions <- predict(model, newData = df) +head(select(predictions, "Sepal_Length", "prediction")) +## Sepal_Length prediction +##1 5.1 5.063856 +##2
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.5.0-snapshot-20150811 [deleted] e9329ef6a - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-1517] Refactor release scripts to facilitate nightly publishing
Repository: spark Updated Branches: refs/heads/branch-1.5 0119edf52 -> 6ea33f5bf [SPARK-1517] Refactor release scripts to facilitate nightly publishing This update contains some code changes to the release scripts that allow easier nightly publishing. I've been using these new scripts on Jenkins for cutting and publishing nightly snapshots for the last month or so, and it has been going well. I'd like to get them merged back upstream so this can be maintained by the community. The main changes are: 1. Separates the release tagging from various build possibilities for an already tagged release (`release-tag.sh` and `release-build.sh`). 2. Allow for injecting credentials through the environment, including GPG keys. This is then paired with secure key injection in Jenkins. 3. Support for copying build results to a remote directory, and also rotating results, e.g. the ability to keep the last N copies of binary or doc builds. I'm happy if anyone wants to take a look at this - it's not user facing but an internal utility used for generating releases. Author: Patrick Wendell patr...@databricks.com Closes #7411 from pwendell/release-script-updates and squashes the following commits: 74f9beb [Patrick Wendell] Moving maven build command to a variable 233ce85 [Patrick Wendell] [SPARK-1517] Refactor release scripts to facilitate nightly publishing (cherry picked from commit 3ef0f32928fc383ad3edd5ad167212aeb9eba6e1) Signed-off-by: Patrick Wendell patr...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6ea33f5b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6ea33f5b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6ea33f5b Branch: refs/heads/branch-1.5 Commit: 6ea33f5bf1bf5577d99951b77e473c2b1479ac5c Parents: 0119edf Author: Patrick Wendell patr...@databricks.com Authored: Tue Aug 11 21:16:48 2015 -0700 Committer: Patrick Wendell patr...@databricks.com Committed: Tue Aug 11 21:16:59 2015 -0700 -- dev/create-release/create-release.sh | 267 - dev/create-release/release-build.sh | 321 ++ dev/create-release/release-tag.sh| 79 3 files changed, 400 insertions(+), 267 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6ea33f5b/dev/create-release/create-release.sh -- diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh deleted file mode 100755 index 4311c8c..000 --- a/dev/create-release/create-release.sh +++ /dev/null @@ -1,267 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -#http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Quick-and-dirty automation of making maven and binary releases. Not robust at all. -# Publishes releases to Maven and packages/copies binary release artifacts. -# Expects to be run in a totally empty directory.
-# -# Options: -# --skip-create-release Assume the desired release tag already exists -# --skip-publish Do not publish to Maven central -# --skip-package Do not package and upload binary artifacts -# Would be nice to add: -# - Send output to stderr and have useful logging in stdout - -# Note: The following variables must be set before use! -ASF_USERNAME=${ASF_USERNAME:-pwendell} -ASF_PASSWORD=${ASF_PASSWORD:-XXX} -GPG_PASSPHRASE=${GPG_PASSPHRASE:-XXX} -GIT_BRANCH=${GIT_BRANCH:-branch-1.0} -RELEASE_VERSION=${RELEASE_VERSION:-1.2.0} -# Allows publishing under a different version identifier than -# was present in the actual release sources (e.g. rc-X) -PUBLISH_VERSION=${PUBLISH_VERSION:-$RELEASE_VERSION} -NEXT_VERSION=${NEXT_VERSION:-1.2.1} -RC_NAME=${RC_NAME:-rc2} - -M2_REPO=~/.m2/repository -SPARK_REPO=$M2_REPO/org/apache/spark -NEXUS_ROOT=https://repository.apache.org/service/local/staging -NEXUS_PROFILE=d63f592e7eac0 # Profile for Spark staging uploads - -if [ -z "$JAVA_HOME" ]; then - echo "Error: JAVA_HOME is not set, cannot proceed." - exit -1 -fi -JAVA_7_HOME=${JAVA_7_HOME:-$JAVA_HOME} -