spark git commit: [SPARK-9814] [SQL] EqualNotNull not passing to data sources
Repository: spark Updated Branches: refs/heads/branch-1.5 e9d1eab92 -> eead87ef2 [SPARK-9814] [SQL] EqualNotNull not passing to data sources Author: hyukjinkwon gurwls...@gmail.com Author: 권혁진 gurwls...@gmail.com Closes #8096 from HyukjinKwon/master. (cherry picked from commit 00c02728a6c6c4282c389ca90641dd78dd5e3d32) Signed-off-by: Reynold Xin r...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eead87ef Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eead87ef Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eead87ef Branch: refs/heads/branch-1.5 Commit: eead87ef2459a8c1d5257ea0b22526a76ddf1f69 Parents: e9d1eab Author: hyukjinkwon gurwls...@gmail.com Authored: Tue Aug 11 14:04:09 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Tue Aug 11 14:04:23 2015 -0700 -- .../sql/execution/datasources/DataSourceStrategy.scala | 5 + .../main/scala/org/apache/spark/sql/sources/filters.scala | 9 + .../org/apache/spark/sql/sources/FilteredScanSuite.scala| 1 + 3 files changed, 15 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/eead87ef/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 78a4acd..2a4c40d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -349,6 +349,11 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { case expressions.EqualTo(Literal(v, _), a: Attribute) => Some(sources.EqualTo(a.name, v)) + case expressions.EqualNullSafe(a: Attribute, Literal(v, _)) => +Some(sources.EqualNullSafe(a.name, v)) + case expressions.EqualNullSafe(Literal(v, _), a: Attribute) => +Some(sources.EqualNullSafe(a.name, v)) + case expressions.GreaterThan(a: Attribute, Literal(v, _)) => Some(sources.GreaterThan(a.name, v)) case expressions.GreaterThan(Literal(v, _), a: Attribute) => http://git-wip-us.apache.org/repos/asf/spark/blob/eead87ef/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala index 4d942e4..3780cbb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala @@ -37,6 +37,15 @@ abstract class Filter case class EqualTo(attribute: String, value: Any) extends Filter /** + * Performs equality comparison, similar to [[EqualTo]]. However, this differs from [[EqualTo]] + * in that it returns `true` (rather than NULL) if both inputs are NULL, and `false` + * (rather than NULL) if one of the inputs is NULL and the other is not NULL. + * + * @since 1.5.0 + */ +case class EqualNullSafe(attribute: String, value: Any) extends Filter + /** * A filter that evaluates to `true` iff the attribute evaluates to a value * greater than `value`.
* http://git-wip-us.apache.org/repos/asf/spark/blob/eead87ef/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala index 81b3a0f..5ef3657 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala @@ -56,6 +56,7 @@ case class SimpleFilteredScan(from: Int, to: Int)(@transient val sqlContext: SQL // Predicate test on integer column def translateFilterOnA(filter: Filter): Int => Boolean = filter match { case EqualTo(a, v) => (a: Int) => a == v + case EqualNullSafe(a, v) => (a: Int) => a == v case LessThan(a, v: Int) => (a: Int) => a < v case LessThanOrEqual(a, v: Int) => (a: Int) => a <= v case GreaterThan(a, v: Int) => (a: Int) => a > v
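To make the new filter's semantics concrete, here is a minimal sketch (not part of the patch; the row representation and helper are hypothetical) of how a data source could evaluate `EqualNullSafe` next to `EqualTo`, matching SQL's null-safe `<=>` operator:

```scala
import org.apache.spark.sql.sources.{EqualNullSafe, EqualTo, Filter}

// Hypothetical row representation: column name -> value, with null for SQL NULL.
def evaluate(filter: Filter, row: Map[String, Any]): Boolean = filter match {
  case EqualTo(attr, value) =>
    // Plain equality: a NULL on either side never matches.
    val v = row.getOrElse(attr, null)
    v != null && value != null && v == value
  case EqualNullSafe(attr, value) =>
    // Null-safe equality: NULL <=> NULL is true, NULL <=> x is false.
    val v = row.getOrElse(attr, null)
    if (v == null || value == null) v == null && value == null else v == value
  case _ =>
    true // conservatively keep rows for filters this sketch does not handle
}
```

With the strategy change above, a predicate such as `df.filter($"a" <=> lit(1))` should now reach the source as `sources.EqualNullSafe("a", 1)` rather than being dropped from the pushdown.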
spark git commit: [SPARK-9824] [CORE] Fix the issue that InternalAccumulator leaks WeakReference
Repository: spark Updated Branches: refs/heads/master 00c02728a -> f16bc68df [SPARK-9824] [CORE] Fix the issue that InternalAccumulator leaks WeakReference `InternalAccumulator.create` doesn't call `registerAccumulatorForCleanup` to register itself with ContextCleaner, so `WeakReference`s for these accumulators in `Accumulators.originals` won't be removed. This PR added `registerAccumulatorForCleanup` for internal accumulators to avoid the memory leak. Author: zsxwing zsxw...@gmail.com Closes #8108 from zsxwing/internal-accumulators-leak. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f16bc68d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f16bc68d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f16bc68d Branch: refs/heads/master Commit: f16bc68dfb25c7b746ae031a57840ace9bafa87f Parents: 00c0272 Author: zsxwing zsxw...@gmail.com Authored: Tue Aug 11 14:06:23 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Tue Aug 11 14:06:23 2015 -0700 -- .../scala/org/apache/spark/Accumulators.scala | 22 .../org/apache/spark/scheduler/Stage.scala | 2 +- .../org/apache/spark/AccumulatorSuite.scala | 3 ++- 3 files changed, 16 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f16bc68d/core/src/main/scala/org/apache/spark/Accumulators.scala -- diff --git a/core/src/main/scala/org/apache/spark/Accumulators.scala b/core/src/main/scala/org/apache/spark/Accumulators.scala index 064246d..c39c866 100644 --- a/core/src/main/scala/org/apache/spark/Accumulators.scala +++ b/core/src/main/scala/org/apache/spark/Accumulators.scala @@ -382,14 +382,18 @@ private[spark] object InternalAccumulator { * add to the same set of accumulators. We do this to report the distribution of accumulator * values across all tasks within each stage. */ - def create(): Seq[Accumulator[Long]] = { -Seq( - // Execution memory refers to the memory used by internal data structures created - // during shuffles, aggregations and joins. The value of this accumulator should be - // approximately the sum of the peak sizes across all such data structures created - // in this task. For SQL jobs, this only tracks all unsafe operators and ExternalSort. - new Accumulator( -0L, AccumulatorParam.LongAccumulatorParam, Some(PEAK_EXECUTION_MEMORY), internal = true) -) ++ maybeTestAccumulator.toSeq + def create(sc: SparkContext): Seq[Accumulator[Long]] = { +val internalAccumulators = Seq( +// Execution memory refers to the memory used by internal data structures created +// during shuffles, aggregations and joins. The value of this accumulator should be +// approximately the sum of the peak sizes across all such data structures created +// in this task. For SQL jobs, this only tracks all unsafe operators and ExternalSort.
+new Accumulator( + 0L, AccumulatorParam.LongAccumulatorParam, Some(PEAK_EXECUTION_MEMORY), internal = true) + ) ++ maybeTestAccumulator.toSeq +internalAccumulators.foreach { accumulator => + sc.cleaner.foreach(_.registerAccumulatorForCleanup(accumulator)) +} +internalAccumulators } } http://git-wip-us.apache.org/repos/asf/spark/blob/f16bc68d/core/src/main/scala/org/apache/spark/scheduler/Stage.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala index de05ee2..1cf0685 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala @@ -81,7 +81,7 @@ private[spark] abstract class Stage( * accumulators here again will override partial values from the finished tasks. */ def resetInternalAccumulators(): Unit = { -_internalAccumulators = InternalAccumulator.create() +_internalAccumulators = InternalAccumulator.create(rdd.sparkContext) } /** http://git-wip-us.apache.org/repos/asf/spark/blob/f16bc68d/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala b/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala index 48f5495..0eb2293 100644 --- a/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala @@ -160,7 +160,8 @@ class AccumulatorSuite extends SparkFunSuite with Matchers with LocalSparkContex } test("internal accumulators in TaskContext") { -
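The leak pattern being fixed is worth spelling out: a global map of `WeakReference`s only shrinks if something triggers removal once a referent dies. A simplified sketch of the idea (hypothetical names, not Spark's actual internals):

```scala
import java.lang.ref.WeakReference
import scala.collection.mutable

// Global registry in the spirit of Accumulators.originals: the map holds
// WeakReferences so it does not itself keep accumulators alive.
object Registry {
  val originals = mutable.Map[Long, WeakReference[AnyRef]]()
}

// Stand-in for ContextCleaner: tracks objects and removes their registry
// entries after they are garbage collected.
class Cleaner {
  def registerForCleanup(id: Long, obj: AnyRef): Unit = {
    // In Spark, this enqueues a reference; a cleaning thread later calls
    // Accumulators.remove(id) once the object becomes unreachable.
  }
}

def create(id: Long, cleaner: Option[Cleaner]): AnyRef = {
  val accum = new Object
  Registry.originals(id) = new WeakReference(accum)
  // The fix, in spirit: without this call, the WeakReference entry is never
  // pruned, so the map grows by one dead slot per internal accumulator.
  cleaner.foreach(_.registerForCleanup(id, accum))
  accum
}
```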
spark git commit: Closes #1290 Closes #4934
Repository: spark Updated Branches: refs/heads/master f16bc68df -> 423cdfd83 Closes #1290 Closes #4934 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/423cdfd8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/423cdfd8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/423cdfd8 Branch: refs/heads/master Commit: 423cdfd83d7fd02a4f8cf3e714db913fd3f9ca09 Parents: f16bc68 Author: Xiangrui Meng m...@databricks.com Authored: Tue Aug 11 14:08:09 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 11 14:08:09 2015 -0700
spark git commit: [SPARK-9788] [MLLIB] Fix LDA Binary Compatibility
Repository: spark Updated Branches: refs/heads/branch-1.5 cdf781db6 -> 2273e7432 [SPARK-9788] [MLLIB] Fix LDA Binary Compatibility 1. Add "asymmetricDocConcentration" and revert docConcentration changes. If the (internal) doc concentration vector is a single value, "getDocConcentration" returns it. If it is a constant vector, getDocConcentration returns the first item, and fails otherwise. 2. Give `LDAModel.gammaShape` a default value in `LDAModel` concrete class constructors. jkbradley Author: Feynman Liang fli...@databricks.com Closes #8077 from feynmanliang/SPARK-9788 and squashes the following commits: 6b07bc8 [Feynman Liang] Code review changes 9d6a71e [Feynman Liang] Add asymmetricAlpha alias bf4e685 [Feynman Liang] Asymmetric docConcentration 4cab972 [Feynman Liang] Default gammaShape (cherry picked from commit be3e27164133025db860781bd5cdd3ca233edd21) Signed-off-by: Joseph K. Bradley jos...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2273e743 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2273e743 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2273e743 Branch: refs/heads/branch-1.5 Commit: 2273e7432ec218ba163a94f86307ad11904a1dee Parents: cdf781d Author: Feynman Liang fli...@databricks.com Authored: Tue Aug 11 14:21:53 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Tue Aug 11 14:22:02 2015 -0700 -- .../org/apache/spark/mllib/clustering/LDA.scala | 27 --- .../spark/mllib/clustering/LDAModel.scala | 11 .../spark/mllib/clustering/LDAOptimizer.scala | 28 ++-- .../spark/mllib/clustering/LDASuite.scala | 4 +-- 4 files changed, 46 insertions(+), 24 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2273e743/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala index ab124e6..0fc9b1a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala @@ -79,7 +79,24 @@ class LDA private ( * * This is the parameter to a Dirichlet distribution. */ - def getDocConcentration: Vector = this.docConcentration + def getAsymmetricDocConcentration: Vector = this.docConcentration + + /** + * Concentration parameter (commonly named alpha) for the prior placed on documents' + * distributions over topics (theta). + * + * This method assumes the Dirichlet distribution is symmetric and can be described by a single + * [[Double]] parameter. It should fail if docConcentration is asymmetric. + */ + def getDocConcentration: Double = { +val parameter = docConcentration(0) +if (docConcentration.size == 1) { + parameter +} else { + require(docConcentration.toArray.forall(_ == parameter)) + parameter +} + } /** * Concentration parameter (commonly named alpha) for the prior placed on documents' @@ -106,18 +123,22 @@ class LDA private ( * [[https://github.com/Blei-Lab/onlineldavb]]. */ def setDocConcentration(docConcentration: Vector): this.type = { +require(docConcentration.size > 0, "docConcentration must have > 0 elements") this.docConcentration = docConcentration this } - /** Replicates Double to create a symmetric prior */ + /** Replicates a [[Double]] docConcentration to create a symmetric prior.
*/ def setDocConcentration(docConcentration: Double): this.type = { this.docConcentration = Vectors.dense(docConcentration) this } + /** Alias for [[getAsymmetricDocConcentration]] */ + def getAsymmetricAlpha: Vector = getAsymmetricDocConcentration + /** Alias for [[getDocConcentration]] */ - def getAlpha: Vector = getDocConcentration + def getAlpha: Double = getDocConcentration /** Alias for [[setDocConcentration()]] */ def setAlpha(alpha: Vector): this.type = setDocConcentration(alpha) http://git-wip-us.apache.org/repos/asf/spark/blob/2273e743/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 33babda..5dc637e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -27,7 +27,6 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import
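In API terms, the change splits the getter by prior shape; a short usage sketch based only on the methods visible in the diff above:

```scala
import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.Vectors

val lda = new LDA().setK(3)

// Symmetric prior: a single Double is replicated across topics, and the
// restored Double-returning getter reads it back.
lda.setDocConcentration(0.5)
val alpha: Double = lda.getDocConcentration // 0.5

// Asymmetric prior: pass a full Vector and read it back via the new getter.
lda.setDocConcentration(Vectors.dense(0.2, 0.3, 0.5))
val alphaVec = lda.getAsymmetricDocConcentration
// getDocConcentration would now fail its require() because the values differ.
```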
spark git commit: [SPARK-9788] [MLLIB] Fix LDA Binary Compatibility
Repository: spark Updated Branches: refs/heads/master 423cdfd83 -> be3e27164 [SPARK-9788] [MLLIB] Fix LDA Binary Compatibility 1. Add "asymmetricDocConcentration" and revert docConcentration changes. If the (internal) doc concentration vector is a single value, "getDocConcentration" returns it. If it is a constant vector, getDocConcentration returns the first item, and fails otherwise. 2. Give `LDAModel.gammaShape` a default value in `LDAModel` concrete class constructors. jkbradley Author: Feynman Liang fli...@databricks.com Closes #8077 from feynmanliang/SPARK-9788 and squashes the following commits: 6b07bc8 [Feynman Liang] Code review changes 9d6a71e [Feynman Liang] Add asymmetricAlpha alias bf4e685 [Feynman Liang] Asymmetric docConcentration 4cab972 [Feynman Liang] Default gammaShape Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/be3e2716 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/be3e2716 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/be3e2716 Branch: refs/heads/master Commit: be3e27164133025db860781bd5cdd3ca233edd21 Parents: 423cdfd Author: Feynman Liang fli...@databricks.com Authored: Tue Aug 11 14:21:53 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Tue Aug 11 14:21:53 2015 -0700 -- .../org/apache/spark/mllib/clustering/LDA.scala | 27 --- .../spark/mllib/clustering/LDAModel.scala | 11 .../spark/mllib/clustering/LDAOptimizer.scala | 28 ++-- .../spark/mllib/clustering/LDASuite.scala | 4 +-- 4 files changed, 46 insertions(+), 24 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/be3e2716/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala index ab124e6..0fc9b1a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala @@ -79,7 +79,24 @@ class LDA private ( * * This is the parameter to a Dirichlet distribution. */ - def getDocConcentration: Vector = this.docConcentration + def getAsymmetricDocConcentration: Vector = this.docConcentration + + /** + * Concentration parameter (commonly named alpha) for the prior placed on documents' + * distributions over topics (theta). + * + * This method assumes the Dirichlet distribution is symmetric and can be described by a single + * [[Double]] parameter. It should fail if docConcentration is asymmetric. + */ + def getDocConcentration: Double = { +val parameter = docConcentration(0) +if (docConcentration.size == 1) { + parameter +} else { + require(docConcentration.toArray.forall(_ == parameter)) + parameter +} + } /** * Concentration parameter (commonly named alpha) for the prior placed on documents' @@ -106,18 +123,22 @@ class LDA private ( * [[https://github.com/Blei-Lab/onlineldavb]]. */ def setDocConcentration(docConcentration: Vector): this.type = { +require(docConcentration.size > 0, "docConcentration must have > 0 elements") this.docConcentration = docConcentration this } - /** Replicates Double to create a symmetric prior */ + /** Replicates a [[Double]] docConcentration to create a symmetric prior.
*/ def setDocConcentration(docConcentration: Double): this.type = { this.docConcentration = Vectors.dense(docConcentration) this } + /** Alias for [[getAsymmetricDocConcentration]] */ + def getAsymmetricAlpha: Vector = getAsymmetricDocConcentration + /** Alias for [[getDocConcentration]] */ - def getAlpha: Vector = getDocConcentration + def getAlpha: Double = getDocConcentration /** Alias for [[setDocConcentration()]] */ def setAlpha(alpha: Vector): this.type = setDocConcentration(alpha) http://git-wip-us.apache.org/repos/asf/spark/blob/be3e2716/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 33babda..5dc637e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -27,7 +27,6 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaPairRDD -import org.apache.spark.broadcast.Broadcast import
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.5.0-snapshot-20150811 [created] e9329ef6a
[1/2] spark git commit: Preparing Spark release v1.5.0-snapshot-20150811
Repository: spark Updated Branches: refs/heads/branch-1.5 ef961ed48 -> 725e5c7a4 Preparing Spark release v1.5.0-snapshot-20150811 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e9329ef6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e9329ef6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e9329ef6 Branch: refs/heads/branch-1.5 Commit: e9329ef6a48d141446777c64f58467827ee5faaa Parents: ef961ed Author: Patrick Wendell pwend...@gmail.com Authored: Tue Aug 11 14:32:37 2015 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Tue Aug 11 14:32:37 2015 -0700 -- assembly/pom.xml| 2 +- bagel/pom.xml | 2 +- core/pom.xml| 2 +- examples/pom.xml| 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt-assembly/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml| 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl-assembly/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml| 2 +- pom.xml | 2 +- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml| 2 +- 33 files changed, 33 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e9329ef6/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index e9c6d26..3ef7d6f 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0-SNAPSHOT</version> +<version>1.5.0</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/e9329ef6/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index ed5c37e..684e07b 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0-SNAPSHOT</version> +<version>1.5.0</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/e9329ef6/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index 0e53a79..bb25652 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0-SNAPSHOT</version> +<version>1.5.0</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/e9329ef6/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index e6884b0..9ef1eda 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0-SNAPSHOT</version> +<version>1.5.0</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/e9329ef6/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 1318959..6377c3e 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.10</artifactId> -<version>1.5.0-SNAPSHOT</version> +<version>1.5.0</version> <relativePath>../../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/e9329ef6/external/flume-sink/pom.xml
[2/2] spark git commit: Preparing development version 1.5.0-SNAPSHOT
Preparing development version 1.5.0-SNAPSHOT Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/725e5c7a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/725e5c7a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/725e5c7a Branch: refs/heads/branch-1.5 Commit: 725e5c7a44f08deda23d2a15617557a354f21dc9 Parents: e9329ef Author: Patrick Wendell pwend...@gmail.com Authored: Tue Aug 11 14:32:43 2015 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Tue Aug 11 14:32:43 2015 -0700 -- assembly/pom.xml| 2 +- bagel/pom.xml | 2 +- core/pom.xml| 2 +- examples/pom.xml| 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt-assembly/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml| 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl-assembly/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml| 2 +- pom.xml | 2 +- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml| 2 +- 33 files changed, 33 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/725e5c7a/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 3ef7d6f..e9c6d26 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/725e5c7a/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index 684e07b..ed5c37e 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/725e5c7a/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index bb25652..0e53a79 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/725e5c7a/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index 9ef1eda..e6884b0 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/725e5c7a/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 6377c3e..1318959 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/725e5c7a/external/flume-sink/pom.xml -- diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index
spark git commit: [SPARK-7726] Add import so Scaladoc doesn't fail.
Repository: spark Updated Branches: refs/heads/master 520ad44b1 -> 2a3be4ddf [SPARK-7726] Add import so Scaladoc doesn't fail. This is another import needed so Scala 2.11 doc generation doesn't fail. See SPARK-7726 for more detail. I tested this locally and the 2.11 install goes from failing to succeeding with this patch. Author: Patrick Wendell patr...@databricks.com Closes #8095 from pwendell/scaladoc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2a3be4dd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2a3be4dd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2a3be4dd Branch: refs/heads/master Commit: 2a3be4ddf9d9527353f07ea0ab204ce17dbcba9a Parents: 520ad44 Author: Patrick Wendell patr...@databricks.com Authored: Tue Aug 11 14:02:23 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Tue Aug 11 14:02:23 2015 -0700 -- .../spark/network/shuffle/protocol/mesos/RegisterDriver.java | 3 +++ 1 file changed, 3 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2a3be4dd/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java -- diff --git a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java index 1c28fc1..94a61d6 100644 --- a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java +++ b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java @@ -23,6 +23,9 @@ import io.netty.buffer.ByteBuf; import org.apache.spark.network.protocol.Encoders; import org.apache.spark.network.shuffle.protocol.BlockTransferMessage; +// Needed by ScalaDoc. See SPARK-7726 +import static org.apache.spark.network.shuffle.protocol.BlockTransferMessage.Type; + /** * A message sent from the driver to register with the MesosExternalShuffleService. */
spark git commit: [SPARK-7726] Add import so Scaladoc doesn't fail.
Repository: spark Updated Branches: refs/heads/branch-1.5 811d23f1c -> e9d1eab92 [SPARK-7726] Add import so Scaladoc doesn't fail. This is another import needed so Scala 2.11 doc generation doesn't fail. See SPARK-7726 for more detail. I tested this locally and the 2.11 install goes from failing to succeeding with this patch. Author: Patrick Wendell patr...@databricks.com Closes #8095 from pwendell/scaladoc. (cherry picked from commit 2a3be4ddf9d9527353f07ea0ab204ce17dbcba9a) Signed-off-by: Reynold Xin r...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e9d1eab9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e9d1eab9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e9d1eab9 Branch: refs/heads/branch-1.5 Commit: e9d1eab925df5510085928eb34a43b4a15eb01a2 Parents: 811d23f Author: Patrick Wendell patr...@databricks.com Authored: Tue Aug 11 14:02:23 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Tue Aug 11 14:02:46 2015 -0700 -- .../spark/network/shuffle/protocol/mesos/RegisterDriver.java | 3 +++ 1 file changed, 3 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e9d1eab9/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java -- diff --git a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java index 1c28fc1..94a61d6 100644 --- a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java +++ b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/mesos/RegisterDriver.java @@ -23,6 +23,9 @@ import io.netty.buffer.ByteBuf; import org.apache.spark.network.protocol.Encoders; import org.apache.spark.network.shuffle.protocol.BlockTransferMessage; +// Needed by ScalaDoc. See SPARK-7726 +import static org.apache.spark.network.shuffle.protocol.BlockTransferMessage.Type; + /** * A message sent from the driver to register with the MesosExternalShuffleService. */
spark git commit: [SPARK-9814] [SQL] EqualNotNull not passing to data sources
Repository: spark Updated Branches: refs/heads/master 2a3be4ddf -> 00c02728a [SPARK-9814] [SQL] EqualNotNull not passing to data sources Author: hyukjinkwon gurwls...@gmail.com Author: 권혁진 gurwls...@gmail.com Closes #8096 from HyukjinKwon/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/00c02728 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/00c02728 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/00c02728 Branch: refs/heads/master Commit: 00c02728a6c6c4282c389ca90641dd78dd5e3d32 Parents: 2a3be4d Author: hyukjinkwon gurwls...@gmail.com Authored: Tue Aug 11 14:04:09 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Tue Aug 11 14:04:09 2015 -0700 -- .../sql/execution/datasources/DataSourceStrategy.scala | 5 + .../main/scala/org/apache/spark/sql/sources/filters.scala | 9 + .../org/apache/spark/sql/sources/FilteredScanSuite.scala| 1 + 3 files changed, 15 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/00c02728/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 78a4acd..2a4c40d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -349,6 +349,11 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { case expressions.EqualTo(Literal(v, _), a: Attribute) => Some(sources.EqualTo(a.name, v)) + case expressions.EqualNullSafe(a: Attribute, Literal(v, _)) => +Some(sources.EqualNullSafe(a.name, v)) + case expressions.EqualNullSafe(Literal(v, _), a: Attribute) => +Some(sources.EqualNullSafe(a.name, v)) + case expressions.GreaterThan(a: Attribute, Literal(v, _)) => Some(sources.GreaterThan(a.name, v)) case expressions.GreaterThan(Literal(v, _), a: Attribute) => http://git-wip-us.apache.org/repos/asf/spark/blob/00c02728/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala index 4d942e4..3780cbb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala @@ -37,6 +37,15 @@ abstract class Filter case class EqualTo(attribute: String, value: Any) extends Filter /** + * Performs equality comparison, similar to [[EqualTo]]. However, this differs from [[EqualTo]] + * in that it returns `true` (rather than NULL) if both inputs are NULL, and `false` + * (rather than NULL) if one of the inputs is NULL and the other is not NULL. + * + * @since 1.5.0 + */ +case class EqualNullSafe(attribute: String, value: Any) extends Filter + /** * A filter that evaluates to `true` iff the attribute evaluates to a value * greater than `value`.
* http://git-wip-us.apache.org/repos/asf/spark/blob/00c02728/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala index 81b3a0f..5ef3657 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala @@ -56,6 +56,7 @@ case class SimpleFilteredScan(from: Int, to: Int)(@transient val sqlContext: SQL // Predicate test on integer column def translateFilterOnA(filter: Filter): Int => Boolean = filter match { case EqualTo(a, v) => (a: Int) => a == v + case EqualNullSafe(a, v) => (a: Int) => a == v case LessThan(a, v: Int) => (a: Int) => a < v case LessThanOrEqual(a, v: Int) => (a: Int) => a <= v case GreaterThan(a, v: Int) => (a: Int) => a > v
spark git commit: [SPARK-8925] [MLLIB] Add @since tags to mllib.util
Repository: spark Updated Branches: refs/heads/branch-1.5 2273e7432 -> ef961ed48 [SPARK-8925] [MLLIB] Add @since tags to mllib.util Went thru the history of changes the file MLUtils.scala and picked up the version that the change went in. Author: Sudhakar Thota sudhakarth...@yahoo.com Author: Sudhakar Thota sudhakarth...@sudhakars-mbp-2.usca.ibm.com Closes #7436 from sthota2014/SPARK-8925_thotas. (cherry picked from commit 017b5de07ef6cff249e984a2ab781c520249ac76) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ef961ed4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ef961ed4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ef961ed4 Branch: refs/heads/branch-1.5 Commit: ef961ed48a4f45447f0e0ad256b040c7ab2d78d9 Parents: 2273e74 Author: Sudhakar Thota sudhakarth...@yahoo.com Authored: Tue Aug 11 14:31:51 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 11 14:32:01 2015 -0700 -- .../org/apache/spark/mllib/util/MLUtils.scala | 22 +++- 1 file changed, 21 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ef961ed4/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 7c5cfa7..26eb84a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -64,6 +64,7 @@ object MLUtils { *feature dimensions. * @param minPartitions min number of partitions * @return labeled data stored as an RDD[LabeledPoint] + * @since 1.0.0 */ def loadLibSVMFile( sc: SparkContext, @@ -113,7 +114,10 @@ object MLUtils { } // Convenient methods for `loadLibSVMFile`. - + + /** + * @since 1.0.0 + */ @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0") def loadLibSVMFile( sc: SparkContext, @@ -126,6 +130,7 @@ object MLUtils { /** * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], with the default number of * partitions. + * @since 1.0.0 */ def loadLibSVMFile( sc: SparkContext, @@ -133,6 +138,9 @@ numFeatures: Int): RDD[LabeledPoint] = loadLibSVMFile(sc, path, numFeatures, sc.defaultMinPartitions) + /** + * @since 1.0.0 + */ @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0") def loadLibSVMFile( sc: SparkContext, @@ -141,6 +149,9 @@ numFeatures: Int): RDD[LabeledPoint] = loadLibSVMFile(sc, path, numFeatures) + /** + * @since 1.0.0 + */ @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0") def loadLibSVMFile( sc: SparkContext, @@ -151,6 +162,7 @@ /** * Loads binary labeled data in the LIBSVM format into an RDD[LabeledPoint], with number of * features determined automatically and the default number of partitions.
+ * @since 1.0.0 */ def loadLibSVMFile(sc: SparkContext, path: String): RDD[LabeledPoint] = loadLibSVMFile(sc, path, -1) @@ -181,12 +193,14 @@ object MLUtils { * @param path file or directory path in any Hadoop-supported file system URI * @param minPartitions min number of partitions * @return vectors stored as an RDD[Vector] + * @since 1.1.0 */ def loadVectors(sc: SparkContext, path: String, minPartitions: Int): RDD[Vector] = sc.textFile(path, minPartitions).map(Vectors.parse) /** * Loads vectors saved using `RDD[Vector].saveAsTextFile` with the default number of partitions. + * @since 1.1.0 */ def loadVectors(sc: SparkContext, path: String): RDD[Vector] = sc.textFile(path, sc.defaultMinPartitions).map(Vectors.parse) @@ -197,6 +211,7 @@ object MLUtils { * @param path file or directory path in any Hadoop-supported file system URI * @param minPartitions min number of partitions * @return labeled points stored as an RDD[LabeledPoint] + * @since 1.1.0 */ def loadLabeledPoints(sc: SparkContext, path: String, minPartitions: Int): RDD[LabeledPoint] = sc.textFile(path, minPartitions).map(LabeledPoint.parse) @@ -204,6 +219,7 @@ object MLUtils { /** * Loads labeled points saved using `RDD[LabeledPoint].saveAsTextFile` with the default number of * partitions. + * @since 1.1.0 */ def loadLabeledPoints(sc: SparkContext, dir: String): RDD[LabeledPoint] = loadLabeledPoints(sc, dir, sc.defaultMinPartitions) @@ -220,6 +236,7 @@
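For reference, a brief usage sketch of the loaders being annotated (the SparkContext setup and data paths here are hypothetical):

```scala
import org.apache.spark.SparkContext
import org.apache.spark.mllib.util.MLUtils

val sc = new SparkContext("local[2]", "mlutils-since-tags")

// @since 1.0.0: LIBSVM loader; the feature count is inferred when it is
// not supplied (the variant shown in the diff passes -1 internally).
val labeled = MLUtils.loadLibSVMFile(sc, "data/sample_libsvm_data.txt")

// @since 1.1.0: reload vectors written with RDD[Vector].saveAsTextFile.
val vectors = MLUtils.loadVectors(sc, "data/vectors")

// @since 1.1.0: reload points written with RDD[LabeledPoint].saveAsTextFile.
val points = MLUtils.loadLabeledPoints(sc, "data/labeled-points")
```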
spark git commit: [SPARK-8925] [MLLIB] Add @since tags to mllib.util
Repository: spark Updated Branches: refs/heads/master be3e27164 -> 017b5de07 [SPARK-8925] [MLLIB] Add @since tags to mllib.util Went thru the history of changes the file MLUtils.scala and picked up the version that the change went in. Author: Sudhakar Thota sudhakarth...@yahoo.com Author: Sudhakar Thota sudhakarth...@sudhakars-mbp-2.usca.ibm.com Closes #7436 from sthota2014/SPARK-8925_thotas. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/017b5de0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/017b5de0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/017b5de0 Branch: refs/heads/master Commit: 017b5de07ef6cff249e984a2ab781c520249ac76 Parents: be3e271 Author: Sudhakar Thota sudhakarth...@yahoo.com Authored: Tue Aug 11 14:31:51 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 11 14:31:51 2015 -0700 -- .../org/apache/spark/mllib/util/MLUtils.scala | 22 +++- 1 file changed, 21 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/017b5de0/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 7c5cfa7..26eb84a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -64,6 +64,7 @@ object MLUtils { *feature dimensions. * @param minPartitions min number of partitions * @return labeled data stored as an RDD[LabeledPoint] + * @since 1.0.0 */ def loadLibSVMFile( sc: SparkContext, @@ -113,7 +114,10 @@ object MLUtils { } // Convenient methods for `loadLibSVMFile`. - + + /** + * @since 1.0.0 + */ @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0") def loadLibSVMFile( sc: SparkContext, @@ -126,6 +130,7 @@ object MLUtils { /** * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], with the default number of * partitions. + * @since 1.0.0 */ def loadLibSVMFile( sc: SparkContext, @@ -133,6 +138,9 @@ numFeatures: Int): RDD[LabeledPoint] = loadLibSVMFile(sc, path, numFeatures, sc.defaultMinPartitions) + /** + * @since 1.0.0 + */ @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0") def loadLibSVMFile( sc: SparkContext, @@ -141,6 +149,9 @@ numFeatures: Int): RDD[LabeledPoint] = loadLibSVMFile(sc, path, numFeatures) + /** + * @since 1.0.0 + */ @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0") def loadLibSVMFile( sc: SparkContext, @@ -151,6 +162,7 @@ /** * Loads binary labeled data in the LIBSVM format into an RDD[LabeledPoint], with number of * features determined automatically and the default number of partitions. + * @since 1.0.0 */ def loadLibSVMFile(sc: SparkContext, path: String): RDD[LabeledPoint] = loadLibSVMFile(sc, path, -1) @@ -181,12 +193,14 @@ object MLUtils { * @param path file or directory path in any Hadoop-supported file system URI * @param minPartitions min number of partitions * @return vectors stored as an RDD[Vector] + * @since 1.1.0 */ def loadVectors(sc: SparkContext, path: String, minPartitions: Int): RDD[Vector] = sc.textFile(path, minPartitions).map(Vectors.parse) /** * Loads vectors saved using `RDD[Vector].saveAsTextFile` with the default number of partitions.
+ * @since 1.1.0 */ def loadVectors(sc: SparkContext, path: String): RDD[Vector] = sc.textFile(path, sc.defaultMinPartitions).map(Vectors.parse) @@ -197,6 +211,7 @@ object MLUtils { * @param path file or directory path in any Hadoop-supported file system URI * @param minPartitions min number of partitions * @return labeled points stored as an RDD[LabeledPoint] + * @since 1.1.0 */ def loadLabeledPoints(sc: SparkContext, path: String, minPartitions: Int): RDD[LabeledPoint] = sc.textFile(path, minPartitions).map(LabeledPoint.parse) @@ -204,6 +219,7 @@ object MLUtils { /** * Loads labeled points saved using `RDD[LabeledPoint].saveAsTextFile` with the default number of * partitions. + * @since 1.1.0 */ def loadLabeledPoints(sc: SparkContext, dir: String): RDD[LabeledPoint] = loadLabeledPoints(sc, dir, sc.defaultMinPartitions) @@ -220,6 +236,7 @@ object MLUtils { * * @deprecated Should use [[org.apache.spark.rdd.RDD#saveAsTextFile]] for saving and *
spark git commit: [SPARK-9649] Fix flaky test MasterSuite again - disable REST
Repository: spark Updated Branches: refs/heads/branch-1.5 ec7a4b9b0 -> 0119edf52 [SPARK-9649] Fix flaky test MasterSuite again - disable REST The REST server is not actually used in most tests and so we can disable it. It is a source of flakiness because it tries to bind to a specific port in vain. There was also some code that avoided the shuffle service in tests. This is actually not necessary because the shuffle service is already off by default. Author: Andrew Or and...@databricks.com Closes #8084 from andrewor14/fix-master-suite-again. (cherry picked from commit ca8f70e9d473d2c81866f3c330cc6545c33bdac7) Signed-off-by: Josh Rosen joshro...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0119edf5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0119edf5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0119edf5 Branch: refs/heads/branch-1.5 Commit: 0119edf52885c6c798cd00bf545f5b0b1f6910af Parents: ec7a4b9 Author: Andrew Or and...@databricks.com Authored: Tue Aug 11 20:46:58 2015 -0700 Committer: Josh Rosen joshro...@databricks.com Committed: Tue Aug 11 20:47:19 2015 -0700 -- pom.xml | 1 + project/SparkBuild.scala | 1 + 2 files changed, 2 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0119edf5/pom.xml -- diff --git a/pom.xml b/pom.xml index 8942836..cfd7d32 100644 --- a/pom.xml +++ b/pom.xml @@ -1895,6 +1895,7 @@ <java.io.tmpdir>${project.build.directory}/tmp</java.io.tmpdir> <spark.test.home>${spark.test.home}</spark.test.home> <spark.testing>1</spark.testing> + <spark.master.rest.enabled>false</spark.master.rest.enabled> <spark.ui.enabled>false</spark.ui.enabled> <spark.ui.showConsoleProgress>false</spark.ui.showConsoleProgress> <spark.driver.allowMultipleContexts>true</spark.driver.allowMultipleContexts> http://git-wip-us.apache.org/repos/asf/spark/blob/0119edf5/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index cad7067..74f815f 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -546,6 +546,7 @@ object TestSettings { javaOptions in Test += "-Dspark.test.home=" + sparkHome, javaOptions in Test += "-Dspark.testing=1", javaOptions in Test += "-Dspark.port.maxRetries=100", +javaOptions in Test += "-Dspark.master.rest.enabled=false", javaOptions in Test += "-Dspark.ui.enabled=false", javaOptions in Test += "-Dspark.ui.showConsoleProgress=false", javaOptions in Test += "-Dspark.driver.allowMultipleContexts=true",
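The same switch can also be applied per-context rather than build-wide; a small sketch using the property names from the diff (these are read by the standalone Master and the UI at startup):

```scala
import org.apache.spark.{SparkConf, SparkContext}

// Mirrors what the pom.xml / SparkBuild.scala system properties above do,
// scoped to one test: keep the REST server and UI from binding ports.
val conf = new SparkConf()
  .setMaster("local[2]")
  .setAppName("master-suite-style-test")
  .set("spark.master.rest.enabled", "false")
  .set("spark.ui.enabled", "false")
val sc = new SparkContext(conf)
```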
spark git commit: [SPARK-9640] [STREAMING] [TEST] Do not run Python Kinesis tests when the Kinesis assembly JAR has not been generated
Repository: spark Updated Branches: refs/heads/branch-1.5 f9beef998 -> c7f009040 [SPARK-9640] [STREAMING] [TEST] Do not run Python Kinesis tests when the Kinesis assembly JAR has not been generated Author: Tathagata Das tathagata.das1...@gmail.com Closes #7961 from tdas/SPARK-9640 and squashes the following commits: 974ce19 [Tathagata Das] Undo changes related to SPARK-9727 004ae26 [Tathagata Das] style fixes 9bbb97d [Tathagata Das] Minor style fies e6a677e [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9640 ca90719 [Tathagata Das] Removed extra line ba9cfc7 [Tathagata Das] Improved kinesis test selection logic 88d59bd [Tathagata Das] updated test modules 871fcc8 [Tathagata Das] Fixed SparkBuild 94be631 [Tathagata Das] Fixed style b858196 [Tathagata Das] Fixed conditions and few other things based on PR comments. e292e64 [Tathagata Das] Added filters for Kinesis python tests (cherry picked from commit 0f90d6055e5bea9ceb1d454db84f4aa1d59b284d) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c7f00904 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c7f00904 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c7f00904 Branch: refs/heads/branch-1.5 Commit: c7f0090409c2a94a43404271730beded421a0f2f Parents: f9beef9 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Mon Aug 10 23:41:53 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 10 23:42:44 2015 -0700 -- python/pyspark/streaming/tests.py | 56 ++ 1 file changed, 44 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c7f00904/python/pyspark/streaming/tests.py -- diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 66ae334..f0ed415 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -971,8 +971,10 @@ class KinesisStreamTests(PySparkStreamingTestCase): awsAccessKey, awsSecretKey) def test_kinesis_stream(self): -if os.environ.get('ENABLE_KINESIS_TESTS') != '1': -print("Skip test_kinesis_stream") +if not are_kinesis_tests_enabled: +sys.stderr.write( +"Skipped test_kinesis_stream (enable by setting environment variable %s=1" +% kinesis_test_environ_var) return import random @@ -1013,6 +1015,7 @@ class KinesisStreamTests(PySparkStreamingTestCase): traceback.print_exc() raise finally: +self.ssc.stop(False) kinesisTestUtils.deleteStream() kinesisTestUtils.deleteDynamoDBTable(kinesisAppName) @@ -1027,7 +1030,7 @@ def search_kafka_assembly_jar(): ("Failed to find Spark Streaming kafka assembly jar in %s. " % kafka_assembly_dir) + "You need to build Spark with " "'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or " -"'build/mvn package' before running this test") +"'build/mvn package' before running this test.") elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Kafka assembly JARs in %s; please remove all but one") % kafka_assembly_dir) @@ -1045,7 +1048,7 @@ def search_flume_assembly_jar(): ("Failed to find Spark Streaming Flume assembly jar in %s. " % flume_assembly_dir) + "You need to build Spark with " "'build/sbt assembly/assembly streaming-flume-assembly/assembly' or " -"'build/mvn package' before running this test") +"'build/mvn package' before running this test.")
elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Flume assembly JARs in %s; please remove all but one") % flume_assembly_dir) @@ -1095,11 +1098,7 @@ def search_kinesis_asl_assembly_jar(): os.path.join(kinesis_asl_assembly_dir, "target/scala-*/spark-streaming-kinesis-asl-assembly-*.jar")) if not jars: -raise Exception( -("Failed to find Spark Streaming Kinesis ASL assembly jar in %s. " % - kinesis_asl_assembly_dir) + "You need to build Spark with " -"'build/sbt -Pkinesis-asl assembly/assembly streaming-kinesis-asl-assembly/assembly' " -"or 'build/mvn -Pkinesis-asl package' before running this test") +return None elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Kinesis ASL assembly JARs in %s; please remove all but one") % kinesis_asl_assembly_dir) @@ -1107,6 +1106,10 @@
spark git commit: [SPARK-9640] [STREAMING] [TEST] Do not run Python Kinesis tests when the Kinesis assembly JAR has not been generated
Repository: spark Updated Branches: refs/heads/master 91e9389f3 -> 0f90d6055 [SPARK-9640] [STREAMING] [TEST] Do not run Python Kinesis tests when the Kinesis assembly JAR has not been generated Author: Tathagata Das tathagata.das1...@gmail.com Closes #7961 from tdas/SPARK-9640 and squashes the following commits: 974ce19 [Tathagata Das] Undo changes related to SPARK-9727 004ae26 [Tathagata Das] style fixes 9bbb97d [Tathagata Das] Minor style fies e6a677e [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9640 ca90719 [Tathagata Das] Removed extra line ba9cfc7 [Tathagata Das] Improved kinesis test selection logic 88d59bd [Tathagata Das] updated test modules 871fcc8 [Tathagata Das] Fixed SparkBuild 94be631 [Tathagata Das] Fixed style b858196 [Tathagata Das] Fixed conditions and few other things based on PR comments. e292e64 [Tathagata Das] Added filters for Kinesis python tests Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0f90d605 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0f90d605 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0f90d605 Branch: refs/heads/master Commit: 0f90d6055e5bea9ceb1d454db84f4aa1d59b284d Parents: 91e9389 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Mon Aug 10 23:41:53 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 10 23:41:53 2015 -0700 -- python/pyspark/streaming/tests.py | 56 ++ 1 file changed, 44 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0f90d605/python/pyspark/streaming/tests.py -- diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 66ae334..f0ed415 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -971,8 +971,10 @@ class KinesisStreamTests(PySparkStreamingTestCase): awsAccessKey, awsSecretKey) def test_kinesis_stream(self): -if os.environ.get('ENABLE_KINESIS_TESTS') != '1': -print("Skip test_kinesis_stream") +if not are_kinesis_tests_enabled: +sys.stderr.write( +"Skipped test_kinesis_stream (enable by setting environment variable %s=1" +% kinesis_test_environ_var) return import random @@ -1013,6 +1015,7 @@ class KinesisStreamTests(PySparkStreamingTestCase): traceback.print_exc() raise finally: +self.ssc.stop(False) kinesisTestUtils.deleteStream() kinesisTestUtils.deleteDynamoDBTable(kinesisAppName) @@ -1027,7 +1030,7 @@ def search_kafka_assembly_jar(): ("Failed to find Spark Streaming kafka assembly jar in %s. " % kafka_assembly_dir) + "You need to build Spark with " "'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or " -"'build/mvn package' before running this test") +"'build/mvn package' before running this test.") elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Kafka assembly JARs in %s; please remove all but one") % kafka_assembly_dir) @@ -1045,7 +1048,7 @@ def search_flume_assembly_jar(): ("Failed to find Spark Streaming Flume assembly jar in %s. " % flume_assembly_dir) + "You need to build Spark with " "'build/sbt assembly/assembly streaming-flume-assembly/assembly' or " -"'build/mvn package' before running this test") +"'build/mvn package' before running this test.")
elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Flume assembly JARs in %s; please remove all but one") % flume_assembly_dir) @@ -1095,11 +1098,7 @@ def search_kinesis_asl_assembly_jar(): os.path.join(kinesis_asl_assembly_dir, "target/scala-*/spark-streaming-kinesis-asl-assembly-*.jar")) if not jars: -raise Exception( -("Failed to find Spark Streaming Kinesis ASL assembly jar in %s. " % - kinesis_asl_assembly_dir) + "You need to build Spark with " -"'build/sbt -Pkinesis-asl assembly/assembly streaming-kinesis-asl-assembly/assembly' " -"or 'build/mvn -Pkinesis-asl package' before running this test") +return None elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Kinesis ASL assembly JARs in %s; please remove all but one") % kinesis_asl_assembly_dir) @@ -1107,6 +1106,10 @@ return jars[0] +# Must be same as the variable and condition defined in
spark git commit: [SPARK-9727] [STREAMING] [BUILD] Updated streaming kinesis SBT project name to be more consistent
Repository: spark Updated Branches: refs/heads/master 55752d883 -> 600031ebe [SPARK-9727] [STREAMING] [BUILD] Updated streaming kinesis SBT project name to be more consistent Author: Tathagata Das tathagata.das1...@gmail.com Closes #8092 from tdas/SPARK-9727 and squashes the following commits: b1b01fd [Tathagata Das] Updated streaming kinesis project name Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/600031eb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/600031eb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/600031eb Branch: refs/heads/master Commit: 600031ebe27473d8fffe6ea436c2149223b82896 Parents: 55752d8 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Tue Aug 11 02:41:03 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 11 02:41:03 2015 -0700 -- dev/sparktestsupport/modules.py | 4 ++-- extras/kinesis-asl/pom.xml | 2 +- project/SparkBuild.scala| 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/600031eb/dev/sparktestsupport/modules.py -- diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index d82c0cc..346452f 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -134,7 +134,7 @@ streaming = Module( # files in streaming_kinesis_asl are changed, so that if Kinesis experiences an outage, we don't # fail other PRs. streaming_kinesis_asl = Module( -name="kinesis-asl", +name="streaming-kinesis-asl", dependencies=[], source_file_regexes=[ "extras/kinesis-asl/", @@ -147,7 +147,7 @@ streaming_kinesis_asl = Module( "ENABLE_KINESIS_TESTS": "1" }, sbt_test_goals=[ -"kinesis-asl/test", +"streaming-kinesis-asl/test", ] ) http://git-wip-us.apache.org/repos/asf/spark/blob/600031eb/extras/kinesis-asl/pom.xml -- diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml index c242e7a..521b53e 100644 --- a/extras/kinesis-asl/pom.xml +++ b/extras/kinesis-asl/pom.xml @@ -31,7 +31,7 @@ <name>Spark Kinesis Integration</name> <properties> -<sbt.project.name>kinesis-asl</sbt.project.name> +<sbt.project.name>streaming-kinesis-asl</sbt.project.name> </properties> <dependencies> http://git-wip-us.apache.org/repos/asf/spark/blob/600031eb/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 41a85fa..cad7067 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -42,8 +42,8 @@ object BuildCommons { "streaming-zeromq", "launcher", "unsafe").map(ProjectRef(buildLocation, _)) val optionallyEnabledProjects@Seq(yarn, yarnStable, java8Tests, sparkGangliaLgpl, -sparkKinesisAsl) = Seq("yarn", "yarn-stable", "java8-tests", "ganglia-lgpl", -"kinesis-asl").map(ProjectRef(buildLocation, _)) +streamingKinesisAsl) = Seq("yarn", "yarn-stable", "java8-tests", "ganglia-lgpl", +"streaming-kinesis-asl").map(ProjectRef(buildLocation, _)) val assemblyProjects@Seq(assembly, examples, networkYarn, streamingFlumeAssembly, streamingKafkaAssembly, streamingMqttAssembly, streamingKinesisAslAssembly) = Seq("assembly", "examples", "network-yarn", "streaming-flume-assembly", "streaming-kafka-assembly", "streaming-mqtt-assembly", "streaming-kinesis-asl-assembly")
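A quick usage note: after this rename, the SBT goal matches the other streaming connectors, so the Kinesis tests would be invoked with something like:

```
build/sbt -Pkinesis-asl streaming-kinesis-asl/test
```

(previously `kinesis-asl/test`, as the `dev/sparktestsupport/modules.py` diff shows).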
spark git commit: [SPARK-9727] [STREAMING] [BUILD] Updated streaming kinesis SBT project name to be more consistent
Repository: spark Updated Branches: refs/heads/branch-1.5 c7f009040 -> ebbd3b616

[SPARK-9727] [STREAMING] [BUILD] Updated streaming kinesis SBT project name to be more consistent

Author: Tathagata Das tathagata.das1...@gmail.com

Closes #8092 from tdas/SPARK-9727 and squashes the following commits: b1b01fd [Tathagata Das] Updated streaming kinesis project name

(cherry picked from commit 600031ebe27473d8fffe6ea436c2149223b82896) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ebbd3b61 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ebbd3b61 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ebbd3b61 Branch: refs/heads/branch-1.5 Commit: ebbd3b616bf49701c2466bde5193241f69cf3e30 Parents: c7f0090 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Tue Aug 11 02:41:03 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 11 02:41:25 2015 -0700

dev/sparktestsupport/modules.py | 4 ++--
extras/kinesis-asl/pom.xml | 2 +-
project/SparkBuild.scala | 4 ++--
3 files changed, 5 insertions(+), 5 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/ebbd3b61/dev/sparktestsupport/modules.py

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index d82c0cc..346452f 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -134,7 +134,7 @@ streaming = Module(
 # files in streaming_kinesis_asl are changed, so that if Kinesis experiences an outage, we don't
 # fail other PRs.
 streaming_kinesis_asl = Module(
-    name="kinesis-asl",
+    name="streaming-kinesis-asl",
     dependencies=[],
     source_file_regexes=[
         "extras/kinesis-asl/",
@@ -147,7 +147,7 @@ streaming_kinesis_asl = Module(
         "ENABLE_KINESIS_TESTS": 1
     },
     sbt_test_goals=[
-        "kinesis-asl/test",
+        "streaming-kinesis-asl/test",
     ]
 )

http://git-wip-us.apache.org/repos/asf/spark/blob/ebbd3b61/extras/kinesis-asl/pom.xml

diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml
index c242e7a..521b53e 100644
--- a/extras/kinesis-asl/pom.xml
+++ b/extras/kinesis-asl/pom.xml
@@ -31,7 +31,7 @@
   <name>Spark Kinesis Integration</name>
   <properties>
-    <sbt.project.name>kinesis-asl</sbt.project.name>
+    <sbt.project.name>streaming-kinesis-asl</sbt.project.name>
   </properties>
   <dependencies>

http://git-wip-us.apache.org/repos/asf/spark/blob/ebbd3b61/project/SparkBuild.scala

diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 41a85fa..cad7067 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -42,8 +42,8 @@ object BuildCommons {
     "streaming-zeromq", "launcher", "unsafe").map(ProjectRef(buildLocation, _))

   val optionallyEnabledProjects@Seq(yarn, yarnStable, java8Tests, sparkGangliaLgpl,
-    sparkKinesisAsl) = Seq("yarn", "yarn-stable", "java8-tests", "ganglia-lgpl",
-    "kinesis-asl").map(ProjectRef(buildLocation, _))
+    streamingKinesisAsl) = Seq("yarn", "yarn-stable", "java8-tests", "ganglia-lgpl",
+    "streaming-kinesis-asl").map(ProjectRef(buildLocation, _))

   val assemblyProjects@Seq(assembly, examples, networkYarn, streamingFlumeAssembly,
     streamingKafkaAssembly, streamingMqttAssembly, streamingKinesisAslAssembly) =
     Seq("assembly", "examples", "network-yarn", "streaming-flume-assembly",
       "streaming-kafka-assembly", "streaming-mqtt-assembly", "streaming-kinesis-asl-assembly")
spark git commit: [SPARK-9810] [BUILD] Remove individual commit messages from the squash commit message
Repository: spark Updated Branches: refs/heads/master 0f90d6055 -> 55752d883

[SPARK-9810] [BUILD] Remove individual commit messages from the squash commit message

For more information, please see the JIRA ticket and the associated dev list discussion. https://issues.apache.org/jira/browse/SPARK-9810 http://apache-spark-developers-list.1001551.n3.nabble.com/discuss-Removing-individual-commit-messages-from-the-squash-commit-message-td13295.html

Author: Reynold Xin r...@databricks.com

Closes #8091 from rxin/SPARK-9810.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/55752d88 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/55752d88 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/55752d88 Branch: refs/heads/master Commit: 55752d88321925da815823f968128832de6fdbbb Parents: 0f90d60 Author: Reynold Xin r...@databricks.com Authored: Tue Aug 11 01:08:30 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Tue Aug 11 01:08:30 2015 -0700

dev/merge_spark_pr.py | 6 +-
1 file changed, 1 insertion(+), 5 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/55752d88/dev/merge_spark_pr.py

diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py
index ad4b766..b9bdec3 100755
--- a/dev/merge_spark_pr.py
+++ b/dev/merge_spark_pr.py
@@ -159,11 +159,7 @@ def merge_pr(pr_num, target_ref, title, body, pr_repo_desc):
         merge_message_flags += ["-m", message]

     # The string "Closes #%s" string is required for GitHub to correctly close the PR
-    merge_message_flags += [
-        "-m",
-        "Closes #%s from %s and squashes the following commits:" % (pr_num, pr_repo_desc)]
-    for c in commits:
-        merge_message_flags += ["-m", c]
+    merge_message_flags += ["-m", "Closes #%s from %s." % (pr_num, pr_repo_desc)]

     run_cmd(['git', 'commit', '--author="%s"' % primary_author] + merge_message_flags)
[2/2] spark git commit: [SPARK-9815] Rename PlatformDependent.UNSAFE -> Platform.
[SPARK-9815] Rename PlatformDependent.UNSAFE -> Platform.

PlatformDependent.UNSAFE is way too verbose.

Author: Reynold Xin r...@databricks.com

Closes #8094 from rxin/SPARK-9815 and squashes the following commits: 229b603 [Reynold Xin] [SPARK-9815] Rename PlatformDependent.UNSAFE -> Platform.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d378396f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d378396f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d378396f Branch: refs/heads/master Commit: d378396f86f625f006738d87fe5dbc2ff8fd913d Parents: 600031e Author: Reynold Xin r...@databricks.com Authored: Tue Aug 11 08:41:06 2015 -0700 Committer: Davies Liu davies@gmail.com Committed: Tue Aug 11 08:41:06 2015 -0700

.../serializer/DummySerializerInstance.java | 6 +-
.../unsafe/UnsafeShuffleExternalSorter.java | 22 +--
.../shuffle/unsafe/UnsafeShuffleWriter.java | 4 +-
.../spark/unsafe/map/BytesToBytesMap.java | 20 +-
.../unsafe/sort/PrefixComparators.java | 5 +-
.../unsafe/sort/UnsafeExternalSorter.java | 22 +--
.../unsafe/sort/UnsafeInMemorySorter.java | 4 +-
.../unsafe/sort/UnsafeSorterSpillReader.java | 4 +-
.../unsafe/sort/UnsafeSorterSpillWriter.java | 6 +-
.../UnsafeShuffleInMemorySorterSuite.java | 20 +-
.../map/AbstractBytesToBytesMapSuite.java | 94 +-
.../unsafe/sort/UnsafeExternalSorterSuite.java | 20 +-
.../unsafe/sort/UnsafeInMemorySorterSuite.java | 20 +-
.../catalyst/expressions/UnsafeArrayData.java | 51 ++---
.../sql/catalyst/expressions/UnsafeReaders.java | 8 +-
.../sql/catalyst/expressions/UnsafeRow.java | 108 +--
.../catalyst/expressions/UnsafeRowWriters.java | 41 ++--
.../sql/catalyst/expressions/UnsafeWriters.java | 43 ++---
.../sql/execution/UnsafeExternalRowSorter.java | 4 +-
.../expressions/codegen/CodeGenerator.scala | 4 +-
.../codegen/GenerateUnsafeProjection.scala | 32 ++--
.../codegen/GenerateUnsafeRowJoiner.scala | 16 +-
.../catalyst/expressions/stringOperations.scala | 4 +-
.../GenerateUnsafeRowJoinerBitsetSuite.scala | 4 +-
.../UnsafeFixedWidthAggregationMap.java | 4 +-
.../sql/execution/UnsafeKVExternalSorter.java | 4 +-
.../sql/execution/UnsafeRowSerializer.scala | 6 +-
.../sql/execution/joins/HashedRelation.scala | 13 +-
.../org/apache/spark/sql/UnsafeRowSuite.scala | 4 +-
.../java/org/apache/spark/unsafe/Platform.java | 173 +
.../apache/spark/unsafe/PlatformDependent.java | 187 ---
.../spark/unsafe/array/ByteArrayMethods.java | 14 +-
.../apache/spark/unsafe/array/LongArray.java | 6 +-
.../spark/unsafe/bitset/BitSetMethods.java | 19 +-
.../spark/unsafe/hash/Murmur3_x86_32.java | 4 +-
.../apache/spark/unsafe/memory/MemoryBlock.java | 4 +-
.../unsafe/memory/UnsafeMemoryAllocator.java | 6 +-
.../apache/spark/unsafe/types/ByteArray.java | 10 +-
.../apache/spark/unsafe/types/UTF8String.java | 30 ++-
.../spark/unsafe/hash/Murmur3_x86_32Suite.java | 14 +-
40 files changed, 466 insertions(+), 594 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/d378396f/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java

diff --git a/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java b/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java
index 0399abc..0e58bb4 100644
--- a/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java
+++ b/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java
@@ -25,7 +25,7 @@
 import java.nio.ByteBuffer;

 import scala.reflect.ClassTag;

 import
org.apache.spark.annotation.Private; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; /** * Unfortunately, we need a serializer instance in order to construct a DiskBlockObjectWriter. @@ -49,7 +49,7 @@ public final class DummySerializerInstance extends SerializerInstance { try { s.flush(); } catch (IOException e) { - PlatformDependent.throwException(e); + Platform.throwException(e); } } @@ -64,7 +64,7 @@ public final class DummySerializerInstance extends SerializerInstance { try { s.close(); } catch (IOException e) { - PlatformDependent.throwException(e); + Platform.throwException(e); } } }; http://git-wip-us.apache.org/repos/asf/spark/blob/d378396f/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java
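The rename is mechanical, but the class surface is worth a glance. A minimal Scala sketch of the renamed API, using only members visible at the call sites in this patch plus the matching put accessor (method availability is assumed from those call sites, not from the full Platform.java listing):

  import org.apache.spark.unsafe.Platform

  // Write a long into an on-heap byte array at its base offset, then read it back.
  val buf = new Array[Byte](8)
  Platform.putLong(buf, Platform.BYTE_ARRAY_OFFSET, 42L)
  assert(Platform.getLong(buf, Platform.BYTE_ARRAY_OFFSET) == 42L)

  // Rethrow a checked exception without declaring it, as DummySerializerInstance does above.
  def closeOrRethrow(s: java.io.Closeable): Unit =
    try s.close() catch { case e: java.io.IOException => Platform.throwException(e) }

Either way, the call sites read far better than the old PlatformDependent.UNSAFE.getLong(...) style did.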
[2/2] spark git commit: [SPARK-9815] Rename PlatformDependent.UNSAFE -> Platform.
[SPARK-9815] Rename PlatformDependent.UNSAFE -> Platform.

PlatformDependent.UNSAFE is way too verbose.

Author: Reynold Xin r...@databricks.com

Closes #8094 from rxin/SPARK-9815 and squashes the following commits: 229b603 [Reynold Xin] [SPARK-9815] Rename PlatformDependent.UNSAFE -> Platform.

(cherry picked from commit d378396f86f625f006738d87fe5dbc2ff8fd913d) Signed-off-by: Davies Liu davies@gmail.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/84ba990f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/84ba990f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/84ba990f Branch: refs/heads/branch-1.5 Commit: 84ba990f2e27ef6d05716307ebe9644c7efffee9 Parents: ebbd3b6 Author: Reynold Xin r...@databricks.com Authored: Tue Aug 11 08:41:06 2015 -0700 Committer: Davies Liu davies@gmail.com Committed: Tue Aug 11 08:41:28 2015 -0700

.../serializer/DummySerializerInstance.java | 6 +-
.../unsafe/UnsafeShuffleExternalSorter.java | 22 +--
.../shuffle/unsafe/UnsafeShuffleWriter.java | 4 +-
.../spark/unsafe/map/BytesToBytesMap.java | 20 +-
.../unsafe/sort/PrefixComparators.java | 5 +-
.../unsafe/sort/UnsafeExternalSorter.java | 22 +--
.../unsafe/sort/UnsafeInMemorySorter.java | 4 +-
.../unsafe/sort/UnsafeSorterSpillReader.java | 4 +-
.../unsafe/sort/UnsafeSorterSpillWriter.java | 6 +-
.../UnsafeShuffleInMemorySorterSuite.java | 20 +-
.../map/AbstractBytesToBytesMapSuite.java | 94 +-
.../unsafe/sort/UnsafeExternalSorterSuite.java | 20 +-
.../unsafe/sort/UnsafeInMemorySorterSuite.java | 20 +-
.../catalyst/expressions/UnsafeArrayData.java | 51 ++---
.../sql/catalyst/expressions/UnsafeReaders.java | 8 +-
.../sql/catalyst/expressions/UnsafeRow.java | 108 +--
.../catalyst/expressions/UnsafeRowWriters.java | 41 ++--
.../sql/catalyst/expressions/UnsafeWriters.java | 43 ++---
.../sql/execution/UnsafeExternalRowSorter.java | 4 +-
.../expressions/codegen/CodeGenerator.scala | 4 +-
.../codegen/GenerateUnsafeProjection.scala | 32 ++--
.../codegen/GenerateUnsafeRowJoiner.scala | 16 +-
.../catalyst/expressions/stringOperations.scala | 4 +-
.../GenerateUnsafeRowJoinerBitsetSuite.scala | 4 +-
.../UnsafeFixedWidthAggregationMap.java | 4 +-
.../sql/execution/UnsafeKVExternalSorter.java | 4 +-
.../sql/execution/UnsafeRowSerializer.scala | 6 +-
.../sql/execution/joins/HashedRelation.scala | 13 +-
.../org/apache/spark/sql/UnsafeRowSuite.scala | 4 +-
.../java/org/apache/spark/unsafe/Platform.java | 173 +
.../apache/spark/unsafe/PlatformDependent.java | 187 ---
.../spark/unsafe/array/ByteArrayMethods.java | 14 +-
.../apache/spark/unsafe/array/LongArray.java | 6 +-
.../spark/unsafe/bitset/BitSetMethods.java | 19 +-
.../spark/unsafe/hash/Murmur3_x86_32.java | 4 +-
.../apache/spark/unsafe/memory/MemoryBlock.java | 4 +-
.../unsafe/memory/UnsafeMemoryAllocator.java | 6 +-
.../apache/spark/unsafe/types/ByteArray.java | 10 +-
.../apache/spark/unsafe/types/UTF8String.java | 30 ++-
.../spark/unsafe/hash/Murmur3_x86_32Suite.java | 14 +-
40 files changed, 466 insertions(+), 594 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/84ba990f/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java

diff --git a/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java b/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java
index 0399abc..0e58bb4 100644
--- a/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java
+++
b/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java @@ -25,7 +25,7 @@ import java.nio.ByteBuffer; import scala.reflect.ClassTag; import org.apache.spark.annotation.Private; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; /** * Unfortunately, we need a serializer instance in order to construct a DiskBlockObjectWriter. @@ -49,7 +49,7 @@ public final class DummySerializerInstance extends SerializerInstance { try { s.flush(); } catch (IOException e) { - PlatformDependent.throwException(e); + Platform.throwException(e); } } @@ -64,7 +64,7 @@ public final class DummySerializerInstance extends SerializerInstance { try { s.close(); } catch (IOException e) { - PlatformDependent.throwException(e); + Platform.throwException(e); } } };
spark git commit: [SPARK-9785] [SQL] HashPartitioning compatibility should consider expression ordering
Repository: spark Updated Branches: refs/heads/master d378396f8 -> dfe347d2c

[SPARK-9785] [SQL] HashPartitioning compatibility should consider expression ordering

HashPartitioning compatibility is currently defined w.r.t. the _set_ of expressions, but the ordering of those expressions matters when computing hash codes; this could lead to incorrect answers if we mistakenly avoided a shuffle based on the assumption that HashPartitionings with the same expressions in different orders will produce equivalent row hashcodes. The first commit adds a regression test which illustrates this problem. The fix for this is simple: make `HashPartitioning.compatibleWith` and `HashPartitioning.guarantees` sensitive to the expression ordering (i.e. do not perform set comparison).

Author: Josh Rosen joshro...@databricks.com

Closes #8074 from JoshRosen/hashpartitioning-compatiblewith-fixes and squashes the following commits: b61412f [Josh Rosen] Demonstrate that I haven't cheated in my fix 0b4d7d9 [Josh Rosen] Update so that clusteringSet is only used in satisfies(). dc9c9d7 [Josh Rosen] Add failing regression test for SPARK-9785

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dfe347d2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dfe347d2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dfe347d2 Branch: refs/heads/master Commit: dfe347d2cae3eb05d7539aaf72db3d309e711213 Parents: d378396 Author: Josh Rosen joshro...@databricks.com Authored: Tue Aug 11 08:52:15 2015 -0700 Committer: Yin Huai yh...@databricks.com Committed: Tue Aug 11 08:52:15 2015 -0700

.../catalyst/plans/physical/partitioning.scala | 15 ++
.../spark/sql/catalyst/PartitioningSuite.scala | 55
2 files changed, 60 insertions(+), 10 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/dfe347d2/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
index 5a89a90..5ac3f1f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
@@ -216,26 +216,23 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int)
   override def nullable: Boolean = false
   override def dataType: DataType = IntegerType

-  lazy val clusteringSet = expressions.toSet
-
   override def satisfies(required: Distribution): Boolean = required match {
     case UnspecifiedDistribution => true
     case ClusteredDistribution(requiredClustering) =>
-      clusteringSet.subsetOf(requiredClustering.toSet)
+      expressions.toSet.subsetOf(requiredClustering.toSet)
     case _ => false
   }

   override def compatibleWith(other: Partitioning): Boolean = other match {
-    case o: HashPartitioning =>
-      this.clusteringSet == o.clusteringSet && this.numPartitions == o.numPartitions
+    case o: HashPartitioning => this == o
     case _ => false
   }

   override def guarantees(other: Partitioning): Boolean = other match {
-    case o: HashPartitioning =>
-      this.clusteringSet == o.clusteringSet && this.numPartitions == o.numPartitions
+    case o: HashPartitioning => this == o
     case _ => false
   }
+
 }

 /**
@@ -257,15 +254,13 @@ case class RangePartitioning(ordering: Seq[SortOrder], numPartitions: Int)
   override def nullable: Boolean = false
   override def dataType: DataType = IntegerType

-  private[this] lazy val clusteringSet = ordering.map(_.child).toSet
-
   override def satisfies(required: Distribution): Boolean = required match {
     case UnspecifiedDistribution => true
     case OrderedDistribution(requiredOrdering) =>
       val minSize = Seq(requiredOrdering.size, ordering.size).min
       requiredOrdering.take(minSize) == ordering.take(minSize)
     case ClusteredDistribution(requiredClustering) =>
-      clusteringSet.subsetOf(requiredClustering.toSet)
+      ordering.map(_.child).toSet.subsetOf(requiredClustering.toSet)
     case _ => false
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/dfe347d2/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala
new file mode 100644
index 000..5b802cc
--- /dev/null
+++
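Why the ordering matters: a row's partition is derived by folding a hash over the partitioning expressions in sequence, so two HashPartitionings over the same set of expressions, in different orders, can route identical rows to different partitions. A standalone Scala sketch of the effect (illustrative hash only, not the actual Catalyst code):

  // Order-sensitive combination, the same shape as hashing one column after another.
  def comboHash(cols: Seq[Any]): Int =
    cols.foldLeft(42)((h, c) => h * 31 + c.hashCode)

  val byAB = comboHash(Seq("a", 1))  // HashPartitioning ordered as (a, b)
  val byBA = comboHash(Seq(1, "a"))  // HashPartitioning ordered as (b, a)
  assert(byAB != byBA)  // same set of values, different hash, different partition

Treating the two partitionings as compatible would therefore skip a shuffle that is actually required, which is exactly the incorrect-answer scenario described above.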
[1/2] spark git commit: [SPARK-9815] Rename PlatformDependent.UNSAFE -> Platform.
Repository: spark Updated Branches: refs/heads/master 600031ebe -> d378396f8

http://git-wip-us.apache.org/repos/asf/spark/blob/d378396f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 7bd..134f1aa 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -1013,7 +1013,7 @@ case class Decode(bin: Expression, charset: Expression)
       try {
         ${ev.primitive} = UTF8String.fromString(new String($bytes, $charset.toString()));
       } catch (java.io.UnsupportedEncodingException e) {
-        org.apache.spark.unsafe.PlatformDependent.throwException(e);
+        org.apache.spark.unsafe.Platform.throwException(e);
       }
     """)
   }
@@ -1043,7 +1043,7 @@ case class Encode(value: Expression, charset: Expression)
       try {
         ${ev.primitive} = $string.toString().getBytes($charset.toString());
       } catch (java.io.UnsupportedEncodingException e) {
-        org.apache.spark.unsafe.PlatformDependent.throwException(e);
+        org.apache.spark.unsafe.Platform.throwException(e);
       }""")
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/d378396f/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala
index aff1bee..796d600 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala
@@ -22,7 +22,7 @@
 import scala.util.Random

 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.PlatformDependent
+import org.apache.spark.unsafe.Platform

 /**
  * A test suite for the bitset portion of the row concatenation.
@@ -96,7 +96,7 @@ class GenerateUnsafeRowJoinerBitsetSuite extends SparkFunSuite {
     // This way we can test the joiner when the input UnsafeRows are not the entire arrays.
val offset = numFields * 8 val buf = new Array[Byte](sizeInBytes + offset) -row.pointTo(buf, PlatformDependent.BYTE_ARRAY_OFFSET + offset, numFields, sizeInBytes) +row.pointTo(buf, Platform.BYTE_ARRAY_OFFSET + offset, numFields, sizeInBytes) row } http://git-wip-us.apache.org/repos/asf/spark/blob/d378396f/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java -- diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java index 00218f2..5cce41d 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.expressions.UnsafeRow; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; import org.apache.spark.unsafe.KVIterator; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.map.BytesToBytesMap; import org.apache.spark.unsafe.memory.MemoryLocation; import org.apache.spark.unsafe.memory.TaskMemoryManager; @@ -138,7 +138,7 @@ public final class UnsafeFixedWidthAggregationMap { unsafeGroupingKeyRow.getBaseOffset(), unsafeGroupingKeyRow.getSizeInBytes(), emptyAggregationBuffer, -PlatformDependent.BYTE_ARRAY_OFFSET, +Platform.BYTE_ARRAY_OFFSET, emptyAggregationBuffer.length ); if (!putSucceeded) { http://git-wip-us.apache.org/repos/asf/spark/blob/d378396f/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java -- diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java
[1/2] spark git commit: [SPARK-9815] Rename PlatformDependent.UNSAFE -> Platform.
Repository: spark Updated Branches: refs/heads/branch-1.5 ebbd3b616 -> 84ba990f2

http://git-wip-us.apache.org/repos/asf/spark/blob/84ba990f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 7bd..134f1aa 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -1013,7 +1013,7 @@ case class Decode(bin: Expression, charset: Expression)
       try {
         ${ev.primitive} = UTF8String.fromString(new String($bytes, $charset.toString()));
       } catch (java.io.UnsupportedEncodingException e) {
-        org.apache.spark.unsafe.PlatformDependent.throwException(e);
+        org.apache.spark.unsafe.Platform.throwException(e);
       }
     """)
   }
@@ -1043,7 +1043,7 @@ case class Encode(value: Expression, charset: Expression)
       try {
         ${ev.primitive} = $string.toString().getBytes($charset.toString());
       } catch (java.io.UnsupportedEncodingException e) {
-        org.apache.spark.unsafe.PlatformDependent.throwException(e);
+        org.apache.spark.unsafe.Platform.throwException(e);
       }""")
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/84ba990f/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala
index aff1bee..796d600 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala
@@ -22,7 +22,7 @@
 import scala.util.Random

 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.PlatformDependent
+import org.apache.spark.unsafe.Platform

 /**
  * A test suite for the bitset portion of the row concatenation.
@@ -96,7 +96,7 @@ class GenerateUnsafeRowJoinerBitsetSuite extends SparkFunSuite {
     // This way we can test the joiner when the input UnsafeRows are not the entire arrays.
val offset = numFields * 8 val buf = new Array[Byte](sizeInBytes + offset) -row.pointTo(buf, PlatformDependent.BYTE_ARRAY_OFFSET + offset, numFields, sizeInBytes) +row.pointTo(buf, Platform.BYTE_ARRAY_OFFSET + offset, numFields, sizeInBytes) row } http://git-wip-us.apache.org/repos/asf/spark/blob/84ba990f/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java -- diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java index 00218f2..5cce41d 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.expressions.UnsafeRow; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; import org.apache.spark.unsafe.KVIterator; -import org.apache.spark.unsafe.PlatformDependent; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.map.BytesToBytesMap; import org.apache.spark.unsafe.memory.MemoryLocation; import org.apache.spark.unsafe.memory.TaskMemoryManager; @@ -138,7 +138,7 @@ public final class UnsafeFixedWidthAggregationMap { unsafeGroupingKeyRow.getBaseOffset(), unsafeGroupingKeyRow.getSizeInBytes(), emptyAggregationBuffer, -PlatformDependent.BYTE_ARRAY_OFFSET, +Platform.BYTE_ARRAY_OFFSET, emptyAggregationBuffer.length ); if (!putSucceeded) { http://git-wip-us.apache.org/repos/asf/spark/blob/84ba990f/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java -- diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java
spark git commit: [SPARK-8345] [ML] Add an SQL node as a feature transformer
Repository: spark Updated Branches: refs/heads/master bce72797f -> 8cad854ef

[SPARK-8345] [ML] Add an SQL node as a feature transformer

Implements the transforms which are defined by SQL statement. Currently we only support SQL syntax like 'SELECT ... FROM __THIS__' where '__THIS__' represents the underlying table of the input dataset.

Author: Yanbo Liang yblia...@gmail.com

Closes #7465 from yanboliang/spark-8345 and squashes the following commits: b403fcb [Yanbo Liang] address comments 0d4bb15 [Yanbo Liang] a better transformSchema() implementation 51eb9e7 [Yanbo Liang] Add an SQL node as a feature transformer

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8cad854e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8cad854e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8cad854e Branch: refs/heads/master Commit: 8cad854ef6a2066de5adffcca6b79a205ccfd5f3 Parents: bce7279 Author: Yanbo Liang yblia...@gmail.com Authored: Tue Aug 11 11:01:59 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 11 11:01:59 2015 -0700

.../spark/ml/feature/SQLTransformer.scala | 72
.../spark/ml/feature/SQLTransformerSuite.scala | 44
2 files changed, 116 insertions(+)

http://git-wip-us.apache.org/repos/asf/spark/blob/8cad854e/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
new file mode 100644
index 000..95e4305
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.SparkContext
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.ml.param.{ParamMap, Param}
+import org.apache.spark.ml.Transformer
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.sql.{SQLContext, DataFrame, Row}
+import org.apache.spark.sql.types.StructType
+
+/**
+ * :: Experimental ::
+ * Implements the transforms which are defined by SQL statement.
+ * Currently we only support SQL syntax like 'SELECT ... FROM __THIS__'
+ * where '__THIS__' represents the underlying table of the input dataset.
+ */
+@Experimental
+class SQLTransformer (override val uid: String) extends Transformer {
+
+  def this() = this(Identifiable.randomUID("sql"))
+
+  /**
+   * SQL statement parameter. The statement is provided in string form.
+   * @group param
+   */
+  final val statement: Param[String] = new Param[String](this, "statement", "SQL statement")
+
+  /** @group setParam */
+  def setStatement(value: String): this.type = set(statement, value)
+
+  /** @group getParam */
+  def getStatement: String = $(statement)
+
+  private val tableIdentifier: String = "__THIS__"
+
+  override def transform(dataset: DataFrame): DataFrame = {
+    val tableName = Identifiable.randomUID(uid)
+    dataset.registerTempTable(tableName)
+    val realStatement = $(statement).replace(tableIdentifier, tableName)
+    val outputDF = dataset.sqlContext.sql(realStatement)
+    outputDF
+  }
+
+  override def transformSchema(schema: StructType): StructType = {
+    val sc = SparkContext.getOrCreate()
+    val sqlContext = SQLContext.getOrCreate(sc)
+    val dummyRDD = sc.parallelize(Seq(Row.empty))
+    val dummyDF = sqlContext.createDataFrame(dummyRDD, schema)
+    dummyDF.registerTempTable(tableIdentifier)
+    val outputSchema = sqlContext.sql($(statement)).schema
+    outputSchema
+  }
+
+  override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra)
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/8cad854e/mllib/src/test/scala/org/apache/spark/ml/feature/SQLTransformerSuite.scala

diff --git
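A usage sketch of the new transformer (the input DataFrame `df` and its columns `v1`, `v2` are hypothetical):

  import org.apache.spark.ml.feature.SQLTransformer

  // df is assumed to be an existing DataFrame with numeric columns v1 and v2.
  val sqlTrans = new SQLTransformer()
    .setStatement("SELECT *, (v1 + v2) AS v3 FROM __THIS__")
  val transformed = sqlTrans.transform(df)  // same rows, plus the derived column v3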
spark git commit: [SPARK-9572] [STREAMING] [PYSPARK] Added StreamingContext.getActiveOrCreate() in Python
Repository: spark Updated Branches: refs/heads/master dbd778d84 -> 5b8bb1b21

[SPARK-9572] [STREAMING] [PYSPARK] Added StreamingContext.getActiveOrCreate() in Python

Author: Tathagata Das tathagata.das1...@gmail.com

Closes #8080 from tdas/SPARK-9572 and squashes the following commits: 64a231d [Tathagata Das] Fix based on comments 741a0d0 [Tathagata Das] Fixed style f4f094c [Tathagata Das] Tweaked test 9afcdbe [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9572 e21488d [Tathagata Das] Minor update 1a371d9 [Tathagata Das] Addressed comments. 60479da [Tathagata Das] Fixed indent 9c2da9c [Tathagata Das] Fixed bugs b5bd32c [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9572 b55b348 [Tathagata Das] Removed prints 5781728 [Tathagata Das] Fix style issues b711214 [Tathagata Das] Reverted run-tests.py 643b59d [Tathagata Das] Revert unnecessary change 150e58c [Tathagata Das] Added StreamingContext.getActiveOrCreate() in Python

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5b8bb1b2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5b8bb1b2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5b8bb1b2 Branch: refs/heads/master Commit: 5b8bb1b213b8738f563fcd00747604410fbb3087 Parents: dbd778d Author: Tathagata Das tathagata.das1...@gmail.com Authored: Tue Aug 11 12:02:28 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 11 12:02:28 2015 -0700

python/pyspark/streaming/context.py | 57 -
python/pyspark/streaming/tests.py | 133 ---
python/run-tests.py | 2 +-
3 files changed, 177 insertions(+), 15 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/5b8bb1b2/python/pyspark/streaming/context.py

diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py
index ac5ba69..e3ba70e 100644
--- a/python/pyspark/streaming/context.py
+++ b/python/pyspark/streaming/context.py
@@ -86,6 +86,9 @@ class StreamingContext(object):

     _transformerSerializer = None

+    # Reference to a currently active StreamingContext
+    _activeContext = None
+
     def __init__(self, sparkContext, batchDuration=None, jssc=None):
         """
         Create a new StreamingContext.
@@ -142,10 +145,10 @@
         Either recreate a StreamingContext from checkpoint data or create a new StreamingContext.
         If checkpoint data exists in the provided `checkpointPath`, then StreamingContext will be
         recreated from the checkpoint data. If the data does not exist, then the provided setupFunc
-        will be used to create a JavaStreamingContext.
+        will be used to create a new context.

-        @param checkpointPath: Checkpoint directory used in an earlier JavaStreamingContext program
-        @param setupFunc: Function to create a new JavaStreamingContext and setup DStreams
+        @param checkpointPath: Checkpoint directory used in an earlier streaming program
+        @param setupFunc: Function to create a new context and setup DStreams
         """
         # TODO: support checkpoint in HDFS
         if not os.path.exists(checkpointPath) or not os.listdir(checkpointPath):
@@ -170,6 +173,52 @@
         cls._transformerSerializer.ctx = sc
         return StreamingContext(sc, None, jssc)

+    @classmethod
+    def getActive(cls):
+        """
+        Return either the currently active StreamingContext (i.e., if there is a context started
+        but not stopped) or None.
+        """
+        activePythonContext = cls._activeContext
+        if activePythonContext is not None:
+            # Verify that the current running Java StreamingContext is active and is the same one
+            # backing the supposedly active Python context
+            activePythonContextJavaId = activePythonContext._jssc.ssc().hashCode()
+            activeJvmContextOption = activePythonContext._jvm.StreamingContext.getActive()
+
+            if activeJvmContextOption.isEmpty():
+                cls._activeContext = None
+            elif activeJvmContextOption.get().hashCode() != activePythonContextJavaId:
+                cls._activeContext = None
+                raise Exception("JVM's active JavaStreamingContext is not the JavaStreamingContext "
+                                "backing the action Python StreamingContext. This is unexpected.")
+        return cls._activeContext
+
+    @classmethod
+    def getActiveOrCreate(cls, checkpointPath, setupFunc):
+        """
+        Either return the active StreamingContext (i.e. currently started but not stopped),
+        or
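The new Python method mirrors the corresponding Scala API, so the intended usage pattern can be sketched in Scala (the checkpoint path and batch interval are made up):

  import org.apache.spark.SparkConf
  import org.apache.spark.streaming.{Seconds, StreamingContext}

  def createContext(): StreamingContext = {
    val ssc = new StreamingContext(new SparkConf().setAppName("example"), Seconds(1))
    // ... define DStreams here ...
    ssc.checkpoint("/tmp/checkpoint")  // hypothetical checkpoint directory
    ssc
  }

  // Active context if one is running; else recreate from the checkpoint; else createContext().
  val ssc = StreamingContext.getActiveOrCreate("/tmp/checkpoint", createContext _)
  ssc.start()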
spark git commit: [SPARK-9572] [STREAMING] [PYSPARK] Added StreamingContext.getActiveOrCreate() in Python
Repository: spark Updated Branches: refs/heads/branch-1.5 b077f36ea -> 71460b889

[SPARK-9572] [STREAMING] [PYSPARK] Added StreamingContext.getActiveOrCreate() in Python

Author: Tathagata Das tathagata.das1...@gmail.com

Closes #8080 from tdas/SPARK-9572 and squashes the following commits: 64a231d [Tathagata Das] Fix based on comments 741a0d0 [Tathagata Das] Fixed style f4f094c [Tathagata Das] Tweaked test 9afcdbe [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9572 e21488d [Tathagata Das] Minor update 1a371d9 [Tathagata Das] Addressed comments. 60479da [Tathagata Das] Fixed indent 9c2da9c [Tathagata Das] Fixed bugs b5bd32c [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9572 b55b348 [Tathagata Das] Removed prints 5781728 [Tathagata Das] Fix style issues b711214 [Tathagata Das] Reverted run-tests.py 643b59d [Tathagata Das] Revert unnecessary change 150e58c [Tathagata Das] Added StreamingContext.getActiveOrCreate() in Python

(cherry picked from commit 5b8bb1b213b8738f563fcd00747604410fbb3087) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71460b88 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71460b88 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71460b88 Branch: refs/heads/branch-1.5 Commit: 71460b889b4fd7345706a84e26132c216625df95 Parents: b077f36 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Tue Aug 11 12:02:28 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 11 12:02:44 2015 -0700

python/pyspark/streaming/context.py | 57 -
python/pyspark/streaming/tests.py | 133 ---
python/run-tests.py | 2 +-
3 files changed, 177 insertions(+), 15 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/71460b88/python/pyspark/streaming/context.py

diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py
index ac5ba69..e3ba70e 100644
--- a/python/pyspark/streaming/context.py
+++ b/python/pyspark/streaming/context.py
@@ -86,6 +86,9 @@ class StreamingContext(object):

     _transformerSerializer = None

+    # Reference to a currently active StreamingContext
+    _activeContext = None
+
     def __init__(self, sparkContext, batchDuration=None, jssc=None):
         """
         Create a new StreamingContext.
@@ -142,10 +145,10 @@
         Either recreate a StreamingContext from checkpoint data or create a new StreamingContext.
         If checkpoint data exists in the provided `checkpointPath`, then StreamingContext will be
         recreated from the checkpoint data. If the data does not exist, then the provided setupFunc
-        will be used to create a JavaStreamingContext.
+        will be used to create a new context.

-        @param checkpointPath: Checkpoint directory used in an earlier JavaStreamingContext program
-        @param setupFunc: Function to create a new JavaStreamingContext and setup DStreams
+        @param checkpointPath: Checkpoint directory used in an earlier streaming program
+        @param setupFunc: Function to create a new context and setup DStreams
         """
         # TODO: support checkpoint in HDFS
         if not os.path.exists(checkpointPath) or not os.listdir(checkpointPath):
@@ -170,6 +173,52 @@
         cls._transformerSerializer.ctx = sc
         return StreamingContext(sc, None, jssc)

+    @classmethod
+    def getActive(cls):
+        """
+        Return either the currently active StreamingContext (i.e., if there is a context started
+        but not stopped) or None.
+        """
+        activePythonContext = cls._activeContext
+        if activePythonContext is not None:
+            # Verify that the current running Java StreamingContext is active and is the same one
+            # backing the supposedly active Python context
+            activePythonContextJavaId = activePythonContext._jssc.ssc().hashCode()
+            activeJvmContextOption = activePythonContext._jvm.StreamingContext.getActive()
+
+            if activeJvmContextOption.isEmpty():
+                cls._activeContext = None
+            elif activeJvmContextOption.get().hashCode() != activePythonContextJavaId:
+                cls._activeContext = None
+                raise Exception("JVM's active JavaStreamingContext is not the JavaStreamingContext "
+                                "backing the action Python StreamingContext. This is unexpected.")
+        return cls._activeContext
+
+    @classmethod
+    def getActiveOrCreate(cls,
spark git commit: [SPARK-8764] [ML] string indexer should take option to handle unseen values
Repository: spark Updated Branches: refs/heads/master 8cad854ef -> dbd778d84

[SPARK-8764] [ML] string indexer should take option to handle unseen values

As a precursor to adding a public constructor, add an option to handle unseen values by skipping rather than throwing an exception (the default remains throwing an exception).

Author: Holden Karau hol...@pigscanfly.ca

Closes #7266 from holdenk/SPARK-8764-string-indexer-should-take-option-to-handle-unseen-values and squashes the following commits: 38a4de9 [Holden Karau] fix long line 045bf22 [Holden Karau] Add a second b entry so b gets 0 for sure 81dd312 [Holden Karau] Update the docs for handleInvalid param to be more descriptive 7f37f6e [Holden Karau] remove extra space (scala style) 414e249 [Holden Karau] And switch to using handleInvalid instead of skipInvalid 1e53f9b [Holden Karau] update the param (codegen side) 7a22215 [Holden Karau] fix typo 100a39b [Holden Karau] Merge in master aa5b093 [Holden Karau] Since we filter we should never go down this code path if getSkipInvalid is true 75ffa69 [Holden Karau] Remove extra newline d69ef5e [Holden Karau] Add a test b5734be [Holden Karau] Add support for unseen labels afecd4e [Holden Karau] Add a param to skip invalid entries.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dbd778d8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dbd778d8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dbd778d8 Branch: refs/heads/master Commit: dbd778d84d094ca142bc08c351478595b280bc2a Parents: 8cad854 Author: Holden Karau hol...@pigscanfly.ca Authored: Tue Aug 11 11:33:36 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Tue Aug 11 11:33:36 2015 -0700

.../apache/spark/ml/feature/StringIndexer.scala | 26 +---
.../ml/param/shared/SharedParamsCodeGen.scala | 4 +++
.../spark/ml/param/shared/sharedParams.scala | 15 +
.../spark/ml/feature/StringIndexerSuite.scala | 32
4 files changed, 73 insertions(+), 4 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/dbd778d8/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
index ebfa972..e4485eb 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
@@ -33,7 +33,8 @@ import org.apache.spark.util.collection.OpenHashMap
 /**
  * Base trait for [[StringIndexer]] and [[StringIndexerModel]].
  */
-private[feature] trait StringIndexerBase extends Params with HasInputCol with HasOutputCol {
+private[feature] trait StringIndexerBase extends Params with HasInputCol with HasOutputCol
+    with HasHandleInvalid {

   /** Validates and transforms the input schema. */
   protected def validateAndTransformSchema(schema: StructType): StructType = {
@@ -66,12 +67,15 @@ class StringIndexer(override val uid: String) extends Estimator[StringIndexerMod
   def this() = this(Identifiable.randomUID("strIdx"))

   /** @group setParam */
+  def setHandleInvalid(value: String): this.type = set(handleInvalid, value)
+  setDefault(handleInvalid, "error")
+
+  /** @group setParam */
   def setInputCol(value: String): this.type = set(inputCol, value)

   /** @group setParam */
   def setOutputCol(value: String): this.type = set(outputCol, value)

-  // TODO: handle unseen labels
   override def fit(dataset: DataFrame): StringIndexerModel = {
     val counts = dataset.select(col($(inputCol)).cast(StringType))
@@ -112,6 +116,10 @@ class StringIndexerModel private[ml] (
   }

   /** @group setParam */
+  def setHandleInvalid(value: String): this.type = set(handleInvalid, value)
+  setDefault(handleInvalid, "error")
+
+  /** @group setParam */
   def setInputCol(value: String): this.type = set(inputCol, value)

   /** @group setParam */
@@ -128,14 +136,24 @@ class StringIndexerModel private[ml] (
       if (labelToIndex.contains(label)) {
         labelToIndex(label)
       } else {
-        // TODO: handle unseen labels
         throw new SparkException(s"Unseen label: $label.")
       }
     }
+
     val outputColName = $(outputCol)
     val metadata = NominalAttribute.defaultAttr
       .withName(outputColName).withValues(labels).toMetadata()
-    dataset.select(col("*"),
+    // If we are skipping invalid records, filter them out.
+    val filteredDataset = (getHandleInvalid) match {
+      case "skip" => {
+        val filterer = udf { label: String =>
+          labelToIndex.contains(label)
+        }
+
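A usage sketch with hypothetical column names and DataFrames: leaving `handleInvalid` at its default of "error" keeps the old fail-fast behavior, while "skip" silently drops rows whose label was never seen during fit():

  import org.apache.spark.ml.feature.StringIndexer

  val indexer = new StringIndexer()
    .setInputCol("category")
    .setOutputCol("categoryIndex")
    .setHandleInvalid("skip")  // default: "error"

  val model = indexer.fit(trainingDF)    // trainingDF and testDF are assumed to exist
  val indexed = model.transform(testDF)  // rows with labels unseen at fit time are filtered out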
[2/2] spark git commit: [SPARK-9646] [SQL] Add metrics for all join and aggregate operators
[SPARK-9646] [SQL] Add metrics for all join and aggregate operators

This PR added metrics for all join and aggregate operators. However, I found the metrics may be confusing in the following two cases: 1. The iterator is not totally consumed and the metric values will be less. 2. Recreating the iterators will make metric values look bigger than the size of the input source, such as `CartesianProduct`.

Author: zsxwing zsxw...@gmail.com

Closes #8060 from zsxwing/sql-metrics and squashes the following commits: 40f3fc1 [zsxwing] Mark LongSQLMetric private[metric] to avoid using incorrectly and leak memory b1b9071 [zsxwing] Merge branch 'master' into sql-metrics 4bef25a [zsxwing] Add metrics for SortMergeOuterJoin 95ccfc6 [zsxwing] Merge branch 'master' into sql-metrics 67cb4dd [zsxwing] Add metrics for Project and TungstenProject; remove metrics from PhysicalRDD and LocalTableScan 0eb47d4 [zsxwing] Merge branch 'master' into sql-metrics dd9d932 [zsxwing] Avoid creating new Iterators 589ea26 [zsxwing] Add metrics for all join and aggregate operators

(cherry picked from commit 5831294a7a8fa2524133c5d718cbc8187d2b0620) Signed-off-by: Yin Huai yh...@databricks.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/767ee188 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/767ee188 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/767ee188 Branch: refs/heads/branch-1.5 Commit: 767ee1884b8ecba3afa8ed19a562626361d54f50 Parents: 71460b8 Author: zsxwing zsxw...@gmail.com Authored: Tue Aug 11 12:39:13 2015 -0700 Committer: Yin Huai yh...@databricks.com Committed: Tue Aug 11 12:39:39 2015 -0700

.../apache/spark/sql/execution/Aggregate.scala | 11 +
.../spark/sql/execution/ExistingRDD.scala | 2 -
.../spark/sql/execution/LocalTableScan.scala | 2 -
.../apache/spark/sql/execution/SparkPlan.scala | 25 +-
.../aggregate/SortBasedAggregate.scala | 12 +-
.../SortBasedAggregationIterator.scala | 18 +-
.../execution/aggregate/TungstenAggregate.scala | 12 +-
.../aggregate/TungstenAggregationIterator.scala | 11 +-
.../spark/sql/execution/basicOperators.scala | 36 +-
.../sql/execution/joins/BroadcastHashJoin.scala | 30 +-
.../joins/BroadcastHashOuterJoin.scala | 40 +-
.../joins/BroadcastLeftSemiJoinHash.scala | 24 +-
.../joins/BroadcastNestedLoopJoin.scala | 27 +-
.../sql/execution/joins/CartesianProduct.scala | 25 +-
.../spark/sql/execution/joins/HashJoin.scala | 7 +-
.../sql/execution/joins/HashOuterJoin.scala | 30 +-
.../sql/execution/joins/HashSemiJoin.scala | 23 +-
.../sql/execution/joins/HashedRelation.scala | 8 +-
.../sql/execution/joins/LeftSemiJoinBNL.scala | 19 +-
.../sql/execution/joins/LeftSemiJoinHash.scala | 18 +-
.../sql/execution/joins/ShuffledHashJoin.scala | 16 +-
.../execution/joins/ShuffledHashOuterJoin.scala | 29 +-
.../sql/execution/joins/SortMergeJoin.scala | 21 +-
.../execution/joins/SortMergeOuterJoin.scala | 38 +-
.../spark/sql/execution/metric/SQLMetrics.scala | 6 +
.../execution/joins/HashedRelationSuite.scala | 14 +-
.../sql/execution/metric/SQLMetricsSuite.scala | 450 ++-
27 files changed, 847 insertions(+), 107 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/767ee188/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
index e8c6a0f..f3b6a3a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
@@ -25,6 +25,7 @@
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.errors._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical._
+import org.apache.spark.sql.execution.metric.SQLMetrics

 /**
  * :: DeveloperApi ::
@@ -45,6 +46,10 @@ case class Aggregate(
     child: SparkPlan)
   extends UnaryNode {

+  override private[sql] lazy val metrics = Map(
+    "numInputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of input rows"),
+    "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows"))
+
   override def requiredChildDistribution: List[Distribution] = {
     if (partial) {
       UnspecifiedDistribution :: Nil
@@ -121,12 +126,15 @@ case class Aggregate(
   }

   protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") {
+    val numInputRows = longMetric("numInputRows")
+    val numOutputRows =
[2/2] spark git commit: [SPARK-9646] [SQL] Add metrics for all join and aggregate operators
[SPARK-9646] [SQL] Add metrics for all join and aggregate operators

This PR added metrics for all join and aggregate operators. However, I found the metrics may be confusing in the following two cases: 1. The iterator is not totally consumed and the metric values will be less. 2. Recreating the iterators will make metric values look bigger than the size of the input source, such as `CartesianProduct`.

Author: zsxwing zsxw...@gmail.com

Closes #8060 from zsxwing/sql-metrics and squashes the following commits: 40f3fc1 [zsxwing] Mark LongSQLMetric private[metric] to avoid using incorrectly and leak memory b1b9071 [zsxwing] Merge branch 'master' into sql-metrics 4bef25a [zsxwing] Add metrics for SortMergeOuterJoin 95ccfc6 [zsxwing] Merge branch 'master' into sql-metrics 67cb4dd [zsxwing] Add metrics for Project and TungstenProject; remove metrics from PhysicalRDD and LocalTableScan 0eb47d4 [zsxwing] Merge branch 'master' into sql-metrics dd9d932 [zsxwing] Avoid creating new Iterators 589ea26 [zsxwing] Add metrics for all join and aggregate operators

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5831294a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5831294a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5831294a Branch: refs/heads/master Commit: 5831294a7a8fa2524133c5d718cbc8187d2b0620 Parents: 5b8bb1b Author: zsxwing zsxw...@gmail.com Authored: Tue Aug 11 12:39:13 2015 -0700 Committer: Yin Huai yh...@databricks.com Committed: Tue Aug 11 12:39:13 2015 -0700

.../apache/spark/sql/execution/Aggregate.scala | 11 +
.../spark/sql/execution/ExistingRDD.scala | 2 -
.../spark/sql/execution/LocalTableScan.scala | 2 -
.../apache/spark/sql/execution/SparkPlan.scala | 25 +-
.../aggregate/SortBasedAggregate.scala | 12 +-
.../SortBasedAggregationIterator.scala | 18 +-
.../execution/aggregate/TungstenAggregate.scala | 12 +-
.../aggregate/TungstenAggregationIterator.scala | 11 +-
.../spark/sql/execution/basicOperators.scala | 36 +-
.../sql/execution/joins/BroadcastHashJoin.scala | 30 +-
.../joins/BroadcastHashOuterJoin.scala | 40 +-
.../joins/BroadcastLeftSemiJoinHash.scala | 24 +-
.../joins/BroadcastNestedLoopJoin.scala | 27 +-
.../sql/execution/joins/CartesianProduct.scala | 25 +-
.../spark/sql/execution/joins/HashJoin.scala | 7 +-
.../sql/execution/joins/HashOuterJoin.scala | 30 +-
.../sql/execution/joins/HashSemiJoin.scala | 23 +-
.../sql/execution/joins/HashedRelation.scala | 8 +-
.../sql/execution/joins/LeftSemiJoinBNL.scala | 19 +-
.../sql/execution/joins/LeftSemiJoinHash.scala | 18 +-
.../sql/execution/joins/ShuffledHashJoin.scala | 16 +-
.../execution/joins/ShuffledHashOuterJoin.scala | 29 +-
.../sql/execution/joins/SortMergeJoin.scala | 21 +-
.../execution/joins/SortMergeOuterJoin.scala | 38 +-
.../spark/sql/execution/metric/SQLMetrics.scala | 6 +
.../execution/joins/HashedRelationSuite.scala | 14 +-
.../sql/execution/metric/SQLMetricsSuite.scala | 450 ++-
27 files changed, 847 insertions(+), 107 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/5831294a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
index e8c6a0f..f3b6a3a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
@@ -25,6 +25,7 @@
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.errors._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical._
+import org.apache.spark.sql.execution.metric.SQLMetrics

 /**
  * :: DeveloperApi ::
@@ -45,6 +46,10 @@ case class Aggregate(
     child: SparkPlan)
   extends UnaryNode {

+  override private[sql] lazy val metrics = Map(
+    "numInputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of input rows"),
+    "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows"))
+
   override def requiredChildDistribution: List[Distribution] = {
     if (partial) {
       UnspecifiedDistribution :: Nil
@@ -121,12 +126,15 @@ case class Aggregate(
   }

   protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") {
+    val numInputRows = longMetric("numInputRows")
+    val numOutputRows = longMetric("numOutputRows")
     if (groupingExpressions.isEmpty) {
       child.execute().mapPartitions { iter =>
         val buffer =
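The two caveats in the description are inherent to counting rows inside iterators rather than anything specific to this patch. A minimal standalone Scala sketch of both effects (the SQLMetrics machinery is not involved):

  var numOutputRows = 0L
  def counted[T](it: Iterator[T]): Iterator[T] = it.map { r => numOutputRows += 1; r }

  // Caveat 1: a downstream operator stops early (e.g. a limit), so the metric undercounts.
  counted(Iterator(1, 2, 3, 4)).take(2).foreach(_ => ())
  assert(numOutputRows == 2)  // four rows existed, only two were pulled

  // Caveat 2: the iterator is recreated per probe (e.g. CartesianProduct), so it overcounts.
  numOutputRows = 0L
  val rows = List(1, 2)
  (1 to 3).foreach(_ => counted(rows.iterator).foreach(_ => ()))
  assert(numOutputRows == 6)  // a two-row input counted three times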
[1/2] spark git commit: [SPARK-9646] [SQL] Add metrics for all join and aggregate operators
Repository: spark Updated Branches: refs/heads/branch-1.5 71460b889 - 767ee1884 http://git-wip-us.apache.org/repos/asf/spark/blob/767ee188/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index 953284c..7383d3f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -25,15 +25,24 @@ import com.esotericsoftware.reflectasm.shaded.org.objectweb.asm._ import com.esotericsoftware.reflectasm.shaded.org.objectweb.asm.Opcodes._ import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.test.TestSQLContext +import org.apache.spark.sql._ +import org.apache.spark.sql.execution.ui.SparkPlanGraph +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.test.{SQLTestUtils, TestSQLContext} import org.apache.spark.util.Utils +class SQLMetricsSuite extends SparkFunSuite with SQLTestUtils { -class SQLMetricsSuite extends SparkFunSuite { + override val sqlContext = TestSQLContext + + import sqlContext.implicits._ test(LongSQLMetric should not box Long) { val l = SQLMetrics.createLongMetric(TestSQLContext.sparkContext, long) -val f = () = { l += 1L } +val f = () = { + l += 1L + l.add(1L) +} BoxingFinder.getClassReader(f.getClass).foreach { cl = val boxingFinder = new BoxingFinder() cl.accept(boxingFinder, 0) @@ -51,6 +60,441 @@ class SQLMetricsSuite extends SparkFunSuite { assert(boxingFinder.boxingInvokes.nonEmpty, Found find boxing in this test) } } + + /** + * Call `df.collect()` and verify if the collected metrics are same as expectedMetrics. + * + * @param df `DataFrame` to run + * @param expectedNumOfJobs number of jobs that will run + * @param expectedMetrics the expected metrics. The format is + *`nodeId - (operatorName, metric name - metric value)`. + */ + private def testSparkPlanMetrics( + df: DataFrame, + expectedNumOfJobs: Int, + expectedMetrics: Map[Long, (String, Map[String, Any])]): Unit = { +val previousExecutionIds = TestSQLContext.listener.executionIdToData.keySet +df.collect() +TestSQLContext.sparkContext.listenerBus.waitUntilEmpty(1) +val executionIds = TestSQLContext.listener.executionIdToData.keySet.diff(previousExecutionIds) +assert(executionIds.size === 1) +val executionId = executionIds.head +val jobs = TestSQLContext.listener.getExecution(executionId).get.jobs +// Use = because there is a race condition that we may miss some jobs +// TODO Change it to = once we fix the race condition that missing the JobStarted event. +assert(jobs.size = expectedNumOfJobs) +if (jobs.size == expectedNumOfJobs) { + // If we can track all jobs, check the metric values + val metricValues = TestSQLContext.listener.getExecutionMetrics(executionId) + val actualMetrics = SparkPlanGraph(df.queryExecution.executedPlan).nodes.filter { node = +expectedMetrics.contains(node.id) + }.map { node = +val nodeMetrics = node.metrics.map { metric = + val metricValue = metricValues(metric.accumulatorId) + (metric.name, metricValue) +}.toMap +(node.id, node.name - nodeMetrics) + }.toMap + assert(expectedMetrics === actualMetrics) +} else { + // TODO Remove this else once we fix the race condition that missing the JobStarted event. + // Since we cannot track all jobs, the metric values could be wrong and we should not check + // them. 
+ logWarning("Due to a race condition, we miss some jobs and cannot verify the metric values") +} + } + + test("Project metrics") { +withSQLConf( + SQLConf.UNSAFE_ENABLED.key -> "false", + SQLConf.CODEGEN_ENABLED.key -> "false", + SQLConf.TUNGSTEN_ENABLED.key -> "false") { + // Assume the execution plan is + // PhysicalRDD(nodeId = 1) -> Project(nodeId = 0) + val df = TestData.person.select('name) + testSparkPlanMetrics(df, 1, Map( +0L -> ("Project", Map( + "number of rows" -> 2L))) + ) +} + } + + test("TungstenProject metrics") { +withSQLConf( + SQLConf.UNSAFE_ENABLED.key -> "true", + SQLConf.CODEGEN_ENABLED.key -> "true", + SQLConf.TUNGSTEN_ENABLED.key -> "true") { + // Assume the execution plan is + // PhysicalRDD(nodeId = 1) -> TungstenProject(nodeId = 0) + val df = TestData.person.select('name) + testSparkPlanMetrics(df, 1, Map( +0L -> ("TungstenProject", Map( + "number of rows" -> 2L))) + ) +} + } + + test("Filter metrics") { +// Assume the execution plan is +//
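For readers following the helper above: each metrics test in this suite builds a small DataFrame, records the expected plan shape in a comment, and hands testSparkPlanMetrics a map keyed by plan-node id. A minimal sketch of one more test in the same style — the operator name, node id, job count, and metric names below are illustrative assumptions, not values taken from this patch:

  test("Join metrics (illustrative sketch)") {
    // Hypothetical plan: two scans feeding a join node assumed to have id 0.
    val df1 = Seq((1, "a"), (2, "b")).toDF("key", "value")
    val df2 = Seq((1, "x"), (3, "y")).toDF("key", "value2")
    val joined = df1.join(df2, "key")
    // Format: nodeId -> (operatorName, metric name -> metric value).
    // Exactly one key matches, so we assume a single output row here.
    testSparkPlanMetrics(joined, 1, Map(
      0L -> ("ShuffledHashJoin", Map(
        "number of output rows" -> 1L)))
    )
  }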
spark git commit: [SPARK-9750] [MLLIB] Improve equals on SparseMatrix and DenseMatrix
Repository: spark Updated Branches: refs/heads/master 5831294a7 -> 520ad44b1 [SPARK-9750] [MLLIB] Improve equals on SparseMatrix and DenseMatrix Adds unit test for `equals` on `mllib.linalg.Matrix` class and `equals` to both `SparseMatrix` and `DenseMatrix`. Supports equality testing between `SparseMatrix` and `DenseMatrix`. mengxr Author: Feynman Liang fli...@databricks.com Closes #8042 from feynmanliang/SPARK-9750 and squashes the following commits: bb70d5e [Feynman Liang] Breeze compare for dense matrices as well, in case other is sparse ab6f3c8 [Feynman Liang] Sparse matrix compare for equals 22782df [Feynman Liang] Add equality based on matrix semantics, not representation 78f9426 [Feynman Liang] Add casts 43d28fa [Feynman Liang] Fix failing test 6416fa0 [Feynman Liang] Add failing sparse matrix equals tests Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/520ad44b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/520ad44b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/520ad44b Branch: refs/heads/master Commit: 520ad44b17f72e6465bf990f64b4e289f8a83447 Parents: 5831294 Author: Feynman Liang fli...@databricks.com Authored: Tue Aug 11 12:49:47 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Tue Aug 11 12:49:47 2015 -0700 -- .../org/apache/spark/mllib/linalg/Matrices.scala | 8 ++-- .../apache/spark/mllib/linalg/MatricesSuite.scala | 18 ++ 2 files changed, 24 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/520ad44b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 1c85834..1139ce3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -257,8 +257,7 @@ class DenseMatrix( this(numRows, numCols, values, false) override def equals(o: Any): Boolean = o match { -case m: DenseMatrix => - m.numRows == numRows && m.numCols == numCols && Arrays.equals(toArray, m.toArray) +case m: Matrix => toBreeze == m.toBreeze case _ => false } @@ -519,6 +518,11 @@ class SparseMatrix( rowIndices: Array[Int], values: Array[Double]) = this(numRows, numCols, colPtrs, rowIndices, values, false) + override def equals(o: Any): Boolean = o match { +case m: Matrix => toBreeze == m.toBreeze +case _ => false + } + private[mllib] def toBreeze: BM[Double] = { if (!isTransposed) { new BSM[Double](values, numRows, numCols, colPtrs, rowIndices) http://git-wip-us.apache.org/repos/asf/spark/blob/520ad44b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala index a270ba2..bfd6d54 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala @@ -74,6 +74,24 @@ class MatricesSuite extends SparkFunSuite { } } + test("equals") { +val dm1 = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)) +assert(dm1 === dm1) +assert(dm1 !== dm1.transpose) + +val dm2 = Matrices.dense(2, 2, Array(0.0, 2.0, 1.0, 3.0)) +assert(dm1 === dm2.transpose) + +val sm1 = dm1.asInstanceOf[DenseMatrix].toSparse +assert(sm1 === sm1) +assert(sm1 === dm1) +assert(sm1 !== sm1.transpose) + +val sm2 =
dm2.asInstanceOf[DenseMatrix].toSparse +assert(sm1 === sm2.transpose) +assert(sm1 === dm2.transpose) + } + test("matrix copies are deep copies") { val m = 3 val n = 2 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
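The practical upshot of the change above: equality is now decided by the Breeze view of the matrix contents, so a SparseMatrix and a DenseMatrix holding the same values compare equal regardless of storage layout. A small usage sketch (not part of the patch):

import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices}

object MatrixEqualsDemo {
  def main(args: Array[String]): Unit = {
    // 2x2 matrix in column-major order: first column (0, 1), second column (2, 3).
    val dense = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0))
    val sparse = dense.asInstanceOf[DenseMatrix].toSparse
    println(dense == sparse)          // true: same semantics, different representation
    println(dense == dense.transpose) // false: transposition changes the semantics
  }
}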
spark git commit: [SPARK-9750] [MLLIB] Improve equals on SparseMatrix and DenseMatrix
Repository: spark Updated Branches: refs/heads/branch-1.5 767ee1884 -> 811d23f1c [SPARK-9750] [MLLIB] Improve equals on SparseMatrix and DenseMatrix Adds unit test for `equals` on `mllib.linalg.Matrix` class and `equals` to both `SparseMatrix` and `DenseMatrix`. Supports equality testing between `SparseMatrix` and `DenseMatrix`. mengxr Author: Feynman Liang fli...@databricks.com Closes #8042 from feynmanliang/SPARK-9750 and squashes the following commits: bb70d5e [Feynman Liang] Breeze compare for dense matrices as well, in case other is sparse ab6f3c8 [Feynman Liang] Sparse matrix compare for equals 22782df [Feynman Liang] Add equality based on matrix semantics, not representation 78f9426 [Feynman Liang] Add casts 43d28fa [Feynman Liang] Fix failing test 6416fa0 [Feynman Liang] Add failing sparse matrix equals tests (cherry picked from commit 520ad44b17f72e6465bf990f64b4e289f8a83447) Signed-off-by: Joseph K. Bradley jos...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/811d23f1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/811d23f1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/811d23f1 Branch: refs/heads/branch-1.5 Commit: 811d23f1c27e7f461f0d37d058c07885fb0e0750 Parents: 767ee18 Author: Feynman Liang fli...@databricks.com Authored: Tue Aug 11 12:49:47 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Tue Aug 11 12:49:56 2015 -0700 -- .../org/apache/spark/mllib/linalg/Matrices.scala | 8 ++-- .../apache/spark/mllib/linalg/MatricesSuite.scala | 18 ++ 2 files changed, 24 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/811d23f1/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 1c85834..1139ce3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -257,8 +257,7 @@ class DenseMatrix( this(numRows, numCols, values, false) override def equals(o: Any): Boolean = o match { -case m: DenseMatrix => - m.numRows == numRows && m.numCols == numCols && Arrays.equals(toArray, m.toArray) +case m: Matrix => toBreeze == m.toBreeze case _ => false } @@ -519,6 +518,11 @@ class SparseMatrix( rowIndices: Array[Int], values: Array[Double]) = this(numRows, numCols, colPtrs, rowIndices, values, false) + override def equals(o: Any): Boolean = o match { +case m: Matrix => toBreeze == m.toBreeze +case _ => false + } + private[mllib] def toBreeze: BM[Double] = { if (!isTransposed) { new BSM[Double](values, numRows, numCols, colPtrs, rowIndices) http://git-wip-us.apache.org/repos/asf/spark/blob/811d23f1/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala index a270ba2..bfd6d54 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala @@ -74,6 +74,24 @@ class MatricesSuite extends SparkFunSuite { } } + test("equals") { +val dm1 = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)) +assert(dm1 === dm1) +assert(dm1 !== dm1.transpose) + +val dm2 = Matrices.dense(2, 2, Array(0.0, 2.0, 1.0, 3.0)) +assert(dm1 === dm2.transpose) + +val sm1 =
dm1.asInstanceOf[DenseMatrix].toSparse +assert(sm1 === sm1) +assert(sm1 === dm1) +assert(sm1 !== sm1.transpose) + +val sm2 = dm2.asInstanceOf[DenseMatrix].toSparse +assert(sm1 === sm2.transpose) +assert(sm1 === dm2.transpose) + } + test("matrix copies are deep copies") { val m = 3 val n = 2 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [HOTFIX] Fix style error caused by 017b5de
Repository: spark Updated Branches: refs/heads/master 017b5de07 -> 736af95bd [HOTFIX] Fix style error caused by 017b5de Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/736af95b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/736af95b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/736af95b Branch: refs/heads/master Commit: 736af95bd0c41723d455246b634a0fb68b38a7c7 Parents: 017b5de Author: Andrew Or and...@databricks.com Authored: Tue Aug 11 14:52:52 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Tue Aug 11 14:52:52 2015 -0700 -- mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/736af95b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 26eb84a..11ed231 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -114,7 +114,7 @@ object MLUtils { } // Convenient methods for `loadLibSVMFile`. - + /** * @since 1.0.0 */ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [HOTFIX] Fix style error caused by ef961ed48a4f45447f0e0ad256b040c7ab2d78d9
Repository: spark Updated Branches: refs/heads/branch-1.5 725e5c7a4 -> 1067c7369 [HOTFIX] Fix style error caused by ef961ed48a4f45447f0e0ad256b040c7ab2d78d9 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1067c736 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1067c736 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1067c736 Branch: refs/heads/branch-1.5 Commit: 1067c73693c52facddfb5e425e9caaf7a1cb364b Parents: 725e5c7 Author: Andrew Or and...@databricks.com Authored: Tue Aug 11 14:52:52 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Tue Aug 11 14:57:23 2015 -0700 -- mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1067c736/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 26eb84a..11ed231 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -114,7 +114,7 @@ object MLUtils { } // Convenient methods for `loadLibSVMFile`. - + /** * @since 1.0.0 */ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9831] [SQL] fix serialization with empty broadcast
Repository: spark Updated Branches: refs/heads/master 74a293f45 -> c3e9a120e [SPARK-9831] [SQL] fix serialization with empty broadcast Author: Davies Liu dav...@databricks.com Closes #8117 from davies/fix_serialization and squashes the following commits: d21ac71 [Davies Liu] fix serialization with empty broadcast Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c3e9a120 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c3e9a120 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c3e9a120 Branch: refs/heads/master Commit: c3e9a120e33159fb45cd99f3a55fc5cf16cd7c6c Parents: 74a293f Author: Davies Liu dav...@databricks.com Authored: Tue Aug 11 22:45:18 2015 -0700 Committer: Davies Liu davies@gmail.com Committed: Tue Aug 11 22:45:18 2015 -0700 -- .../spark/sql/execution/joins/HashedRelation.scala | 2 +- .../sql/execution/joins/HashedRelationSuite.scala | 17 + 2 files changed, 18 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c3e9a120/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index c1bc794..076afe6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -299,7 +299,7 @@ private[joins] final class UnsafeHashedRelation( binaryMap = new BytesToBytesMap( taskMemoryManager, shuffleMemoryManager, - nKeys * 2, // reduce hash collision + (nKeys * 1.5 + 1).toInt, // reduce hash collision pageSizeBytes) var i = 0 http://git-wip-us.apache.org/repos/asf/spark/blob/c3e9a120/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala index a1fa2c3..c635b2d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala @@ -103,4 +103,21 @@ class HashedRelationSuite extends SparkFunSuite { assert(hashed2.get(unsafeData(2)) === data2) assert(numDataRows.value.value === data.length) } + + test("test serialization empty hash map") { +val os = new ByteArrayOutputStream() +val out = new ObjectOutputStream(os) +val hashed = new UnsafeHashedRelation( + new java.util.HashMap[UnsafeRow, CompactBuffer[UnsafeRow]]) +hashed.writeExternal(out) +out.flush() +val in = new ObjectInputStream(new ByteArrayInputStream(os.toByteArray)) +val hashed2 = new UnsafeHashedRelation() +hashed2.readExternal(in) + +val schema = StructType(StructField("a", IntegerType, true) :: Nil) +val toUnsafe = UnsafeProjection.create(schema) +val row = toUnsafe(InternalRow(0)) +assert(hashed2.get(row) === null) + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
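The one-line change above matters most for the empty case: deserializing an empty relation gives nKeys == 0, so the old initial capacity nKeys * 2 asked BytesToBytesMap for a zero-sized map. The new expression always yields at least 1 while still oversizing relative to nKeys. A standalone sketch of just that arithmetic (plain Scala, mirroring the patched expression):

object InitialCapacityCheck {
  // The patched capacity expression from UnsafeHashedRelation's deserialization path.
  def initialCapacity(nKeys: Int): Int = (nKeys * 1.5 + 1).toInt

  def main(args: Array[String]): Unit = {
    println(initialCapacity(0)) // 1: the old nKeys * 2 gave 0 here and broke empty broadcasts
    println(initialCapacity(4)) // 7: still leaves headroom to reduce hash collisions
  }
}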
spark git commit: [SPARK-9831] [SQL] fix serialization with empty broadcast
Repository: spark Updated Branches: refs/heads/branch-1.5 890c75bc2 -> 7024f3eac [SPARK-9831] [SQL] fix serialization with empty broadcast Author: Davies Liu dav...@databricks.com Closes #8117 from davies/fix_serialization and squashes the following commits: d21ac71 [Davies Liu] fix serialization with empty broadcast (cherry picked from commit c3e9a120e33159fb45cd99f3a55fc5cf16cd7c6c) Signed-off-by: Davies Liu davies@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7024f3ea Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7024f3ea Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7024f3ea Branch: refs/heads/branch-1.5 Commit: 7024f3eac7b5133ff7a75171509a09ca0c367f5e Parents: 890c75b Author: Davies Liu dav...@databricks.com Authored: Tue Aug 11 22:45:18 2015 -0700 Committer: Davies Liu davies@gmail.com Committed: Tue Aug 11 22:45:41 2015 -0700 -- .../spark/sql/execution/joins/HashedRelation.scala | 2 +- .../sql/execution/joins/HashedRelationSuite.scala | 17 + 2 files changed, 18 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7024f3ea/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index c1bc794..076afe6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -299,7 +299,7 @@ private[joins] final class UnsafeHashedRelation( binaryMap = new BytesToBytesMap( taskMemoryManager, shuffleMemoryManager, - nKeys * 2, // reduce hash collision + (nKeys * 1.5 + 1).toInt, // reduce hash collision pageSizeBytes) var i = 0 http://git-wip-us.apache.org/repos/asf/spark/blob/7024f3ea/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala index a1fa2c3..c635b2d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala @@ -103,4 +103,21 @@ class HashedRelationSuite extends SparkFunSuite { assert(hashed2.get(unsafeData(2)) === data2) assert(numDataRows.value.value === data.length) } + + test("test serialization empty hash map") { +val os = new ByteArrayOutputStream() +val out = new ObjectOutputStream(os) +val hashed = new UnsafeHashedRelation( + new java.util.HashMap[UnsafeRow, CompactBuffer[UnsafeRow]]) +hashed.writeExternal(out) +out.flush() +val in = new ObjectInputStream(new ByteArrayInputStream(os.toByteArray)) +val hashed2 = new UnsafeHashedRelation() +hashed2.readExternal(in) + +val schema = StructType(StructField("a", IntegerType, true) :: Nil) +val toUnsafe = UnsafeProjection.create(schema) +val row = toUnsafe(InternalRow(0)) +assert(hashed2.get(row) === null) + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9854] [SQL] RuleExecutor.timeMap should be thread-safe
Repository: spark Updated Branches: refs/heads/master c3e9a120e -> b1581ac28 [SPARK-9854] [SQL] RuleExecutor.timeMap should be thread-safe `RuleExecutor.timeMap` is currently a non-thread-safe mutable HashMap; this can lead to infinite loops if multiple threads are concurrently modifying the map. I believe that this is responsible for some hangs that I've observed in HiveQuerySuite. This patch addresses this by using a Guava `AtomicLongMap`. Author: Josh Rosen joshro...@databricks.com Closes #8120 from JoshRosen/rule-executor-time-map-fix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b1581ac2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b1581ac2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b1581ac2 Branch: refs/heads/master Commit: b1581ac28840a4d2209ef8bb5c9f8700b4c1b286 Parents: c3e9a12 Author: Josh Rosen joshro...@databricks.com Authored: Tue Aug 11 22:46:59 2015 -0700 Committer: Josh Rosen joshro...@databricks.com Committed: Tue Aug 11 22:46:59 2015 -0700 -- .../spark/sql/catalyst/rules/RuleExecutor.scala | 15 +-- 1 file changed, 9 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b1581ac2/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala index 8b82451..f80d2a9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala @@ -17,22 +17,25 @@ package org.apache.spark.sql.catalyst.rules +import scala.collection.JavaConverters._ + +import com.google.common.util.concurrent.AtomicLongMap + import org.apache.spark.Logging import org.apache.spark.sql.catalyst.trees.TreeNode import org.apache.spark.sql.catalyst.util.sideBySide -import scala.collection.mutable - object RuleExecutor { - protected val timeMap = new mutable.HashMap[String, Long].withDefault(_ => 0) + protected val timeMap = AtomicLongMap.create[String]() /** Resets statistics about time spent running specific rules */ def resetTime(): Unit = timeMap.clear() /** Dump statistics about time spent running specific rules. */ def dumpTimeSpent(): String = { -val maxSize = timeMap.keys.map(_.toString.length).max -timeMap.toSeq.sortBy(_._2).reverseMap { case (k, v) => +val map = timeMap.asMap().asScala +val maxSize = map.keys.map(_.toString.length).max +map.toSeq.sortBy(_._2).reverseMap { case (k, v) => s"${k.padTo(maxSize, " ").mkString} $v" }.mkString("\n") } @@ -79,7 +82,7 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { val startTime = System.nanoTime() val result = rule(plan) val runTime = System.nanoTime() - startTime -RuleExecutor.timeMap(rule.ruleName) = RuleExecutor.timeMap(rule.ruleName) + runTime +RuleExecutor.timeMap.addAndGet(rule.ruleName, runTime) if (!result.fastEquals(plan)) { logTrace( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
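The reason the Guava map fixes the hang is that addAndGet is atomic per key, so concurrent rule executions can accumulate timings without external locking. A minimal standalone sketch of the pattern (assuming only Guava on the classpath, which Spark already ships):

import com.google.common.util.concurrent.AtomicLongMap

object TimingMapDemo {
  private val timeMap = AtomicLongMap.create[String]()

  def main(args: Array[String]): Unit = {
    // Eight threads updating the same key concurrently; no synchronized blocks needed.
    val threads = (1 to 8).map { _ =>
      new Thread(new Runnable {
        override def run(): Unit = { timeMap.addAndGet("SomeRule", 10L) }
      })
    }
    threads.foreach(_.start())
    threads.foreach(_.join())
    println(timeMap.get("SomeRule")) // 80
  }
}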
[2/2] spark git commit: Preparing development version 1.5.0-SNAPSHOT
Preparing development version 1.5.0-SNAPSHOT Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b7497e3a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b7497e3a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b7497e3a Branch: refs/heads/branch-1.5 Commit: b7497e3a27205cdc5a7069eaeba3fd03d9e55332 Parents: 158b2ea Author: Patrick Wendell pwend...@gmail.com Authored: Tue Aug 11 18:07:34 2015 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Tue Aug 11 18:07:34 2015 -0700 -- assembly/pom.xml| 2 +- bagel/pom.xml | 2 +- core/pom.xml| 2 +- examples/pom.xml| 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt-assembly/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml| 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl-assembly/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml| 2 +- pom.xml | 2 +- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml| 2 +- 33 files changed, 33 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b7497e3a/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 3ef7d6f..e9c6d26 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/b7497e3a/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index 684e07b..ed5c37e 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/b7497e3a/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index bb25652..0e53a79 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/b7497e3a/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index 9ef1eda..e6884b0 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/b7497e3a/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 6377c3e..1318959 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/b7497e3a/external/flume-sink/pom.xml -- diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index
spark git commit: [SPARK-9849] [SQL] DirectParquetOutputCommitter qualified name should be backward compatible
Repository: spark Updated Branches: refs/heads/master 5a5bbc299 -> afa757c98 [SPARK-9849] [SQL] DirectParquetOutputCommitter qualified name should be backward compatible DirectParquetOutputCommitter was moved in SPARK-9763. However, users can explicitly set the class as a config option, so we must be able to resolve the old committer qualified name. Author: Reynold Xin r...@databricks.com Closes #8114 from rxin/SPARK-9849. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/afa757c9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/afa757c9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/afa757c9 Branch: refs/heads/master Commit: afa757c98c537965007cad4c61c436887f3ac6a6 Parents: 5a5bbc2 Author: Reynold Xin r...@databricks.com Authored: Tue Aug 11 18:08:49 2015 -0700 Committer: Yin Huai yh...@databricks.com Committed: Tue Aug 11 18:08:49 2015 -0700 -- .../datasources/parquet/ParquetRelation.scala | 7 + .../datasources/parquet/ParquetIOSuite.scala| 27 +++- 2 files changed, 33 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/afa757c9/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala index 4086a13..c71c69b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala @@ -209,6 +209,13 @@ private[sql] class ParquetRelation( override def prepareJobForWrite(job: Job): OutputWriterFactory = { val conf = ContextUtil.getConfiguration(job) +// SPARK-9849 DirectParquetOutputCommitter qualified name should be backward compatible +val committerClassname = conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) +if (committerClassname == "org.apache.spark.sql.parquet.DirectParquetOutputCommitter") { + conf.set(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key, +classOf[DirectParquetOutputCommitter].getCanonicalName) +} + val committerClass = conf.getClass( SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key, http://git-wip-us.apache.org/repos/asf/spark/blob/afa757c9/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index ee925af..cb16634 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -390,7 +390,32 @@ class ParquetIOSuite extends QueryTest with ParquetTest { } } - test("SPARK-8121: spark.sql.parquet.output.committer.class shouldn't be overriden") { + test("SPARK-9849 DirectParquetOutputCommitter qualified name should be backward compatible") { +val clonedConf = new Configuration(configuration) + +// Write to a parquet file and let it fail. +// _temporary should be missing if direct output committer works.
+try { + configuration.set("spark.sql.parquet.output.committer.class", +"org.apache.spark.sql.parquet.DirectParquetOutputCommitter") + sqlContext.udf.register("div0", (x: Int) => x / 0) + withTempPath { dir => +intercept[org.apache.spark.SparkException] { + sqlContext.sql("select div0(1)").write.parquet(dir.getCanonicalPath) +} +val path = new Path(dir.getCanonicalPath, "_temporary") +val fs = path.getFileSystem(configuration) +assert(!fs.exists(path)) + } +} finally { + // Hadoop 1 doesn't have `Configuration.unset` + configuration.clear() + clonedConf.foreach(entry => configuration.set(entry.getKey, entry.getValue)) +} + } + + + test("SPARK-8121: spark.sql.parquet.output.committer.class shouldn't be overridden") { withTempPath { dir => val clonedConf = new Configuration(configuration) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
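The shim above amounts to a one-entry alias table: if a user's configuration still names the pre-SPARK-9763 location of the committer, the value is rewritten to the class's new home before it is instantiated. A hedged standalone sketch of the idea — the helper name is illustrative, not the patch's API, and the new package is assumed from ParquetRelation.scala's own location:

object CommitterNameCompat {
  // The single deprecated-to-current alias the patch handles.
  private val aliases = Map(
    "org.apache.spark.sql.parquet.DirectParquetOutputCommitter" ->
      "org.apache.spark.sql.execution.datasources.parquet.DirectParquetOutputCommitter")

  // Resolve a possibly-deprecated committer class name to its current one.
  def resolve(className: String): String = aliases.getOrElse(className, className)

  def main(args: Array[String]): Unit = {
    println(resolve("org.apache.spark.sql.parquet.DirectParquetOutputCommitter"))
    println(resolve("com.example.CustomCommitter")) // unknown names pass through unchanged
  }
}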
spark git commit: [SPARK-9849] [SQL] DirectParquetOutputCommitter qualified name should be backward compatible
Repository: spark Updated Branches: refs/heads/branch-1.5 b7497e3a2 -> ec7a4b9b0 [SPARK-9849] [SQL] DirectParquetOutputCommitter qualified name should be backward compatible DirectParquetOutputCommitter was moved in SPARK-9763. However, users can explicitly set the class as a config option, so we must be able to resolve the old committer qualified name. Author: Reynold Xin r...@databricks.com Closes #8114 from rxin/SPARK-9849. (cherry picked from commit afa757c98c537965007cad4c61c436887f3ac6a6) Signed-off-by: Yin Huai yh...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ec7a4b9b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ec7a4b9b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ec7a4b9b Branch: refs/heads/branch-1.5 Commit: ec7a4b9b0b1183965e086f724877d69bccbdbcbe Parents: b7497e3 Author: Reynold Xin r...@databricks.com Authored: Tue Aug 11 18:08:49 2015 -0700 Committer: Yin Huai yh...@databricks.com Committed: Tue Aug 11 18:09:05 2015 -0700 -- .../datasources/parquet/ParquetRelation.scala | 7 + .../datasources/parquet/ParquetIOSuite.scala| 27 +++- 2 files changed, 33 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ec7a4b9b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala index 4086a13..c71c69b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala @@ -209,6 +209,13 @@ private[sql] class ParquetRelation( override def prepareJobForWrite(job: Job): OutputWriterFactory = { val conf = ContextUtil.getConfiguration(job) +// SPARK-9849 DirectParquetOutputCommitter qualified name should be backward compatible +val committerClassname = conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) +if (committerClassname == "org.apache.spark.sql.parquet.DirectParquetOutputCommitter") { + conf.set(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key, +classOf[DirectParquetOutputCommitter].getCanonicalName) +} + val committerClass = conf.getClass( SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key, http://git-wip-us.apache.org/repos/asf/spark/blob/ec7a4b9b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index ee925af..cb16634 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -390,7 +390,32 @@ class ParquetIOSuite extends QueryTest with ParquetTest { } } - test("SPARK-8121: spark.sql.parquet.output.committer.class shouldn't be overriden") { + test("SPARK-9849 DirectParquetOutputCommitter qualified name should be backward compatible") { +val clonedConf = new Configuration(configuration) + +// Write to a parquet file and let it fail. +// _temporary should be missing if direct output committer works.
+try { + configuration.set("spark.sql.parquet.output.committer.class", +"org.apache.spark.sql.parquet.DirectParquetOutputCommitter") + sqlContext.udf.register("div0", (x: Int) => x / 0) + withTempPath { dir => +intercept[org.apache.spark.SparkException] { + sqlContext.sql("select div0(1)").write.parquet(dir.getCanonicalPath) +} +val path = new Path(dir.getCanonicalPath, "_temporary") +val fs = path.getFileSystem(configuration) +assert(!fs.exists(path)) + } +} finally { + // Hadoop 1 doesn't have `Configuration.unset` + configuration.clear() + clonedConf.foreach(entry => configuration.set(entry.getKey, entry.getValue)) +} + } + + + test("SPARK-8121: spark.sql.parquet.output.committer.class shouldn't be overridden") { withTempPath { dir => val clonedConf = new Configuration(configuration) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9713] [ML] Document SparkR MLlib glm() integration in Spark 1.5
Repository: spark Updated Branches: refs/heads/branch-1.5 6ea33f5bf -> 890c75bc2 [SPARK-9713] [ML] Document SparkR MLlib glm() integration in Spark 1.5 This documents the use of R model formulae in the SparkR guide. Also fixes some bugs in the R api doc. mengxr Author: Eric Liang e...@databricks.com Closes #8085 from ericl/docs. (cherry picked from commit 74a293f4537c6982345166f8883538f81d850872) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/890c75bc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/890c75bc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/890c75bc Branch: refs/heads/branch-1.5 Commit: 890c75bc2c2e1405c00485a98c034342122b639f Parents: 6ea33f5 Author: Eric Liang e...@databricks.com Authored: Tue Aug 11 21:26:03 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 11 21:26:12 2015 -0700 -- R/pkg/R/generics.R | 4 ++-- R/pkg/R/mllib.R| 8 docs/sparkr.md | 37 - 3 files changed, 42 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/890c75bc/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index c43b947..379a78b 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -535,8 +535,8 @@ setGeneric("showDF", function(x,...) { standardGeneric("showDF") }) #' @export setGeneric("summarize", function(x,...) { standardGeneric("summarize") }) -##' rdname summary -##' @export +#' @rdname summary +#' @export setGeneric("summary", function(x, ...) { standardGeneric("summary") }) # @rdname tojson http://git-wip-us.apache.org/repos/asf/spark/blob/890c75bc/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index b524d1f..cea3d76 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -56,10 +56,10 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFram #' #' Makes predictions from a model produced by glm(), similarly to R's predict(). #' -#' @param model A fitted MLlib model +#' @param object A fitted MLlib model #' @param newData DataFrame for testing #' @return DataFrame containing predicted values -#' @rdname glm +#' @rdname predict #' @export #' @examples #'\dontrun{ @@ -76,10 +76,10 @@ setMethod("predict", signature(object = "PipelineModel"), #' #' Returns the summary of a model produced by glm(), similarly to R's summary(). #' -#' @param model A fitted MLlib model +#' @param x A fitted MLlib model #' @return a list with a 'coefficient' component, which is the matrix of coefficients. See #' summary.glm for more information. -#' @rdname glm +#' @rdname summary #' @export #' @examples #'\dontrun{ http://git-wip-us.apache.org/repos/asf/spark/blob/890c75bc/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index 4385a4e..7139d16 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -11,7 +11,8 @@ title: SparkR (R on Spark) SparkR is an R package that provides a light-weight frontend to use Apache Spark from R. In Spark {{site.SPARK_VERSION}}, SparkR provides a distributed data frame implementation that supports operations like selection, filtering, aggregation etc. (similar to R data frames, -[dplyr](https://github.com/hadley/dplyr)) but on large datasets. +[dplyr](https://github.com/hadley/dplyr)) but on large datasets. SparkR also supports distributed +machine learning using MLlib.
# SparkR DataFrames @@ -230,3 +231,37 @@ head(teenagers) {% endhighlight %} </div> + +# Machine Learning + +SparkR allows the fitting of generalized linear models over DataFrames using the [glm()](api/R/glm.html) function. Under the hood, SparkR uses MLlib to train a model of the specified family. Currently the gaussian and binomial families are supported. We support a subset of the available R formula operators for model fitting, including '~', '.', '+', and '-'. The example below shows how to build a gaussian GLM model using SparkR. + +<div data-lang="r" markdown="1"> +{% highlight r %} +# Create the DataFrame +df <- createDataFrame(sqlContext, iris) + +# Fit a linear model over the dataset. +model <- glm(Sepal_Length ~ Sepal_Width + Species, data = df, family = "gaussian") + +# Model coefficients are returned in a similar format to R's native glm(). +summary(model) +##$coefficients +##Estimate +##(Intercept)2.2513930 +##Sepal_Width0.8035609 +##Species_versicolor 1.4587432 +##Species_virginica 1.9468169 + +# Make predictions based on the model. +predictions <- predict(model, newData = df)
spark git commit: [SPARK-9713] [ML] Document SparkR MLlib glm() integration in Spark 1.5
Repository: spark Updated Branches: refs/heads/master 3ef0f3292 -> 74a293f45 [SPARK-9713] [ML] Document SparkR MLlib glm() integration in Spark 1.5 This documents the use of R model formulae in the SparkR guide. Also fixes some bugs in the R api doc. mengxr Author: Eric Liang e...@databricks.com Closes #8085 from ericl/docs. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/74a293f4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/74a293f4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/74a293f4 Branch: refs/heads/master Commit: 74a293f4537c6982345166f8883538f81d850872 Parents: 3ef0f32 Author: Eric Liang e...@databricks.com Authored: Tue Aug 11 21:26:03 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 11 21:26:03 2015 -0700 -- R/pkg/R/generics.R | 4 ++-- R/pkg/R/mllib.R| 8 docs/sparkr.md | 37 - 3 files changed, 42 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/74a293f4/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index c43b947..379a78b 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -535,8 +535,8 @@ setGeneric("showDF", function(x,...) { standardGeneric("showDF") }) #' @export setGeneric("summarize", function(x,...) { standardGeneric("summarize") }) -##' rdname summary -##' @export +#' @rdname summary +#' @export setGeneric("summary", function(x, ...) { standardGeneric("summary") }) # @rdname tojson http://git-wip-us.apache.org/repos/asf/spark/blob/74a293f4/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index b524d1f..cea3d76 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -56,10 +56,10 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFram #' #' Makes predictions from a model produced by glm(), similarly to R's predict(). #' -#' @param model A fitted MLlib model +#' @param object A fitted MLlib model #' @param newData DataFrame for testing #' @return DataFrame containing predicted values -#' @rdname glm +#' @rdname predict #' @export #' @examples #'\dontrun{ @@ -76,10 +76,10 @@ setMethod("predict", signature(object = "PipelineModel"), #' #' Returns the summary of a model produced by glm(), similarly to R's summary(). #' -#' @param model A fitted MLlib model +#' @param x A fitted MLlib model #' @return a list with a 'coefficient' component, which is the matrix of coefficients. See #' summary.glm for more information. -#' @rdname glm +#' @rdname summary #' @export #' @examples #'\dontrun{ http://git-wip-us.apache.org/repos/asf/spark/blob/74a293f4/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index 4385a4e..7139d16 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -11,7 +11,8 @@ title: SparkR (R on Spark) SparkR is an R package that provides a light-weight frontend to use Apache Spark from R. In Spark {{site.SPARK_VERSION}}, SparkR provides a distributed data frame implementation that supports operations like selection, filtering, aggregation etc. (similar to R data frames, -[dplyr](https://github.com/hadley/dplyr)) but on large datasets. +[dplyr](https://github.com/hadley/dplyr)) but on large datasets. SparkR also supports distributed +machine learning using MLlib. # SparkR DataFrames @@ -230,3 +231,37 @@ head(teenagers) {% endhighlight %} </div> + +# Machine Learning + +SparkR allows the fitting of generalized linear models over DataFrames using the [glm()](api/R/glm.html) function. Under the hood, SparkR uses MLlib to train a model of the specified family.
Currently the gaussian and binomial families are supported. We support a subset of the available R formula operators for model fitting, including '~', '.', '+', and '-'. The example below shows how to build a gaussian GLM model using SparkR. + +<div data-lang="r" markdown="1"> +{% highlight r %} +# Create the DataFrame +df <- createDataFrame(sqlContext, iris) + +# Fit a linear model over the dataset. +model <- glm(Sepal_Length ~ Sepal_Width + Species, data = df, family = "gaussian") + +# Model coefficients are returned in a similar format to R's native glm(). +summary(model) +##$coefficients +##Estimate +##(Intercept)2.2513930 +##Sepal_Width0.8035609 +##Species_versicolor 1.4587432 +##Species_virginica 1.9468169 + +# Make predictions based on the model. +predictions <- predict(model, newData = df) +head(select(predictions, "Sepal_Length", "prediction")) +## Sepal_Length prediction +##1 5.1 5.063856 +##2
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.5.0-snapshot-20150811 [deleted] e9329ef6a - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-1517] Refactor release scripts to facilitate nightly publishing
Repository: spark Updated Branches: refs/heads/branch-1.5 0119edf52 -> 6ea33f5bf [SPARK-1517] Refactor release scripts to facilitate nightly publishing This update contains some code changes to the release scripts that allow easier nightly publishing. I've been using these new scripts on Jenkins for cutting and publishing nightly snapshots for the last month or so, and it has been going well. I'd like to get them merged back upstream so this can be maintained by the community. The main changes are: 1. Separates the release tagging from various build possibilities for an already tagged release (`release-tag.sh` and `release-build.sh`). 2. Allow for injecting credentials through the environment, including GPG keys. This is then paired with secure key injection in Jenkins. 3. Support for copying build results to a remote directory, and also rotating results, e.g. the ability to keep the last N copies of binary or doc builds. I'm happy if anyone wants to take a look at this - it's not user facing but an internal utility used for generating releases. Author: Patrick Wendell patr...@databricks.com Closes #7411 from pwendell/release-script-updates and squashes the following commits: 74f9beb [Patrick Wendell] Moving maven build command to a variable 233ce85 [Patrick Wendell] [SPARK-1517] Refactor release scripts to facilitate nightly publishing (cherry picked from commit 3ef0f32928fc383ad3edd5ad167212aeb9eba6e1) Signed-off-by: Patrick Wendell patr...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6ea33f5b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6ea33f5b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6ea33f5b Branch: refs/heads/branch-1.5 Commit: 6ea33f5bf1bf5577d99951b77e473c2b1479ac5c Parents: 0119edf Author: Patrick Wendell patr...@databricks.com Authored: Tue Aug 11 21:16:48 2015 -0700 Committer: Patrick Wendell patr...@databricks.com Committed: Tue Aug 11 21:16:59 2015 -0700 -- dev/create-release/create-release.sh | 267 - dev/create-release/release-build.sh | 321 ++ dev/create-release/release-tag.sh| 79 3 files changed, 400 insertions(+), 267 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6ea33f5b/dev/create-release/create-release.sh -- diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh deleted file mode 100755 index 4311c8c..000 --- a/dev/create-release/create-release.sh +++ /dev/null @@ -1,267 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -#http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Quick-and-dirty automation of making maven and binary releases. Not robust at all. -# Publishes releases to Maven and packages/copies binary release artifacts. -# Expects to be run in a totally empty directory.
-# -# Options: -# --skip-create-release Assume the desired release tag already exists -# --skip-publish Do not publish to Maven central -# --skip-package Do not package and upload binary artifacts -# Would be nice to add: -# - Send output to stderr and have useful logging in stdout - -# Note: The following variables must be set before use! -ASF_USERNAME=${ASF_USERNAME:-pwendell} -ASF_PASSWORD=${ASF_PASSWORD:-XXX} -GPG_PASSPHRASE=${GPG_PASSPHRASE:-XXX} -GIT_BRANCH=${GIT_BRANCH:-branch-1.0} -RELEASE_VERSION=${RELEASE_VERSION:-1.2.0} -# Allows publishing under a different version identifier than -# was present in the actual release sources (e.g. rc-X) -PUBLISH_VERSION=${PUBLISH_VERSION:-$RELEASE_VERSION} -NEXT_VERSION=${NEXT_VERSION:-1.2.1} -RC_NAME=${RC_NAME:-rc2} - -M2_REPO=~/.m2/repository -SPARK_REPO=$M2_REPO/org/apache/spark -NEXUS_ROOT=https://repository.apache.org/service/local/staging -NEXUS_PROFILE=d63f592e7eac0 # Profile for Spark staging uploads - -if [ -z "$JAVA_HOME" ]; then - echo "Error: JAVA_HOME is not set, cannot proceed." - exit -1 -fi -JAVA_7_HOME=${JAVA_7_HOME:-$JAVA_HOME} -