spark git commit: [SPARK-10809][MLLIB] Single-document topicDistributions method for LocalLDAModel
Repository: spark Updated Branches: refs/heads/master 4f8eefa36 -> bbea88852 [SPARK-10809][MLLIB] Single-document topicDistributions method for LocalLDAModel jira: https://issues.apache.org/jira/browse/SPARK-10809 We could provide a single-document topicDistributions method for LocalLDAModel to allow for quick queries which avoid RDD operations. Currently, the user must use an RDD of documents. It also adds some missing asserts to the test suite. Author: Yuhao Yang. Closes #9484 from hhbyyh/ldaTopicPre. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bbea8885 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bbea8885 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bbea8885 Branch: refs/heads/master Commit: bbea88852ce6a3127d071ca40dbca2d042f9fbcf Parents: 4f8eefa Author: Yuhao Yang Authored: Mon Jan 11 14:55:44 2016 -0800 Committer: Joseph K. Bradley Committed: Mon Jan 11 14:55:44 2016 -0800 -- .../spark/mllib/clustering/LDAModel.scala | 26 .../spark/mllib/clustering/LDASuite.scala | 15 --- 2 files changed, 38 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bbea8885/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 2fce3ff..b30ecb8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -388,6 +388,32 @@ class LocalLDAModel private[spark] ( } /** + * Predicts the topic mixture distribution for a document (often called "theta" in the + * literature). Returns a vector of zeros for an empty document. + * + * Note this means to allow quick query for single document. For batch documents, please refer + * to [[topicDistributions()]] to avoid overhead.
+ * + * @param document document to predict topic mixture distributions for + * @return topic mixture distribution for the document + */ + @Since("2.0.0") + def topicDistribution(document: Vector): Vector = { +val expElogbeta = exp(LDAUtils.dirichletExpectation(topicsMatrix.toBreeze.toDenseMatrix.t).t) +if (document.numNonzeros == 0) { + Vectors.zeros(this.k) +} else { + val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference( +document, +expElogbeta, +this.docConcentration.toBreeze, +gammaShape, +this.k) + Vectors.dense(normalize(gamma, 1.0).toArray) +} + } + + /** * Java-friendly version of [[topicDistributions]] */ @Since("1.4.1") http://git-wip-us.apache.org/repos/asf/spark/blob/bbea8885/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index faef60e..ea23196 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -366,7 +366,8 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { (0, 0.99504), (1, 0.99504), (1, 0.99504), (1, 0.99504)) -val actualPredictions = ldaModel.topicDistributions(docs).map { case (id, topics) => +val actualPredictions = ldaModel.topicDistributions(docs).cache() +val topTopics = actualPredictions.map { case (id, topics) => // convert results to expectedPredictions format, which only has highest probability topic val topicsBz = topics.toBreeze.toDenseVector (id, (argmax(topicsBz), max(topicsBz))) @@ -374,9 +375,17 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { .values .collect() -expectedPredictions.zip(actualPredictions).forall { case (expected, actual) => - expected._1 === actual._1 && (expected._2 ~== actual._2 relTol 1E-3D) +expectedPredictions.zip(topTopics).foreach { case (expected, actual) => + assert(expected._1 === actual._1 && (expected._2 ~== actual._2 relTol 1E-3D)) } + +docs.collect() + .map(doc => ldaModel.topicDistribution(doc._2)) + .zip(actualPredictions.map(_._2).collect()) + .foreach { case (single, batch) => +assert(single ~== batch relTol 1E-3D) + } +actualPredictions.unpersist() } test("OnlineLDAOptimizer with asymmetric prior") {
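Editor's note: a minimal usage sketch of the API added above. Only `topicDistribution` / `topicDistributions` come from this patch; the training calls, names, and the sample document below are illustrative assumptions.

```
import org.apache.spark.mllib.clustering.{LDA, LocalLDAModel}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD

// corpus: RDD[(Long, Vector)] of (document id, term-count vector), assumed to exist already.
def queryExample(corpus: RDD[(Long, Vector)]): Unit = {
  // The online optimizer produces a LocalLDAModel.
  val model = new LDA().setK(3).setOptimizer("online").run(corpus).asInstanceOf[LocalLDAModel]

  // Batch path: one RDD job over all documents.
  val batch: RDD[(Long, Vector)] = model.topicDistributions(corpus)

  // New single-document path: no RDD operations, suitable for low-latency queries.
  val doc: Vector = Vectors.sparse(model.vocabSize, Seq((0, 1.0), (4, 2.0)))
  val theta: Vector = model.topicDistribution(doc) // length-k topic mixture ("theta")

  // Per the javadoc above, an empty document yields a vector of zeros.
  val empty = model.topicDistribution(Vectors.sparse(model.vocabSize, Seq()))
  assert(empty.toArray.forall(_ == 0.0))
}
```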
spark git commit: [SPARK-12758][SQL] add note to Spark SQL Migration guide about TimestampType casting
Repository: spark Updated Branches: refs/heads/master a44991453 -> a767ee8a0 [SPARK-12758][SQL] add note to Spark SQL Migration guide about TimestampType casting Warns users about the casting change. Author: Brandon Bradley. Closes #10708 from blbradley/spark-12758. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a767ee8a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a767ee8a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a767ee8a Branch: refs/heads/master Commit: a767ee8a0599f5482717493a3298413c65d8ff89 Parents: a449914 Author: Brandon Bradley Authored: Mon Jan 11 14:21:50 2016 -0800 Committer: Michael Armbrust Committed: Mon Jan 11 14:21:50 2016 -0800 -- docs/sql-programming-guide.md | 5 + 1 file changed, 5 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a767ee8a/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index b058833..bc89c78 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -2151,6 +2151,11 @@ options. ... {% endhighlight %} + - From Spark 1.6, LongType casts to TimestampType expect seconds instead of microseconds. This + change was made to match the behavior of Hive 1.2 for more consistent type casting to TimestampType + from numeric types. See [SPARK-11724](https://issues.apache.org/jira/browse/SPARK-11724) for + details. + ## Upgrading From Spark SQL 1.4 to 1.5 - Optimized execution using manually managed memory (Tungsten) is now enabled by default, along with
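Editor's note: a short sketch of the behavior change described in the new migration-guide entry. The displayed values assume a UTC session timezone, and the `events` / `micros_col` names are hypothetical.

```
// Spark 1.5 interpreted the long as microseconds since the epoch; Spark 1.6 interprets it
// as seconds, matching Hive 1.2.
sqlContext.sql("SELECT CAST(1451606400 AS TIMESTAMP)").show()
// 1.6+: 2016-01-01 00:00:00      (1451606400 treated as seconds)
// 1.5 : 1970-01-01 00:24:11.6064 (1451606400 treated as microseconds)

// Code that previously stored microseconds in a LongType column now needs an explicit division:
sqlContext.sql("SELECT CAST(micros_col / 1000000 AS TIMESTAMP) FROM events")
```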
spark git commit: [SPARK-12576][SQL] Enable expression parsing in CatalystQl
Repository: spark Updated Branches: refs/heads/master bbea88852 -> fe9eb0b0c [SPARK-12576][SQL] Enable expression parsing in CatalystQl The PR allows us to use the new SQL parser to parse SQL expressions such as: ```1 + sin(x*x)``` We enable this functionality in this PR, but we will not start using this actively yet. This will be done as soon as we have reached grammar parity with the existing parser stack. cc rxin Author: Herman van HovellCloses #10649 from hvanhovell/SPARK-12576. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fe9eb0b0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fe9eb0b0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fe9eb0b0 Branch: refs/heads/master Commit: fe9eb0b0ce397aeb40a32f8231d2ce8c17d7a609 Parents: bbea888 Author: Herman van Hovell Authored: Mon Jan 11 16:29:37 2016 -0800 Committer: Reynold Xin Committed: Mon Jan 11 16:29:37 2016 -0800 -- .../sql/catalyst/parser/SelectClauseParser.g| 7 + .../apache/spark/sql/catalyst/CatalystQl.scala | 59 +--- .../spark/sql/catalyst/parser/ParseDriver.scala | 24 ++- .../spark/sql/catalyst/CatalystQlSuite.scala| 151 +-- .../spark/sql/hive/ExtendedHiveQlParser.scala | 2 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 4 +- .../org/apache/spark/sql/hive/HiveQl.scala | 19 +-- .../spark/sql/hive/ErrorPositionSuite.scala | 5 +- .../org/apache/spark/sql/hive/HiveQlSuite.scala | 2 +- 9 files changed, 217 insertions(+), 56 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fe9eb0b0/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SelectClauseParser.g -- diff --git a/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SelectClauseParser.g b/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SelectClauseParser.g index 2d2bafb..f18b6ec 100644 --- a/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SelectClauseParser.g +++ b/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SelectClauseParser.g @@ -131,6 +131,13 @@ selectItem : (tableAllColumns) => tableAllColumns -> ^(TOK_SELEXPR tableAllColumns) | +namedExpression +; + +namedExpression +@init { gParent.pushMsg("select named expression", state); } +@after { gParent.popMsg(state); } +: ( expression ((KW_AS? identifier) | (KW_AS LPAREN identifier (COMMA identifier)* RPAREN))? ) -> ^(TOK_SELEXPR expression identifier*) http://git-wip-us.apache.org/repos/asf/spark/blob/fe9eb0b0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystQl.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystQl.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystQl.scala index 2e3cc0b..c87b6c8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystQl.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystQl.scala @@ -30,6 +30,12 @@ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.random.RandomSampler +private[sql] object CatalystQl { + val parser = new CatalystQl + def parseExpression(sql: String): Expression = parser.parseExpression(sql) + def parseTableIdentifier(sql: String): TableIdentifier = parser.parseTableIdentifier(sql) +} + /** * This class translates a HQL String to a Catalyst [[LogicalPlan]] or [[Expression]]. 
*/ @@ -41,16 +47,13 @@ private[sql] class CatalystQl(val conf: ParserConf = SimpleParserConf()) { } } - /** - * Returns the AST for the given SQL string. + * The safeParse method allows a user to focus on the parsing/AST transformation logic. This + * method will take care of possible errors during the parsing process. */ - protected def getAst(sql: String): ASTNode = ParseDriver.parse(sql, conf) - - /** Creates LogicalPlan for a given HiveQL string. */ - def createPlan(sql: String): LogicalPlan = { + protected def safeParse[T](sql: String, ast: ASTNode)(toResult: ASTNode => T): T = { try { - createPlan(sql, ParseDriver.parse(sql, conf)) + toResult(ast) } catch { case e: MatchError => throw e case e: AnalysisException => throw e @@ -58,26 +61,39 @@ private[sql] class CatalystQl(val conf: ParserConf = SimpleParserConf()) { throw new AnalysisException(e.getMessage) case e: NotImplementedError => throw new AnalysisException( - s""" - |Unsupported language
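Editor's note: a rough sketch of what the newly exposed expression parsing looks like. `CatalystQl` is `private[sql]`, so code like this would have to live under the `org.apache.spark.sql` package (as `CatalystQlSuite` does); the class and method names below mirror the diff, everything else is assumption.

```
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.CatalystQl
import org.apache.spark.sql.catalyst.expressions.Expression

object ExpressionParsingSketch {
  def main(args: Array[String]): Unit = {
    val parser = new CatalystQl()
    // Parses a standalone SQL expression into a Catalyst Expression tree
    // (roughly: an Add of a literal and an unresolved call to sin).
    val expr: Expression = parser.parseExpression("1 + sin(x * x)")
    println(expr.treeString)
  }
}
```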
spark git commit: [SPARK-12758][SQL] add note to Spark SQL Migration guide about TimestampType casting
Repository: spark Updated Branches: refs/heads/branch-1.6 3b32aa9e2 -> dd2cf64f3 [SPARK-12758][SQL] add note to Spark SQL Migration guide about TimestampType casting Warns users about the casting change. Author: Brandon Bradley. Closes #10708 from blbradley/spark-12758. (cherry picked from commit a767ee8a0599f5482717493a3298413c65d8ff89) Signed-off-by: Michael Armbrust Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dd2cf64f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dd2cf64f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dd2cf64f Branch: refs/heads/branch-1.6 Commit: dd2cf64f300ec42802dbea38b95047842de81870 Parents: 3b32aa9 Author: Brandon Bradley Authored: Mon Jan 11 14:21:50 2016 -0800 Committer: Michael Armbrust Committed: Mon Jan 11 14:22:15 2016 -0800 -- docs/sql-programming-guide.md | 5 + 1 file changed, 5 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dd2cf64f/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index b058833..bc89c78 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -2151,6 +2151,11 @@ options. ... {% endhighlight %} + - From Spark 1.6, LongType casts to TimestampType expect seconds instead of microseconds. This + change was made to match the behavior of Hive 1.2 for more consistent type casting to TimestampType + from numeric types. See [SPARK-11724](https://issues.apache.org/jira/browse/SPARK-11724) for + details. + ## Upgrading From Spark SQL 1.4 to 1.5 - Optimized execution using manually managed memory (Tungsten) is now enabled by default, along with
spark git commit: [SPARK-12685][MLLIB] word2vec trainWordsCount gets overflow
Repository: spark Updated Branches: refs/heads/master ee4ee02b8 -> 4f8eefa36 [SPARK-12685][MLLIB] word2vec trainWordsCount gets overflow jira: https://issues.apache.org/jira/browse/SPARK-12685 The log of `word2vec` reports trainWordsCount = -785727483 during computation over a large dataset. The issue's priority was raised because the overflow affects the computation of the learning rate: `alpha = learningRate * (1 - numPartitions * wordCount.toDouble / (trainWordsCount + 1))` Author: Yuhao Yang. Closes #10627 from hhbyyh/w2voverflow. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4f8eefa3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4f8eefa3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4f8eefa3 Branch: refs/heads/master Commit: 4f8eefa36bb90812aac61ac7a762c9452de666bf Parents: ee4ee02 Author: Yuhao Yang Authored: Mon Jan 11 14:48:35 2016 -0800 Committer: Joseph K. Bradley Committed: Mon Jan 11 14:48:35 2016 -0800 -- .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 8 1 file changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4f8eefa3/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index a7e1b76..dc5d070 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -151,7 +151,7 @@ class Word2Vec extends Serializable with Logging { /** context words from [-window, window] */ private var window = 5 - private var trainWordsCount = 0 + private var trainWordsCount = 0L private var vocabSize = 0 @transient private var vocab: Array[VocabWord] = null @transient private var vocabHash = mutable.HashMap.empty[String, Int] @@ -159,13 +159,13 @@ class Word2Vec extends Serializable with Logging { private def learnVocab(words: RDD[String]): Unit = { vocab = words.map(w => (w, 1)) .reduceByKey(_ + _) + .filter(_._2 >= minCount) .map(x => VocabWord( x._1, x._2, new Array[Int](MAX_CODE_LENGTH), new Array[Int](MAX_CODE_LENGTH), 0)) - .filter(_.cn >= minCount) .collect() .sortWith((a, b) => a.cn > b.cn) @@ -179,7 +179,7 @@ class Word2Vec extends Serializable with Logging { trainWordsCount += vocab(a).cn a += 1 } -logInfo("trainWordsCount = " + trainWordsCount) +logInfo(s"vocabSize = $vocabSize, trainWordsCount = $trainWordsCount") } private def createExpTable(): Array[Float] = { @@ -332,7 +332,7 @@ class Word2Vec extends Serializable with Logging { val random = new XORShiftRandom(seed ^ ((idx + 1) << 16) ^ ((-k - 1) << 8)) val syn0Modify = new Array[Int](vocabSize) val syn1Modify = new Array[Int](vocabSize) -val model = iter.foldLeft((bcSyn0Global.value, bcSyn1Global.value, 0, 0)) { +val model = iter.foldLeft((bcSyn0Global.value, bcSyn1Global.value, 0L, 0L)) { case ((syn0, syn1, lastWordCount, wordCount), sentence) => var lwc = lastWordCount var wc = wordCount
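Editor's note: a self-contained illustration of the Int overflow fixed above; the token counts are made up, and the formula is the one quoted in the commit message.

```
object TrainWordsCountOverflow {
  def main(args: Array[String]): Unit = {
    val countsPerWord = Array.fill(200)(15000000) // 3 billion tokens in total

    // Before the patch: accumulating into an Int wraps around and goes negative, which corrupts
    // alpha = learningRate * (1 - numPartitions * wordCount.toDouble / (trainWordsCount + 1)).
    var trainWordsCountInt: Int = 0
    countsPerWord.foreach(c => trainWordsCountInt += c)
    println(trainWordsCountInt)  // -1294967296

    // After the patch: a Long accumulator holds the true total.
    var trainWordsCountLong: Long = 0L
    countsPerWord.foreach(c => trainWordsCountLong += c)
    println(trainWordsCountLong) // 3000000000
  }
}
```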
spark git commit: [SPARK-12603][MLLIB] PySpark MLlib GaussianMixtureModel should support single instance predict/predictSoft
Repository: spark Updated Branches: refs/heads/master a767ee8a0 -> ee4ee02b8 [SPARK-12603][MLLIB] PySpark MLlib GaussianMixtureModel should support single instance predict/predictSoft PySpark MLlib ```GaussianMixtureModel``` should support single instance ```predict/predictSoft``` just like Scala do. Author: Yanbo LiangCloses #10552 from yanboliang/spark-12603. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ee4ee02b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ee4ee02b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ee4ee02b Branch: refs/heads/master Commit: ee4ee02b86be8756a6d895a2e23e80862134a6d3 Parents: a767ee8 Author: Yanbo Liang Authored: Mon Jan 11 14:43:25 2016 -0800 Committer: Joseph K. Bradley Committed: Mon Jan 11 14:43:25 2016 -0800 -- .../main/python/mllib/gaussian_mixture_model.py | 4 +++ .../examples/mllib/DenseGaussianMixture.scala | 6 .../python/GaussianMixtureModelWrapper.scala| 4 +++ .../mllib/clustering/GaussianMixtureModel.scala | 2 +- python/pyspark/mllib/clustering.py | 35 5 files changed, 37 insertions(+), 14 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ee4ee02b/examples/src/main/python/mllib/gaussian_mixture_model.py -- diff --git a/examples/src/main/python/mllib/gaussian_mixture_model.py b/examples/src/main/python/mllib/gaussian_mixture_model.py index 2cb8010..69e836f 100644 --- a/examples/src/main/python/mllib/gaussian_mixture_model.py +++ b/examples/src/main/python/mllib/gaussian_mixture_model.py @@ -62,5 +62,9 @@ if __name__ == "__main__": for i in range(args.k): print(("weight = ", model.weights[i], "mu = ", model.gaussians[i].mu, "sigma = ", model.gaussians[i].sigma.toArray())) +print("\n") +print(("The membership value of each vector to all mixture components (first 100): ", + model.predictSoft(data).take(100))) +print("\n") print(("Cluster labels (first 100): ", model.predict(data).take(100))) sc.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/ee4ee02b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala index 1fce4ba..90b817b 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala @@ -58,6 +58,12 @@ object DenseGaussianMixture { (clusters.weights(i), clusters.gaussians(i).mu, clusters.gaussians(i).sigma)) } +println("The membership value of each vector to all mixture components (first <= 100):") +val membership = clusters.predictSoft(data) +membership.take(100).foreach { x => + print(" " + x.mkString(",")) +} +println() println("Cluster labels (first <= 100):") val clusterLabels = clusters.predict(data) clusterLabels.take(100).foreach { x => http://git-wip-us.apache.org/repos/asf/spark/blob/ee4ee02b/mllib/src/main/scala/org/apache/spark/mllib/api/python/GaussianMixtureModelWrapper.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/GaussianMixtureModelWrapper.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/GaussianMixtureModelWrapper.scala index 6a3b20c..a689b09 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/GaussianMixtureModelWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/GaussianMixtureModelWrapper.scala @@ 
-40,5 +40,9 @@ private[python] class GaussianMixtureModelWrapper(model: GaussianMixtureModel) { SerDe.dumps(JavaConverters.seqAsJavaListConverter(modelGaussians).asJava) } + def predictSoft(point: Vector): Vector = { +Vectors.dense(model.predictSoft(point)) + } + def save(sc: SparkContext, path: String): Unit = model.save(sc, path) } http://git-wip-us.apache.org/repos/asf/spark/blob/ee4ee02b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala index 16bc45b..42fe270 100644 ---
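Editor's note: the PySpark change exposes the single-point API already provided by the Scala `GaussianMixtureModel` (the wrapper above simply forwards to `model.predictSoft(point)`). Below is a Scala sketch of that single-instance usage with made-up data and parameters; the Python equivalents added by this patch are `model.predict(vector)` and `model.predictSoft(vector)`.

```
import org.apache.spark.mllib.clustering.{GaussianMixture, GaussianMixtureModel}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD

// data: RDD[Vector] of feature vectors, assumed to exist already.
def singlePointSketch(data: RDD[Vector]): Unit = {
  val model: GaussianMixtureModel = new GaussianMixture().setK(2).run(data)

  val point: Vector = Vectors.dense(0.1, 0.9)
  val cluster: Int = model.predict(point)                   // hard assignment for one vector
  val membership: Array[Double] = model.predictSoft(point)  // per-component probabilities

  println(s"cluster = $cluster, membership = ${membership.mkString(",")}")
}
```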
spark git commit: [SPARK-12734][HOTFIX][TEST-MAVEN] Fix bug in Netty exclusions
Repository: spark Updated Branches: refs/heads/master 008a55828 -> f13c7f8f7 [SPARK-12734][HOTFIX][TEST-MAVEN] Fix bug in Netty exclusions This is a hotfix for a build bug introduced by the Netty exclusion changes in #10672. We can't exclude `io.netty:netty` because Akka depends on it. There's not a direct conflict between `io.netty:netty` and `io.netty:netty-all`, because the former puts classes in the `org.jboss.netty` namespace while the latter uses the `io.netty` namespace. However, there still is a conflict between `org.jboss.netty:netty` and `io.netty:netty`, so we need to continue to exclude the JBoss version of that artifact. While the diff here looks somewhat large, note that this is only a revert of a some of the changes from #10672. You can see the net changes in pom.xml at https://github.com/apache/spark/compare/3119206b7188c23055621dfeaf6874f21c711a82...5211ab8#diff-600376dffeb79835ede4a0b285078036 Author: Josh RosenCloses #10693 from JoshRosen/netty-hotfix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f13c7f8f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f13c7f8f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f13c7f8f Branch: refs/heads/master Commit: f13c7f8f7dc8766b0a42406b5c3639d6be55cf33 Parents: 008a558 Author: Josh Rosen Authored: Mon Jan 11 00:31:29 2016 -0800 Committer: Josh Rosen Committed: Mon Jan 11 00:31:29 2016 -0800 -- dev/deps/spark-deps-hadoop-2.2 | 1 + dev/deps/spark-deps-hadoop-2.3 | 1 + dev/deps/spark-deps-hadoop-2.4 | 1 + dev/deps/spark-deps-hadoop-2.6 | 1 + examples/pom.xml | 4 --- pom.xml| 50 ++--- 6 files changed, 11 insertions(+), 47 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f13c7f8f/dev/deps/spark-deps-hadoop-2.2 -- diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index 13d1b0e..e4373f7 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -142,6 +142,7 @@ metrics-graphite-3.1.2.jar metrics-json-3.1.2.jar metrics-jvm-3.1.2.jar minlog-1.2.jar +netty-3.8.0.Final.jar netty-all-4.0.29.Final.jar objenesis-1.2.jar opencsv-2.3.jar http://git-wip-us.apache.org/repos/asf/spark/blob/f13c7f8f/dev/deps/spark-deps-hadoop-2.3 -- diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index d7deaa0..7478181 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -133,6 +133,7 @@ metrics-json-3.1.2.jar metrics-jvm-3.1.2.jar minlog-1.2.jar mx4j-3.0.2.jar +netty-3.8.0.Final.jar netty-all-4.0.29.Final.jar objenesis-1.2.jar opencsv-2.3.jar http://git-wip-us.apache.org/repos/asf/spark/blob/f13c7f8f/dev/deps/spark-deps-hadoop-2.4 -- diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index 7ad2212..faffb8b 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -134,6 +134,7 @@ metrics-json-3.1.2.jar metrics-jvm-3.1.2.jar minlog-1.2.jar mx4j-3.0.2.jar +netty-3.8.0.Final.jar netty-all-4.0.29.Final.jar objenesis-1.2.jar opencsv-2.3.jar http://git-wip-us.apache.org/repos/asf/spark/blob/f13c7f8f/dev/deps/spark-deps-hadoop-2.6 -- diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 7f85189..e703c7a 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -140,6 +140,7 @@ metrics-json-3.1.2.jar metrics-jvm-3.1.2.jar minlog-1.2.jar mx4j-3.0.2.jar +netty-3.8.0.Final.jar netty-all-4.0.29.Final.jar objenesis-1.2.jar opencsv-2.3.jar 
http://git-wip-us.apache.org/repos/asf/spark/blob/f13c7f8f/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index 6013085..1a0d5e5 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -111,10 +111,6 @@ org.jruby jruby-complete - - io.netty - netty - http://git-wip-us.apache.org/repos/asf/spark/blob/f13c7f8f/pom.xml -- diff --git a/pom.xml b/pom.xml index cbed36c..06cccf1 100644 --- a/pom.xml +++ b/pom.xml @@ -519,12 +519,6 @@ ${akka.group} akka-remote_${scala.binary.version} ${akka.version} - - -io.netty -netty -
spark git commit: [SPARK-12539][FOLLOW-UP] always sort in partitioning writer
Repository: spark Updated Branches: refs/heads/master f13c7f8f7 -> f253feff6 [SPARK-12539][FOLLOW-UP] always sort in partitioning writer address comments in #10498 , especially https://github.com/apache/spark/pull/10498#discussion_r49021259 Author: Wenchen FanThis patch had conflicts when merged, resolved by Committer: Reynold Xin Closes #10638 from cloud-fan/bucket-write. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f253feff Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f253feff Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f253feff Branch: refs/heads/master Commit: f253feff62f3eb3cce22bbec0874f317a61b0092 Parents: f13c7f8 Author: Wenchen Fan Authored: Mon Jan 11 00:44:33 2016 -0800 Committer: Reynold Xin Committed: Mon Jan 11 00:44:33 2016 -0800 -- .../execution/datasources/WriterContainer.scala | 192 +-- .../apache/spark/sql/sources/interfaces.scala | 3 - 2 files changed, 48 insertions(+), 147 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f253feff/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala index 40ecdb8..fff7287 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.UnsafeKVExternalSorter import org.apache.spark.sql.sources.{HadoopFsRelation, OutputWriter, OutputWriterFactory} -import org.apache.spark.sql.types.{IntegerType, StringType, StructType} +import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import org.apache.spark.util.SerializableConfiguration @@ -349,67 +349,6 @@ private[sql] class DynamicPartitionWriterContainer( } } - private def sameBucket(key1: UnsafeRow, key2: UnsafeRow): Boolean = { -val bucketIdIndex = partitionColumns.length -if (key1.getInt(bucketIdIndex) != key2.getInt(bucketIdIndex)) { - false -} else { - var i = partitionColumns.length - 1 - while (i >= 0) { -val dt = partitionColumns(i).dataType -if (key1.get(i, dt) != key2.get(i, dt)) return false -i -= 1 - } - true -} - } - - private def sortBasedWrite( - sorter: UnsafeKVExternalSorter, - iterator: Iterator[InternalRow], - getSortingKey: UnsafeProjection, - getOutputRow: UnsafeProjection, - getPartitionString: UnsafeProjection, - outputWriters: java.util.HashMap[InternalRow, OutputWriter]): Unit = { -while (iterator.hasNext) { - val currentRow = iterator.next() - sorter.insertKV(getSortingKey(currentRow), getOutputRow(currentRow)) -} - -logInfo(s"Sorting complete. 
Writing out partition files one at a time.") - -val needNewWriter: (UnsafeRow, UnsafeRow) => Boolean = if (sortColumns.isEmpty) { - (key1, key2) => key1 != key2 -} else { - (key1, key2) => key1 == null || !sameBucket(key1, key2) -} - -val sortedIterator = sorter.sortedIterator() -var currentKey: UnsafeRow = null -var currentWriter: OutputWriter = null -try { - while (sortedIterator.next()) { -if (needNewWriter(currentKey, sortedIterator.getKey)) { - if (currentWriter != null) { -currentWriter.close() - } - currentKey = sortedIterator.getKey.copy() - logDebug(s"Writing partition: $currentKey") - - // Either use an existing file from before, or open a new one. - currentWriter = outputWriters.remove(currentKey) - if (currentWriter == null) { -currentWriter = newOutputWriter(currentKey, getPartitionString) - } -} - -currentWriter.writeInternal(sortedIterator.getValue) - } -} finally { - if (currentWriter != null) { currentWriter.close() } -} - } - /** * Open and returns a new OutputWriter given a partition key and optional bucket id. * If bucket id is specified, we will append it to the end of the file name, but before the @@ -435,22 +374,18 @@ private[sql] class DynamicPartitionWriterContainer( } def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit = { -val outputWriters = new
spark git commit: [SPARK-12269][STREAMING][KINESIS] Update aws-java-sdk version
Repository: spark Updated Branches: refs/heads/master bd723bd53 -> 8fe928b4f [SPARK-12269][STREAMING][KINESIS] Update aws-java-sdk version The current Spark Streaming kinesis connector references a quite old version 1.9.40 of the AWS Java SDK (1.10.40 is current). Numerous AWS features including Kinesis Firehose are unavailable in 1.9. Those two versions of the AWS SDK in turn require conflicting versions of Jackson (2.4.4 and 2.5.3 respectively) such that one cannot include the current AWS SDK in a project that also uses the Spark Streaming Kinesis ASL. Author: BrianLondonCloses #10256 from BrianLondon/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8fe928b4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8fe928b4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8fe928b4 Branch: refs/heads/master Commit: 8fe928b4fe380ba527164bd413402abfed13c0e1 Parents: bd723bd Author: BrianLondon Authored: Mon Jan 11 09:32:06 2016 + Committer: Sean Owen Committed: Mon Jan 11 09:32:06 2016 + -- dev/deps/spark-deps-hadoop-2.2 | 8 dev/deps/spark-deps-hadoop-2.3 | 8 dev/deps/spark-deps-hadoop-2.4 | 8 dev/deps/spark-deps-hadoop-2.6 | 8 pom.xml| 6 +++--- 5 files changed, 19 insertions(+), 19 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8fe928b4/dev/deps/spark-deps-hadoop-2.2 -- diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index e4373f7..cd3ff29 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -84,13 +84,13 @@ hadoop-yarn-server-web-proxy-2.2.0.jar httpclient-4.3.2.jar httpcore-4.3.2.jar ivy-2.4.0.jar -jackson-annotations-2.4.4.jar -jackson-core-2.4.4.jar +jackson-annotations-2.5.3.jar +jackson-core-2.5.3.jar jackson-core-asl-1.9.13.jar -jackson-databind-2.4.4.jar +jackson-databind-2.5.3.jar jackson-jaxrs-1.9.13.jar jackson-mapper-asl-1.9.13.jar -jackson-module-scala_2.10-2.4.4.jar +jackson-module-scala_2.10-2.5.3.jar jackson-xc-1.9.13.jar janino-2.7.8.jar jansi-1.4.jar http://git-wip-us.apache.org/repos/asf/spark/blob/8fe928b4/dev/deps/spark-deps-hadoop-2.3 -- diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index 7478181..0985089 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -79,13 +79,13 @@ hadoop-yarn-server-web-proxy-2.3.0.jar httpclient-4.3.2.jar httpcore-4.3.2.jar ivy-2.4.0.jar -jackson-annotations-2.4.4.jar -jackson-core-2.4.4.jar +jackson-annotations-2.5.3.jar +jackson-core-2.5.3.jar jackson-core-asl-1.9.13.jar -jackson-databind-2.4.4.jar +jackson-databind-2.5.3.jar jackson-jaxrs-1.9.13.jar jackson-mapper-asl-1.9.13.jar -jackson-module-scala_2.10-2.4.4.jar +jackson-module-scala_2.10-2.5.3.jar jackson-xc-1.9.13.jar janino-2.7.8.jar jansi-1.4.jar http://git-wip-us.apache.org/repos/asf/spark/blob/8fe928b4/dev/deps/spark-deps-hadoop-2.4 -- diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index faffb8b..50f0626 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -79,13 +79,13 @@ hadoop-yarn-server-web-proxy-2.4.0.jar httpclient-4.3.2.jar httpcore-4.3.2.jar ivy-2.4.0.jar -jackson-annotations-2.4.4.jar -jackson-core-2.4.4.jar +jackson-annotations-2.5.3.jar +jackson-core-2.5.3.jar jackson-core-asl-1.9.13.jar -jackson-databind-2.4.4.jar +jackson-databind-2.5.3.jar jackson-jaxrs-1.9.13.jar jackson-mapper-asl-1.9.13.jar -jackson-module-scala_2.10-2.4.4.jar +jackson-module-scala_2.10-2.5.3.jar 
jackson-xc-1.9.13.jar janino-2.7.8.jar jansi-1.4.jar http://git-wip-us.apache.org/repos/asf/spark/blob/8fe928b4/dev/deps/spark-deps-hadoop-2.6 -- diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index e703c7a..2b6ca98 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -85,13 +85,13 @@ htrace-core-3.0.4.jar httpclient-4.3.2.jar httpcore-4.3.2.jar ivy-2.4.0.jar -jackson-annotations-2.4.4.jar -jackson-core-2.4.4.jar +jackson-annotations-2.5.3.jar +jackson-core-2.5.3.jar jackson-core-asl-1.9.13.jar -jackson-databind-2.4.4.jar +jackson-databind-2.5.3.jar jackson-jaxrs-1.9.13.jar jackson-mapper-asl-1.9.13.jar -jackson-module-scala_2.10-2.4.4.jar +jackson-module-scala_2.10-2.5.3.jar jackson-xc-1.9.13.jar janino-2.7.8.jar jansi-1.4.jar
spark git commit: removed lambda from sortByKey()
Repository: spark Updated Branches: refs/heads/branch-1.6 43b72d83e -> d4cfd2acd removed lambda from sortByKey() According to the documentation, the sortByKey method does not take a lambda as an argument, so the example is flawed. Removed the argument completely, as this will default to an ascending sort. Author: Udo Klein. Closes #10640 from udoklein/patch-1. (cherry picked from commit bd723bd53d9a28239b60939a248a4ea13340aad8) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d4cfd2ac Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d4cfd2ac Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d4cfd2ac Branch: refs/heads/branch-1.6 Commit: d4cfd2acd62f2b0638a1248a38263c04eaf8 Parents: 43b72d8 Author: Udo Klein Authored: Mon Jan 11 09:30:08 2016 + Committer: Sean Owen Committed: Mon Jan 11 09:30:27 2016 + -- examples/src/main/python/sort.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d4cfd2ac/examples/src/main/python/sort.py -- diff --git a/examples/src/main/python/sort.py b/examples/src/main/python/sort.py index f6b0ecb..b6c2916 100755 --- a/examples/src/main/python/sort.py +++ b/examples/src/main/python/sort.py @@ -30,7 +30,7 @@ if __name__ == "__main__": lines = sc.textFile(sys.argv[1], 1) sortedCount = lines.flatMap(lambda x: x.split(' ')) \ .map(lambda x: (int(x), 1)) \ -.sortByKey(lambda x: x) +.sortByKey() # This is just a demo on how to bring all the sorted data back to a single node. # In reality, we wouldn't want to collect all the data to the driver node. output = sortedCount.collect()
spark git commit: [SPARK-12734][BUILD] Backport Netty exclusion + Maven enforcer fixes to branch-1.5
Repository: spark Updated Branches: refs/heads/branch-1.5 de7194a0d -> 665aa47f8 [SPARK-12734][BUILD] Backport Netty exclusion + Maven enforcer fixes to branch-1.5 This patch backports the Netty exclusion fixes from #10672 to branch-1.5. Author: Josh RosenCloses #10690 from JoshRosen/netty-exclude-15-backport. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/665aa47f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/665aa47f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/665aa47f Branch: refs/heads/branch-1.5 Commit: 665aa47f840ef392ccf3dcd23e7fd93987f7769a Parents: de7194a Author: Josh Rosen Authored: Mon Jan 11 00:51:44 2016 -0800 Committer: Josh Rosen Committed: Mon Jan 11 00:51:44 2016 -0800 -- dev/test-dependencies.sh | 16 pom.xml | 21 - 2 files changed, 24 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/665aa47f/dev/test-dependencies.sh -- diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh index 129741f..47ae15b 100755 --- a/dev/test-dependencies.sh +++ b/dev/test-dependencies.sh @@ -76,18 +76,10 @@ for HADOOP_PROFILE in "${HADOOP_PROFILES[@]}"; do HADOOP_MODULE_PROFILES="-Phive-thriftserver -Pyarn -Phive" fi echo "Performing Maven install for $HADOOP_PROFILE" - $MVN $HADOOP_MODULE_PROFILES -P$HADOOP_PROFILE jar:jar install:install -q \ --pl '!assembly' \ --pl '!examples' \ --pl '!external/flume-assembly' \ --pl '!external/kafka-assembly' \ --pl '!external/twitter' \ --pl '!external/flume' \ --pl '!external/mqtt' \ --pl '!external/mqtt-assembly' \ --pl '!external/zeromq' \ --pl '!external/kafka' \ --DskipTests + $MVN $HADOOP_MODULE_PROFILES -P$HADOOP_PROFILE jar:jar jar:test-jar install:install -q + + echo "Performing Maven validate for $HADOOP_PROFILE" + $MVN $HADOOP_MODULE_PROFILES -P$HADOOP_PROFILE validate -q echo "Generating dependency manifest for $HADOOP_PROFILE" mkdir -p dev/pr-deps http://git-wip-us.apache.org/repos/asf/spark/blob/665aa47f/pom.xml -- diff --git a/pom.xml b/pom.xml index c751298..ef00c70 100644 --- a/pom.xml +++ b/pom.xml @@ -1058,6 +1058,12 @@ zookeeper ${zookeeper.version} ${hadoop.deps.scope} + + +org.jboss.netty +netty + + org.codehaus.jackson @@ -1774,7 +1780,7 @@ org.apache.maven.plugins maven-enforcer-plugin - 1.4 + 1.4.1 enforce-versions @@ -1789,6 +1795,19 @@ ${java.version} + + + + org.jboss.netty + +true + - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: removed lambda from sortByKey()
Repository: spark Updated Branches: refs/heads/branch-1.5 665aa47f8 -> 0e2aa4198 removed lambda from sortByKey() According to the documentation, the sortByKey method does not take a lambda as an argument, so the example is flawed. Removed the argument completely, as this will default to an ascending sort. Author: Udo Klein. Closes #10640 from udoklein/patch-1. (cherry picked from commit bd723bd53d9a28239b60939a248a4ea13340aad8) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0e2aa419 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0e2aa419 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0e2aa419 Branch: refs/heads/branch-1.5 Commit: 0e2aa41988c4ae8391b48b0902badf0cda188dcc Parents: 665aa47 Author: Udo Klein Authored: Mon Jan 11 09:30:08 2016 + Committer: Sean Owen Committed: Mon Jan 11 09:30:40 2016 + -- examples/src/main/python/sort.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0e2aa419/examples/src/main/python/sort.py -- diff --git a/examples/src/main/python/sort.py b/examples/src/main/python/sort.py index f6b0ecb..b6c2916 100755 --- a/examples/src/main/python/sort.py +++ b/examples/src/main/python/sort.py @@ -30,7 +30,7 @@ if __name__ == "__main__": lines = sc.textFile(sys.argv[1], 1) sortedCount = lines.flatMap(lambda x: x.split(' ')) \ .map(lambda x: (int(x), 1)) \ -.sortByKey(lambda x: x) +.sortByKey() # This is just a demo on how to bring all the sorted data back to a single node. # In reality, we wouldn't want to collect all the data to the driver node. output = sortedCount.collect()
spark git commit: removed lambda from sortByKey()
Repository: spark Updated Branches: refs/heads/master f253feff6 -> bd723bd53 removed lambda from sortByKey() According to the documentation, the sortByKey method does not take a lambda as an argument, so the example is flawed. Removed the argument completely, as this will default to an ascending sort. Author: Udo Klein. Closes #10640 from udoklein/patch-1. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bd723bd5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bd723bd5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bd723bd5 Branch: refs/heads/master Commit: bd723bd53d9a28239b60939a248a4ea13340aad8 Parents: f253fef Author: Udo Klein Authored: Mon Jan 11 09:30:08 2016 + Committer: Sean Owen Committed: Mon Jan 11 09:30:08 2016 + -- examples/src/main/python/sort.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bd723bd5/examples/src/main/python/sort.py -- diff --git a/examples/src/main/python/sort.py b/examples/src/main/python/sort.py index f6b0ecb..b6c2916 100755 --- a/examples/src/main/python/sort.py +++ b/examples/src/main/python/sort.py @@ -30,7 +30,7 @@ if __name__ == "__main__": lines = sc.textFile(sys.argv[1], 1) sortedCount = lines.flatMap(lambda x: x.split(' ')) \ .map(lambda x: (int(x), 1)) \ -.sortByKey(lambda x: x) +.sortByKey() # This is just a demo on how to bring all the sorted data back to a single node. # In reality, we wouldn't want to collect all the data to the driver node. output = sortedCount.collect()
spark git commit: [SPARK-12742][SQL] org.apache.spark.sql.hive.LogicalPlanToSQLSuite failure due to Table already exists exception
Repository: spark Updated Branches: refs/heads/master fe9eb0b0c -> 473907adf [SPARK-12742][SQL] org.apache.spark.sql.hive.LogicalPlanToSQLSuite failure due to Table already exists exception ``` [info] Exception encountered when attempting to run a suite with class name: org.apache.spark.sql.hive.LogicalPlanToSQLSuite *** ABORTED *** (325 milliseconds) [info] org.apache.spark.sql.AnalysisException: Table `t1` already exists.; [info] at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:296) [info] at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:285) [info] at org.apache.spark.sql.hive.LogicalPlanToSQLSuite.beforeAll(LogicalPlanToSQLSuite.scala:33) [info] at org.scalatest.BeforeAndAfterAll$class.beforeAll(BeforeAndAfterAll.scala:187) [info] at org.apache.spark.sql.hive.LogicalPlanToSQLSuite.beforeAll(LogicalPlanToSQLSuite.scala:23) [info] at org.scalatest.BeforeAndAfterAll$class.run(BeforeAndAfterAll.scala:253) [info] at org.apache.spark.sql.hive.LogicalPlanToSQLSuite.run(LogicalPlanToSQLSuite.scala:23) [info] at org.scalatest.tools.Framework.org$scalatest$tools$Framework$$runSuite(Framework.scala:462) [info] at org.scalatest.tools.Framework$ScalaTestTask.execute(Framework.scala:671) [info] at sbt.ForkMain$Run$2.call(ForkMain.java:296) [info] at sbt.ForkMain$Run$2.call(ForkMain.java:286) [info] at java.util.concurrent.FutureTask.run(FutureTask.java:266) [info] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) [info] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) [info] at java.lang.Thread.run(Thread.java:745) ``` /cc liancheng Author: wangfeiCloses #10682 from scwf/fix-test. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/473907ad Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/473907ad Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/473907ad Branch: refs/heads/master Commit: 473907adf6e37855ee31d0703b43d7170e26b4b9 Parents: fe9eb0b Author: wangfei Authored: Mon Jan 11 18:18:44 2016 -0800 Committer: Cheng Lian Committed: Mon Jan 11 18:18:44 2016 -0800 -- .../scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala | 3 +++ 1 file changed, 3 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/473907ad/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala index 9a8a9c5..2ee8150 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala @@ -24,6 +24,9 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { import testImplicits._ protected override def beforeAll(): Unit = { +sql("DROP TABLE IF EXISTS t0") +sql("DROP TABLE IF EXISTS t1") +sql("DROP TABLE IF EXISTS t2") sqlContext.range(10).write.saveAsTable("t0") sqlContext - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11823] Ignores HiveThriftBinaryServerSuite's test jdbc cancel
Repository: spark Updated Branches: refs/heads/branch-1.6 dd2cf64f3 -> a6c9c68d8 [SPARK-11823] Ignores HiveThriftBinaryServerSuite's test jdbc cancel https://issues.apache.org/jira/browse/SPARK-11823 This test often hangs and times out, leaving hanging processes. Let's ignore it for now and improve the test. Author: Yin Huai. Closes #10715 from yhuai/SPARK-11823-ignore. (cherry picked from commit aaa2c3b628319178ca1f3f68966ff253c2de49cb) Signed-off-by: Josh Rosen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a6c9c68d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a6c9c68d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a6c9c68d Branch: refs/heads/branch-1.6 Commit: a6c9c68d8855e3a8bfc92f26b3877b92367087a4 Parents: dd2cf64 Author: Yin Huai Authored: Mon Jan 11 19:59:15 2016 -0800 Committer: Josh Rosen Committed: Mon Jan 11 19:59:37 2016 -0800 -- .../spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala| 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a6c9c68d/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala -- diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala index ebb2575..1bd6ec9 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala @@ -347,7 +347,9 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { ) } - test("test jdbc cancel") { + // This test often hangs and then times out, leaving the hanging processes. + // Let's ignore it and improve the test. + ignore("test jdbc cancel") { withJdbcStatement { statement => val queries = Seq( "DROP TABLE IF EXISTS test_map",
spark git commit: [SPARK-12498][SQL][MINOR] BooleanSimplication simplification
Repository: spark Updated Branches: refs/heads/master 473907adf -> 36d493509 [SPARK-12498][SQL][MINOR] BooleanSimplication simplification Scala syntax allows binary case classes to be used as infix operator in pattern matching. This PR makes use of this syntax sugar to make `BooleanSimplification` more readable. Author: Cheng LianCloses #10445 from liancheng/boolean-simplification-simplification. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/36d49350 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/36d49350 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/36d49350 Branch: refs/heads/master Commit: 36d493509d32d14b54af62f5f65e8fa750e7413d Parents: 473907a Author: Cheng Lian Authored: Mon Jan 11 18:42:26 2016 -0800 Committer: Reynold Xin Committed: Mon Jan 11 18:42:26 2016 -0800 -- .../sql/catalyst/expressions/literals.scala | 4 + .../sql/catalyst/optimizer/Optimizer.scala | 190 +-- 2 files changed, 92 insertions(+), 102 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/36d49350/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index 17351ef..e0b0203 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -28,6 +28,10 @@ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types._ object Literal { + val TrueLiteral: Literal = Literal(true, BooleanType) + + val FalseLiteral: Literal = Literal(false, BooleanType) + def apply(v: Any): Literal = v match { case i: Int => Literal(i, IntegerType) case l: Long => Literal(l, LongType) http://git-wip-us.apache.org/repos/asf/spark/blob/36d49350/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index f8121a7..b70bc18 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -21,6 +21,7 @@ import scala.collection.immutable.HashSet import org.apache.spark.sql.catalyst.analysis.{CleanupAliases, EliminateSubQueries} import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral} import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.planning.ExtractFiltersAndInnerJoins import org.apache.spark.sql.catalyst.plans.{FullOuter, Inner, LeftOuter, LeftSemi, RightOuter} @@ -519,112 +520,97 @@ object OptimizeIn extends Rule[LogicalPlan] { object BooleanSimplification extends Rule[LogicalPlan] with PredicateHelper { def apply(plan: LogicalPlan): LogicalPlan = plan transform { case q: LogicalPlan => q transformExpressionsUp { - case and @ And(left, right) => (left, right) match { -// true && r => r -case (Literal(true, BooleanType), r) => r -// l && true => l -case (l, Literal(true, BooleanType)) => l -// false && r => false -case (Literal(false, BooleanType), _) => Literal(false) -// l && false => false -case (_, Literal(false, BooleanType)) 
=> Literal(false) -// a && a => a -case (l, r) if l fastEquals r => l -// a && (not(a) || b) => a && b -case (l, Or(l1, r)) if (Not(l) == l1) => And(l, r) -case (l, Or(r, l1)) if (Not(l) == l1) => And(l, r) -case (Or(l, l1), r) if (l1 == Not(r)) => And(l, r) -case (Or(l1, l), r) if (l1 == Not(r)) => And(l, r) -// (a || b) && (a || c) => a || (b && c) -case _ => - // 1. Split left and right to get the disjunctive predicates, - // i.e. lhs = (a, b), rhs = (a, c) - // 2. Find the common predict between lhsSet and rhsSet, i.e. common = (a) - // 3. Remove common predict from lhsSet and rhsSet, i.e. ldiff = (b), rdiff = (c) - // 4. Apply the formula, get the optimized predicate: common || (ldiff && rdiff) - val lhs = splitDisjunctivePredicates(left) - val rhs =
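Editor's note: a self-contained illustration of the Scala feature the commit message relies on. A binary case class can be matched with infix syntax, so the optimizer can write patterns in the style of `TrueLiteral And right` instead of `And(TrueLiteral, right)`; the tiny expression type below is made up for the example.

```
sealed trait Expr
case class Lit(b: Boolean) extends Expr
case class And(left: Expr, right: Expr) extends Expr

object InfixMatchSketch {
  val TrueLit = Lit(true) // uppercase name => matched as a stable identifier, not bound

  def simplify(e: Expr): Expr = e match {
    case TrueLit And r => r // infix form, equivalent to case And(TrueLit, r)
    case l And TrueLit => l
    case other => other
  }

  def main(args: Array[String]): Unit = {
    println(simplify(And(Lit(true), Lit(false)))) // Lit(false)
    println(simplify(And(Lit(false), Lit(true)))) // Lit(false)
  }
}
```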
spark git commit: [SPARK-12692][BUILD][STREAMING] Scala style: Fix the style violation (Space before "," or ":")
Repository: spark Updated Branches: refs/heads/master aaa2c3b62 -> 39ae04e6b [SPARK-12692][BUILD][STREAMING] Scala style: Fix the style violation (Space before "," or ":") Fix the style violation (space before , and :). This PR is a followup for #10643. Author: Kousuke SarutaCloses #10685 from sarutak/SPARK-12692-followup-streaming. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/39ae04e6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/39ae04e6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/39ae04e6 Branch: refs/heads/master Commit: 39ae04e6b714e085a1341aa84d8fc5fc827d5f35 Parents: aaa2c3b Author: Kousuke Saruta Authored: Mon Jan 11 21:06:22 2016 -0800 Committer: Reynold Xin Committed: Mon Jan 11 21:06:22 2016 -0800 -- .../clickstream/PageViewGenerator.scala | 14 .../spark/streaming/flume/sink/Logging.scala| 8 ++--- .../streaming/flume/FlumeInputDStream.scala | 18 +- .../kafka/DirectKafkaInputDStream.scala | 4 +-- .../streaming/kafka/KafkaInputDStream.scala | 4 +-- .../kafka/ReliableKafkaStreamSuite.scala| 2 +- .../spark/streaming/mqtt/MQTTInputDStream.scala | 4 +-- .../streaming/twitter/TwitterInputDStream.scala | 4 +-- project/MimaExcludes.scala | 12 +++ .../org/apache/spark/streaming/Checkpoint.scala | 12 +++ .../spark/streaming/StreamingContext.scala | 36 ++-- .../streaming/api/java/JavaDStreamLike.scala| 2 +- .../dstream/ConstantInputDStream.scala | 4 +-- .../dstream/DStreamCheckpointData.scala | 2 +- .../streaming/dstream/FileInputDStream.scala| 18 +- .../spark/streaming/dstream/InputDStream.scala | 6 ++-- .../dstream/PluggableInputDStream.scala | 4 +-- .../streaming/dstream/RawInputDStream.scala | 4 +-- .../dstream/ReceiverInputDStream.scala | 6 ++-- .../streaming/dstream/SocketInputDStream.scala | 4 +-- .../spark/streaming/dstream/StateDStream.scala | 6 ++-- .../spark/streaming/receiver/Receiver.scala | 8 ++--- .../spark/streaming/BasicOperationsSuite.scala | 2 +- .../spark/streaming/CheckpointSuite.scala | 2 +- .../spark/streaming/MasterFailureTest.scala | 4 +-- .../apache/spark/streaming/StateMapSuite.scala | 2 +- .../spark/streaming/StreamingContextSuite.scala | 2 +- .../apache/spark/streaming/TestSuiteBase.scala | 4 +-- .../scheduler/ReceiverTrackerSuite.scala| 4 +-- .../streaming/util/WriteAheadLogSuite.scala | 2 +- 30 files changed, 108 insertions(+), 96 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/39ae04e6/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala index ce1a620..50216b9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala @@ -23,15 +23,15 @@ import java.net.ServerSocket import java.util.Random /** Represents a page view on a website with associated dimension data. 
*/ -class PageView(val url : String, val status : Int, val zipCode : Int, val userID : Int) +class PageView(val url: String, val status: Int, val zipCode: Int, val userID: Int) extends Serializable { - override def toString() : String = { + override def toString(): String = { "%s\t%s\t%s\t%s\n".format(url, status, zipCode, userID) } } object PageView extends Serializable { - def fromString(in : String) : PageView = { + def fromString(in: String): PageView = { val parts = in.split("\t") new PageView(parts(0), parts(1).toInt, parts(2).toInt, parts(3).toInt) } @@ -58,9 +58,9 @@ object PageViewGenerator { 404 -> .05) val userZipCode = Map(94709 -> .5, 94117 -> .5) - val userID = Map((1 to 100).map(_ -> .01) : _*) + val userID = Map((1 to 100).map(_ -> .01): _*) - def pickFromDistribution[T](inputMap : Map[T, Double]) : T = { + def pickFromDistribution[T](inputMap: Map[T, Double]): T = { val rand = new Random().nextDouble() var total = 0.0 for ((item, prob) <- inputMap) { @@ -72,7 +72,7 @@ object PageViewGenerator { inputMap.take(1).head._1 // Shouldn't get here if probabilities add up to 1.0 } - def getNextClickEvent() : String = { + def
spark git commit: [SPARK-12692][BUILD][YARN] Scala style: Fix the style violation (Space before "," or ":")
Repository: spark Updated Branches: refs/heads/master 39ae04e6b -> 112abf910 [SPARK-12692][BUILD][YARN] Scala style: Fix the style violation (Space before "," or ":") Fix the style violation (space before , and :). This PR is a followup for #10643. Author: Kousuke Saruta. Closes #10686 from sarutak/SPARK-12692-followup-yarn. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/112abf91 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/112abf91 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/112abf91 Branch: refs/heads/master Commit: 112abf9100f05be436e449817468c50174712c78 Parents: 39ae04e Author: Kousuke Saruta Authored: Mon Jan 11 21:37:54 2016 -0800 Committer: Reynold Xin Committed: Mon Jan 11 21:37:54 2016 -0800 -- .../scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/112abf91/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala index e286aed..272f129 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala @@ -357,7 +357,7 @@ object YarnSparkHadoopUtil { * * @return The correct OOM Error handler JVM option, platform dependent. */ - def getOutOfMemoryErrorArgument : String = { + def getOutOfMemoryErrorArgument: String = { if (Utils.isWindows) { escapeForShell("-XX:OnOutOfMemoryError=taskkill /F /PID p") } else {
spark git commit: [SPARK-12734][HOTFIX] Build changes must trigger all tests; clean after install in dep tests
Repository: spark Updated Branches: refs/heads/branch-1.6 ce906b33d -> 3b32aa9e2 [SPARK-12734][HOTFIX] Build changes must trigger all tests; clean after install in dep tests This patch fixes a build/test issue caused by the combination of #10672 and a latent issue in the original `dev/test-dependencies` script. First, changes which _only_ touched build files were not triggering full Jenkins runs, making it possible for a build change to be merged even though it could cause failures in other tests. The `root` build module now depends on `build`, so all tests will now be run whenever a build-related file is changed. I also added a `clean` step to the Maven install step in `dev/test-dependencies` in order to address an issue where the dummy JARs stuck around and caused "multiple assembly JARs found" errors in tests. /cc zsxwing Author: Josh RosenCloses #10704 from JoshRosen/fix-build-test-problems. (cherry picked from commit a44991453a43615028083ba9546f5cd93112f6bd) Signed-off-by: Josh Rosen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3b32aa9e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3b32aa9e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3b32aa9e Branch: refs/heads/branch-1.6 Commit: 3b32aa9e29506606d4ca2407aa65a1aab8794805 Parents: ce906b3 Author: Josh Rosen Authored: Mon Jan 11 12:56:43 2016 -0800 Committer: Josh Rosen Committed: Mon Jan 11 12:58:07 2016 -0800 -- dev/sparktestsupport/modules.py | 2 +- dev/test-dependencies.sh| 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3b32aa9e/dev/sparktestsupport/modules.py -- diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 47cd600..21667d5 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -435,7 +435,7 @@ yarn = Module( # No other modules should directly depend on this module. root = Module( name="root", -dependencies=[], +dependencies=[build], # Changes to build should trigger all tests. source_file_regexes=[], # In order to run all of the tests, enable every test profile: build_profile_flags=list(set( http://git-wip-us.apache.org/repos/asf/spark/blob/3b32aa9e/dev/test-dependencies.sh -- diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh index 47ae15b..efb49f7 100755 --- a/dev/test-dependencies.sh +++ b/dev/test-dependencies.sh @@ -76,7 +76,7 @@ for HADOOP_PROFILE in "${HADOOP_PROFILES[@]}"; do HADOOP_MODULE_PROFILES="-Phive-thriftserver -Pyarn -Phive" fi echo "Performing Maven install for $HADOOP_PROFILE" - $MVN $HADOOP_MODULE_PROFILES -P$HADOOP_PROFILE jar:jar jar:test-jar install:install -q + $MVN $HADOOP_MODULE_PROFILES -P$HADOOP_PROFILE jar:jar jar:test-jar install:install clean -q echo "Performing Maven validate for $HADOOP_PROFILE" $MVN $HADOOP_MODULE_PROFILES -P$HADOOP_PROFILE validate -q
spark git commit: [SPARK-12734][HOTFIX] Build changes must trigger all tests; clean after install in dep tests
Repository: spark Updated Branches: refs/heads/branch-1.5 0e2aa4198 -> 6cc8f407d [SPARK-12734][HOTFIX] Build changes must trigger all tests; clean after install in dep tests This patch fixes a build/test issue caused by the combination of #10672 and a latent issue in the original `dev/test-dependencies` script. First, changes which _only_ touched build files were not triggering full Jenkins runs, making it possible for a build change to be merged even though it could cause failures in other tests. The `root` build module now depends on `build`, so all tests will now be run whenever a build-related file is changed. I also added a `clean` step to the Maven install step in `dev/test-dependencies` in order to address an issue where the dummy JARs stuck around and caused "multiple assembly JARs found" errors in tests. /cc zsxwing Author: Josh RosenCloses #10704 from JoshRosen/fix-build-test-problems. (cherry picked from commit a44991453a43615028083ba9546f5cd93112f6bd) Signed-off-by: Josh Rosen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6cc8f407 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6cc8f407 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6cc8f407 Branch: refs/heads/branch-1.5 Commit: 6cc8f407d549ec74dc336d043a03149649553b3f Parents: 0e2aa41 Author: Josh Rosen Authored: Mon Jan 11 12:56:43 2016 -0800 Committer: Josh Rosen Committed: Mon Jan 11 12:59:15 2016 -0800 -- dev/sparktestsupport/modules.py | 2 +- dev/test-dependencies.sh| 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6cc8f407/dev/sparktestsupport/modules.py -- diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index a412922..e5f15b5 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -413,7 +413,7 @@ ec2 = Module( # No other modules should directly depend on this module. root = Module( name="root", -dependencies=[], +dependencies=[build], # Changes to build should trigger all tests. source_file_regexes=[], # In order to run all of the tests, enable every test profile: build_profile_flags=list(set( http://git-wip-us.apache.org/repos/asf/spark/blob/6cc8f407/dev/test-dependencies.sh -- diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh index 47ae15b..efb49f7 100755 --- a/dev/test-dependencies.sh +++ b/dev/test-dependencies.sh @@ -76,7 +76,7 @@ for HADOOP_PROFILE in "${HADOOP_PROFILES[@]}"; do HADOOP_MODULE_PROFILES="-Phive-thriftserver -Pyarn -Phive" fi echo "Performing Maven install for $HADOOP_PROFILE" - $MVN $HADOOP_MODULE_PROFILES -P$HADOOP_PROFILE jar:jar jar:test-jar install:install -q + $MVN $HADOOP_MODULE_PROFILES -P$HADOOP_PROFILE jar:jar jar:test-jar install:install clean -q echo "Performing Maven validate for $HADOOP_PROFILE" $MVN $HADOOP_MODULE_PROFILES -P$HADOOP_PROFILE validate -q
spark git commit: [SPARK-12734][HOTFIX] Build changes must trigger all tests; clean after install in dep tests
Repository: spark Updated Branches: refs/heads/master b313badaa -> a44991453 [SPARK-12734][HOTFIX] Build changes must trigger all tests; clean after install in dep tests This patch fixes a build/test issue caused by the combination of #10672 and a latent issue in the original `dev/test-dependencies` script. First, changes which _only_ touched build files were not triggering full Jenkins runs, making it possible for a build change to be merged even though it could cause failures in other tests. The `root` build module now depends on `build`, so all tests will now be run whenever a build-related file is changed. I also added a `clean` step to the Maven install step in `dev/test-dependencies` in order to address an issue where the dummy JARs stuck around and caused "multiple assembly JARs found" errors in tests. /cc zsxwing Author: Josh RosenCloses #10704 from JoshRosen/fix-build-test-problems. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a4499145 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a4499145 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a4499145 Branch: refs/heads/master Commit: a44991453a43615028083ba9546f5cd93112f6bd Parents: b313bad Author: Josh Rosen Authored: Mon Jan 11 12:56:43 2016 -0800 Committer: Josh Rosen Committed: Mon Jan 11 12:56:43 2016 -0800 -- dev/sparktestsupport/modules.py | 2 +- dev/test-dependencies.sh| 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a4499145/dev/sparktestsupport/modules.py -- diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 1fc6596..93a8c15 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -426,7 +426,7 @@ yarn = Module( # No other modules should directly depend on this module. root = Module( name="root", -dependencies=[], +dependencies=[build], # Changes to build should trigger all tests. source_file_regexes=[], # In order to run all of the tests, enable every test profile: build_profile_flags=list(set( http://git-wip-us.apache.org/repos/asf/spark/blob/a4499145/dev/test-dependencies.sh -- diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh index def87aa..3cb5d2b 100755 --- a/dev/test-dependencies.sh +++ b/dev/test-dependencies.sh @@ -70,7 +70,7 @@ $MVN -q versions:set -DnewVersion=$TEMP_VERSION -DgenerateBackupPoms=false > /de # Generate manifests for each Hadoop profile: for HADOOP_PROFILE in "${HADOOP_PROFILES[@]}"; do echo "Performing Maven install for $HADOOP_PROFILE" - $MVN $HADOOP2_MODULE_PROFILES -P$HADOOP_PROFILE jar:jar jar:test-jar install:install -q + $MVN $HADOOP2_MODULE_PROFILES -P$HADOOP_PROFILE jar:jar jar:test-jar install:install clean -q echo "Performing Maven validate for $HADOOP_PROFILE" $MVN $HADOOP2_MODULE_PROFILES -P$HADOOP_PROFILE validate -q
spark git commit: [SPARK-12744][SQL] Change parsing JSON integers to timestamps to treat integers as number of seconds
Repository: spark Updated Branches: refs/heads/master 8fe928b4f -> 9559ac5f7 [SPARK-12744][SQL] Change parsing JSON integers to timestamps to treat integers as number of seconds JIRA: https://issues.apache.org/jira/browse/SPARK-12744 This PR makes parsing JSON integers to timestamps consistent with casting behavior. Author: Anatoliy PlastininCloses #10687 from antlypls/fix-json-timestamp-parsing. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9559ac5f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9559ac5f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9559ac5f Branch: refs/heads/master Commit: 9559ac5f74434cf4bf611bdcde9a216d39799826 Parents: 8fe928b Author: Anatoliy Plastinin Authored: Mon Jan 11 10:28:57 2016 -0800 Committer: Yin Huai Committed: Mon Jan 11 10:28:57 2016 -0800 -- .../execution/datasources/json/JacksonParser.scala | 2 +- .../sql/execution/datasources/json/JsonSuite.scala | 17 +++-- .../execution/datasources/json/TestJsonData.scala | 4 3 files changed, 20 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9559ac5f/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala index 2e3fe3d..b2f5c1e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala @@ -90,7 +90,7 @@ object JacksonParser { DateTimeUtils.stringToTime(parser.getText).getTime * 1000L case (VALUE_NUMBER_INT, TimestampType) => -parser.getLongValue * 1000L +parser.getLongValue * 1000000L case (_, StringType) => val writer = new ByteArrayOutputStream() http://git-wip-us.apache.org/repos/asf/spark/blob/9559ac5f/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index b3b6b7d..4ab1480 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -83,9 +83,9 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { val doubleNumber: Double = 1.7976931348623157E308d checkTypePromotion(doubleNumber.toDouble, enforceCorrectType(doubleNumber, DoubleType)) -checkTypePromotion(DateTimeUtils.fromJavaTimestamp(new Timestamp(intNumber)), +checkTypePromotion(DateTimeUtils.fromJavaTimestamp(new Timestamp(intNumber * 1000L)), enforceCorrectType(intNumber, TimestampType)) -checkTypePromotion(DateTimeUtils.fromJavaTimestamp(new Timestamp(intNumber.toLong)), +checkTypePromotion(DateTimeUtils.fromJavaTimestamp(new Timestamp(intNumber.toLong * 1000L)), enforceCorrectType(intNumber.toLong, TimestampType)) val strTime = "2014-09-30 12:34:56" checkTypePromotion(DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf(strTime)), @@ -1465,4 +1465,17 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } } + test("Casting long as timestamp") { +withTempTable("jsonTable") { + val schema = (new StructType).add("ts", TimestampType) + val jsonDF = 
sqlContext.read.schema(schema).json(timestampAsLong) + + jsonDF.registerTempTable("jsonTable") + + checkAnswer( +sql("select ts from jsonTable"), +Row(java.sql.Timestamp.valueOf("2016-01-02 03:04:05")) + ) +} + } } http://git-wip-us.apache.org/repos/asf/spark/blob/9559ac5f/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala index cb61f7e..a083605 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala +++
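A usage sketch of the behavior this commit standardizes (not part of the patch; the JSON record, epoch value, and variable names are made up, and a SparkContext `sc` plus SQLContext `sqlContext` are assumed in scope, as in spark-shell): a JSON integer read into a TimestampType column is now interpreted as seconds since the epoch, matching the result of a SQL CAST.

import org.apache.spark.sql.types.{StructType, TimestampType}

// One-record JSON dataset whose "ts" field is an integer number of seconds.
val timestampJson = sc.parallelize("""{"ts": 1452000000}""" :: Nil)
val schema = (new StructType).add("ts", TimestampType)
val df = sqlContext.read.schema(schema).json(timestampJson)

// After SPARK-12744 the integer is treated as seconds, so the result matches
// sql("SELECT CAST(1452000000 AS TIMESTAMP)") rather than millisecond semantics.
df.show()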
spark git commit: [STREAMING][MINOR] Typo fixes
Repository: spark Updated Branches: refs/heads/branch-1.6 d4cfd2acd -> ce906b33d [STREAMING][MINOR] Typo fixes Author: Jacek LaskowskiCloses #10698 from jaceklaskowski/streaming-kafka-typo-fixes. (cherry picked from commit b313badaa049f847f33663c61cd70ee2f2cbebac) Signed-off-by: Shixiong Zhu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ce906b33 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ce906b33 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ce906b33 Branch: refs/heads/branch-1.6 Commit: ce906b33de64f55653b52376316aa2625fd86b47 Parents: d4cfd2a Author: Jacek Laskowski Authored: Mon Jan 11 11:29:15 2016 -0800 Committer: Shixiong Zhu Committed: Mon Jan 11 11:29:23 2016 -0800 -- .../main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala | 2 +- .../src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ce906b33/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala index 8465432..e3a2e57 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala @@ -382,7 +382,7 @@ object KafkaCluster { val seedBrokers: Array[(String, Int)] = brokers.split(",").map { hp => val hpa = hp.split(":") if (hpa.size == 1) { -throw new SparkException(s"Broker not the in correct format of <host>:<port> [$brokers]") +throw new SparkException(s"Broker not in the correct format of <host>:<port> [$brokers]") } (hpa(0), hpa(1).toInt) } http://git-wip-us.apache.org/repos/asf/spark/blob/ce906b33/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala index ea5f842..4dbaf4f 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala @@ -156,7 +156,7 @@ class KafkaRDD[ var requestOffset = part.fromOffset var iter: Iterator[MessageAndOffset] = null -// The idea is to use the provided preferred host, except on task retry atttempts, +// The idea is to use the provided preferred host, except on task retry attempts, // to minimize number of kafka metadata requests private def connectLeader: SimpleConsumer = { if (context.attemptNumber > 0) {
spark git commit: [STREAMING][MINOR] Typo fixes
Repository: spark Updated Branches: refs/heads/master 9559ac5f7 -> b313badaa [STREAMING][MINOR] Typo fixes Author: Jacek LaskowskiCloses #10698 from jaceklaskowski/streaming-kafka-typo-fixes. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b313bada Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b313bada Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b313bada Branch: refs/heads/master Commit: b313badaa049f847f33663c61cd70ee2f2cbebac Parents: 9559ac5 Author: Jacek Laskowski Authored: Mon Jan 11 11:29:15 2016 -0800 Committer: Shixiong Zhu Committed: Mon Jan 11 11:29:15 2016 -0800 -- .../main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala | 2 +- .../src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b313bada/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala index c4e18d9..d7885d7 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala @@ -385,7 +385,7 @@ object KafkaCluster { val seedBrokers: Array[(String, Int)] = brokers.split(",").map { hp => val hpa = hp.split(":") if (hpa.size == 1) { -throw new SparkException(s"Broker not the in correct format of <host>:<port> [$brokers]") +throw new SparkException(s"Broker not in the correct format of <host>:<port> [$brokers]") } (hpa(0), hpa(1).toInt) } http://git-wip-us.apache.org/repos/asf/spark/blob/b313bada/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala index 603be22..4eb1556 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala @@ -156,7 +156,7 @@ class KafkaRDD[ var requestOffset = part.fromOffset var iter: Iterator[MessageAndOffset] = null -// The idea is to use the provided preferred host, except on task retry atttempts, +// The idea is to use the provided preferred host, except on task retry attempts, // to minimize number of kafka metadata requests private def connectLeader: SimpleConsumer = { if (context.attemptNumber > 0) {
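As a usage sketch of the broker-list format the corrected message refers to (illustrative only; the host names, port, topic, and the StreamingContext `ssc` are placeholders, not from the commit): every comma-separated entry passed via metadata.broker.list must be host:port, and a bare host name is what triggers the SparkException shown above.

import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils

// Each entry must be in host:port form; "broker1" alone (no port) would
// raise the "Broker not in the correct format" error during cluster setup.
val kafkaParams = Map("metadata.broker.list" -> "broker1:9092,broker2:9092")
val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
  ssc, kafkaParams, Set("pageviews"))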