git commit: Proper indent for the previous commit.
Repository: spark Updated Branches: refs/heads/master feaa3706f -> b4dded40f Proper indent for the previous commit. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b4dded40 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b4dded40 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b4dded40 Branch: refs/heads/master Commit: b4dded40fbecb485f1ddfd8316b44d42a1554d64 Parents: feaa370 Author: Reynold Xin Authored: Fri Sep 12 22:51:25 2014 -0700 Committer: Reynold Xin Committed: Fri Sep 12 22:51:25 2014 -0700 -- .../main/scala/org/apache/spark/api/java/JavaSparkContext.scala| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b4dded40/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala index 23f7e6b..791d853 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala @@ -42,7 +42,7 @@ import org.apache.spark.rdd.{EmptyRDD, HadoopRDD, NewHadoopRDD, RDD} * [[org.apache.spark.api.java.JavaRDD]]s and works with Java collections instead of Scala ones. */ class JavaSparkContext(val sc: SparkContext) -extends JavaSparkContextVarargsWorkaround with Closeable { + extends JavaSparkContextVarargsWorkaround with Closeable { /** * Create a JavaSparkContext that loads settings from system properties (for instance, when - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
git commit: SPARK-3470 [CORE] [STREAMING] Add Closeable / close() to Java context objects
Repository: spark Updated Branches: refs/heads/master e11eeb71f -> feaa3706f SPARK-3470 [CORE] [STREAMING] Add Closeable / close() to Java context objects ... that expose a stop() lifecycle method. This doesn't add `AutoCloseable`, which is Java 7+ only. But it should be possible to use try-with-resources on a `Closeable` in Java 7, as long as the `close()` does not throw a checked exception, and these don't. Q.E.D. Author: Sean Owen Closes #2346 from srowen/SPARK-3470 and squashes the following commits: 612c21d [Sean Owen] Add Closeable / close() to Java context objects that expose a stop() lifecycle method Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/feaa3706 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/feaa3706 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/feaa3706 Branch: refs/heads/master Commit: feaa3706f17e44efcdac9f0a543a5b91232771ce Parents: e11eeb7 Author: Sean Owen Authored: Fri Sep 12 22:50:37 2014 -0700 Committer: Reynold Xin Committed: Fri Sep 12 22:50:37 2014 -0700 -- .../scala/org/apache/spark/api/java/JavaSparkContext.scala| 7 ++- .../spark/streaming/api/java/JavaStreamingContext.scala | 7 +-- 2 files changed, 11 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/feaa3706/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala index 8e178bc..23f7e6b 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala @@ -17,6 +17,7 @@ package org.apache.spark.api.java +import java.io.Closeable import java.util import java.util.{Map => JMap} @@ -40,7 +41,9 @@ import org.apache.spark.rdd.{EmptyRDD, HadoopRDD, NewHadoopRDD, RDD} * A Java-friendly version of [[org.apache.spark.SparkContext]] that returns * [[org.apache.spark.api.java.JavaRDD]]s and works with Java collections instead of Scala ones. */ -class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWorkaround { +class JavaSparkContext(val sc: SparkContext) +extends JavaSparkContextVarargsWorkaround with Closeable { + /** * Create a JavaSparkContext that loads settings from system properties (for instance, when * launching with ./bin/spark-submit). 
@@ -534,6 +537,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork sc.stop() } + override def close(): Unit = stop() + /** * Get Spark's home location from either a value set through the constructor, * or the spark.home Java property, or the SPARK_HOME environment variable http://git-wip-us.apache.org/repos/asf/spark/blob/feaa3706/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala index 18605ca..9dc26dc 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala @@ -21,7 +21,7 @@ package org.apache.spark.streaming.api.java import scala.collection.JavaConversions._ import scala.reflect.ClassTag -import java.io.InputStream +import java.io.{Closeable, InputStream} import java.util.{List => JList, Map => JMap} import akka.actor.{Props, SupervisorStrategy} @@ -49,7 +49,7 @@ import org.apache.spark.streaming.receiver.Receiver * respectively. `context.awaitTransformation()` allows the current thread to wait for the * termination of a context by `stop()` or by an exception. */ -class JavaStreamingContext(val ssc: StreamingContext) { +class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { /** * Create a StreamingContext. @@ -540,6 +540,9 @@ class JavaStreamingContext(val ssc: StreamingContext) { def stop(stopSparkContext: Boolean, stopGracefully: Boolean) = { ssc.stop(stopSparkContext, stopGracefully) } + + override def close(): Unit = stop() + } /** - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
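For context, the new `close()` on both context classes simply delegates to `stop()`, so either context can be managed with any Closeable-based idiom (including Java 7 try-with-resources). A minimal Scala sketch of the intended usage, where the app name and master are placeholders:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext

// Placeholder configuration; any valid app name/master works.
val jsc = new JavaSparkContext(new SparkConf().setAppName("demo").setMaster("local[*]"))
try {
  // ... run jobs ...
} finally {
  jsc.close()  // equivalent to jsc.stop(), per the override added in this commit
}
```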
git commit: [SQL][Docs] Update SQL programming guide to show the correct default value of containsNull in an ArrayType
Repository: spark Updated Branches: refs/heads/master 2584ea5b2 -> e11eeb71f [SQL][Docs] Update SQL programming guide to show the correct default value of containsNull in an ArrayType After #1889, the default value of `containsNull` in an `ArrayType` is `true`. Author: Yin Huai Closes #2374 from yhuai/containsNull and squashes the following commits: dc609a3 [Yin Huai] Update the SQL programming guide to show the correct default value of containsNull in an ArrayType (the default value is true instead of false). Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e11eeb71 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e11eeb71 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e11eeb71 Branch: refs/heads/master Commit: e11eeb71fa3a5fe7ddacb94d5b93b173d4d901a8 Parents: 2584ea5 Author: Yin Huai Authored: Fri Sep 12 21:58:02 2014 -0700 Committer: Reynold Xin Committed: Fri Sep 12 21:58:02 2014 -0700 -- docs/sql-programming-guide.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e11eeb71/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index d83efa4..3159d52 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1232,7 +1232,7 @@ import org.apache.spark.sql._ scala.collection.Seq ArrayType(elementType, [containsNull]) - Note: The default value of containsNull is false. + Note: The default value of containsNull is true. @@ -1358,7 +1358,7 @@ please use factory methods provided in java.util.List DataType.createArrayType(elementType) - Note: The value of containsNull will be false + Note: The value of containsNull will be true DataType.createArrayType(elementType, containsNull). @@ -1505,7 +1505,7 @@ from pyspark.sql import * list, tuple, or array ArrayType(elementType, [containsNull]) - Note: The default value of containsNull is False. + Note: The default value of containsNull is True. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
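A quick Scala illustration of the documented default, using the same `import org.apache.spark.sql._` the guide assumes:

```scala
import org.apache.spark.sql._

// Omitting containsNull now means array elements may be null.
val withNulls    = ArrayType(IntegerType)         // containsNull = true (the new documented default)
val withoutNulls = ArrayType(IntegerType, false)  // element nulls explicitly disallowed
```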
git commit: [SPARK-3469] Make sure all TaskCompletionListener are called even with failures
Repository: spark Updated Branches: refs/heads/master 6d887db78 -> 2584ea5b2 [SPARK-3469] Make sure all TaskCompletionListener are called even with failures This is necessary because we rely on this callback interface to clean resources up. The old behavior would lead to resource leaks. Note that this also changes the fault semantics of TaskCompletionListener. Previously failures in TaskCompletionListeners would result in the task being reported immediately. With this change, we report the exception at the end, and the reported exception is a TaskCompletionListenerException that contains all the exception messages. Author: Reynold Xin Closes #2343 from rxin/taskcontext-callback and squashes the following commits: a3845b2 [Reynold Xin] Mark TaskCompletionListenerException as private[spark]. ac5baea [Reynold Xin] Removed obsolete comment. aa68ea4 [Reynold Xin] Throw an exception if task completion callback fails. 29b6162 [Reynold Xin] oops compilation failed. 1cb444d [Reynold Xin] [SPARK-3469] Call all TaskCompletionListeners even if some fail. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2584ea5b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2584ea5b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2584ea5b Branch: refs/heads/master Commit: 2584ea5b23b1c5a4df9549b94bfc9b8e0900532e Parents: 6d887db Author: Reynold Xin Authored: Fri Sep 12 21:55:39 2014 -0700 Committer: Reynold Xin Committed: Fri Sep 12 21:55:39 2014 -0700 -- .../scala/org/apache/spark/TaskContext.scala| 18 +-- .../util/TaskCompletionListenerException.scala | 34 .../spark/scheduler/TaskContextSuite.scala | 22 +++-- 3 files changed, 69 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2584ea5b/core/src/main/scala/org/apache/spark/TaskContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/TaskContext.scala b/core/src/main/scala/org/apache/spark/TaskContext.scala index 2b99b8a..51b3e4d 100644 --- a/core/src/main/scala/org/apache/spark/TaskContext.scala +++ b/core/src/main/scala/org/apache/spark/TaskContext.scala @@ -21,7 +21,7 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.annotation.DeveloperApi import org.apache.spark.executor.TaskMetrics -import org.apache.spark.util.TaskCompletionListener +import org.apache.spark.util.{TaskCompletionListenerException, TaskCompletionListener} /** @@ -41,7 +41,7 @@ class TaskContext( val attemptId: Long, val runningLocally: Boolean = false, private[spark] val taskMetrics: TaskMetrics = TaskMetrics.empty) - extends Serializable { + extends Serializable with Logging { @deprecated("use partitionId", "0.8.1") def splitId = partitionId @@ -103,8 +103,20 @@ class TaskContext( /** Marks the task as completed and triggers the listeners. */ private[spark] def markTaskCompleted(): Unit = { completed = true +val errorMsgs = new ArrayBuffer[String](2) // Process complete callbacks in the reverse order of registration -onCompleteCallbacks.reverse.foreach { _.onTaskCompletion(this) } +onCompleteCallbacks.reverse.foreach { listener => + try { +listener.onTaskCompletion(this) + } catch { +case e: Throwable => + errorMsgs += e.getMessage + logError("Error in TaskCompletionListener", e) + } +} +if (errorMsgs.nonEmpty) { + throw new TaskCompletionListenerException(errorMsgs) +} } /** Marks the task for interruption, i.e. cancellation. 
*/ http://git-wip-us.apache.org/repos/asf/spark/blob/2584ea5b/core/src/main/scala/org/apache/spark/util/TaskCompletionListenerException.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/TaskCompletionListenerException.scala b/core/src/main/scala/org/apache/spark/util/TaskCompletionListenerException.scala new file mode 100644 index 000..f64e069 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/util/TaskCompletionListenerException.scala @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is d
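To make the new fault semantics concrete, here is a sketch of registering a cleanup callback; it assumes the `addTaskCompletionListener` registration method that accompanies `TaskCompletionListener` on `TaskContext` in this release:

```scala
import org.apache.spark.TaskContext
import org.apache.spark.util.TaskCompletionListener

// Somewhere inside a task, with `context` being the running TaskContext.
def registerCleanup(context: TaskContext): Unit = {
  context.addTaskCompletionListener(new TaskCompletionListener {
    override def onTaskCompletion(ctx: TaskContext): Unit = {
      // release buffers, close readers, delete spill files, etc.
    }
  })
}
```

With this change, a listener that throws no longer prevents the remaining listeners from running; the error messages are collected and rethrown at the end as a single TaskCompletionListenerException.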
git commit: [SPARK-3515][SQL] Moves test suite setup code to beforeAll rather than in constructor
Repository: spark Updated Branches: refs/heads/branch-1.1 9c06c7230 -> 44e534eb2 [SPARK-3515][SQL] Moves test suite setup code to beforeAll rather than in constructor Please refer to the JIRA ticket for details. **NOTE** We should check all test suites that do similar initialization-like side effects in their constructors. This PR only fixes `ParquetMetastoreSuite` because it breaks our Jenkins Maven build. Author: Cheng Lian Closes #2375 from liancheng/say-no-to-constructor and squashes the following commits: 0ceb75b [Cheng Lian] Moves test suite setup code to beforeAll rather than in constructor (cherry picked from commit 6d887db7891be643f0131b136e82191b5f6eb407) Signed-off-by: Michael Armbrust Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/44e534eb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/44e534eb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/44e534eb Branch: refs/heads/branch-1.1 Commit: 44e534eb286381030ae068ca89573ff84fb2a579 Parents: 9c06c72 Author: Cheng Lian Authored: Fri Sep 12 20:14:09 2014 -0700 Committer: Michael Armbrust Committed: Fri Sep 12 20:14:48 2014 -0700 -- .../sql/parquet/ParquetMetastoreSuite.scala | 53 +--- 1 file changed, 24 insertions(+), 29 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/44e534eb/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala index 0723be7..e380280 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala @@ -20,14 +20,10 @@ package org.apache.spark.sql.parquet import java.io.File -import org.apache.spark.sql.hive.execution.HiveTableScan import org.scalatest.BeforeAndAfterAll -import scala.reflect.ClassTag - -import org.apache.spark.sql.{SQLConf, QueryTest} -import org.apache.spark.sql.execution.{BroadcastHashJoin, ShuffledHashJoin} -import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.hive.execution.HiveTableScan import org.apache.spark.sql.hive.test.TestHive._ case class ParquetData(intField: Int, stringField: String) @@ -36,27 +32,19 @@ case class ParquetData(intField: Int, stringField: String) * Tests for our SerDe -> Native parquet scan conversion. 
*/ class ParquetMetastoreSuite extends QueryTest with BeforeAndAfterAll { - override def beforeAll(): Unit = { -setConf("spark.sql.hive.convertMetastoreParquet", "true") - } - - override def afterAll(): Unit = { -setConf("spark.sql.hive.convertMetastoreParquet", "false") - } - - val partitionedTableDir = File.createTempFile("parquettests", "sparksql") - partitionedTableDir.delete() - partitionedTableDir.mkdir() - - (1 to 10).foreach { p => -val partDir = new File(partitionedTableDir, s"p=$p") -sparkContext.makeRDD(1 to 10) - .map(i => ParquetData(i, s"part-$p")) - .saveAsParquetFile(partDir.getCanonicalPath) - } - - sql(s""" +val partitionedTableDir = File.createTempFile("parquettests", "sparksql") +partitionedTableDir.delete() +partitionedTableDir.mkdir() + +(1 to 10).foreach { p => + val partDir = new File(partitionedTableDir, s"p=$p") + sparkContext.makeRDD(1 to 10) +.map(i => ParquetData(i, s"part-$p")) +.saveAsParquetFile(partDir.getCanonicalPath) +} + +sql(s""" create external table partitioned_parquet ( intField INT, @@ -70,7 +58,7 @@ class ParquetMetastoreSuite extends QueryTest with BeforeAndAfterAll { location '${partitionedTableDir.getCanonicalPath}' """) - sql(s""" +sql(s""" create external table normal_parquet ( intField INT, @@ -83,8 +71,15 @@ class ParquetMetastoreSuite extends QueryTest with BeforeAndAfterAll { location '${new File(partitionedTableDir, "p=1").getCanonicalPath}' """) - (1 to 10).foreach { p => -sql(s"ALTER TABLE partitioned_parquet ADD PARTITION (p=$p)") +(1 to 10).foreach { p => + sql(s"ALTER TABLE partitioned_parquet ADD PARTITION (p=$p)") +} + +setConf("spark.sql.hive.convertMetastoreParquet", "true") + } + + override def afterAll(): Unit = { +setConf("spark.sql.hive.convertMetastoreParquet", "false") } test("project the partitioning column") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@s
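The general pattern this commit enforces, as a minimal ScalaTest sketch (the suite name and helper comments are illustrative only):

```scala
import org.scalatest.{BeforeAndAfterAll, FunSuite}

// Side-effecting setup belongs in beforeAll/afterAll rather than the constructor
// body, so merely instantiating the suite (as build-tool test discovery may do)
// performs no work and cannot fail.
class ExampleSuite extends FunSuite with BeforeAndAfterAll {
  override def beforeAll(): Unit = {
    // create temp directories, register test tables, set configuration ...
  }

  override def afterAll(): Unit = {
    // restore configuration, delete temporary data ...
  }

  test("example") {
    // assertions against the data prepared in beforeAll
  }
}
```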
git commit: [SPARK-3515][SQL] Moves test suite setup code to beforeAll rather than in constructor
Repository: spark Updated Branches: refs/heads/master 885d1621b -> 6d887db78 [SPARK-3515][SQL] Moves test suite setup code to beforeAll rather than in constructor Please refer to the JIRA ticket for details. **NOTE** We should check all test suites that do similar initialization-like side effects in their constructors. This PR only fixes `ParquetMetastoreSuite` because it breaks our Jenkins Maven build. Author: Cheng Lian Closes #2375 from liancheng/say-no-to-constructor and squashes the following commits: 0ceb75b [Cheng Lian] Moves test suite setup code to beforeAll rather than in constructor Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6d887db7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6d887db7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6d887db7 Branch: refs/heads/master Commit: 6d887db7891be643f0131b136e82191b5f6eb407 Parents: 885d162 Author: Cheng Lian Authored: Fri Sep 12 20:14:09 2014 -0700 Committer: Michael Armbrust Committed: Fri Sep 12 20:14:09 2014 -0700 -- .../sql/parquet/ParquetMetastoreSuite.scala | 53 +--- 1 file changed, 24 insertions(+), 29 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6d887db7/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala index 0723be7..e380280 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala @@ -20,14 +20,10 @@ package org.apache.spark.sql.parquet import java.io.File -import org.apache.spark.sql.hive.execution.HiveTableScan import org.scalatest.BeforeAndAfterAll -import scala.reflect.ClassTag - -import org.apache.spark.sql.{SQLConf, QueryTest} -import org.apache.spark.sql.execution.{BroadcastHashJoin, ShuffledHashJoin} -import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.hive.execution.HiveTableScan import org.apache.spark.sql.hive.test.TestHive._ case class ParquetData(intField: Int, stringField: String) @@ -36,27 +32,19 @@ case class ParquetData(intField: Int, stringField: String) * Tests for our SerDe -> Native parquet scan conversion. 
*/ class ParquetMetastoreSuite extends QueryTest with BeforeAndAfterAll { - override def beforeAll(): Unit = { -setConf("spark.sql.hive.convertMetastoreParquet", "true") - } - - override def afterAll(): Unit = { -setConf("spark.sql.hive.convertMetastoreParquet", "false") - } - - val partitionedTableDir = File.createTempFile("parquettests", "sparksql") - partitionedTableDir.delete() - partitionedTableDir.mkdir() - - (1 to 10).foreach { p => -val partDir = new File(partitionedTableDir, s"p=$p") -sparkContext.makeRDD(1 to 10) - .map(i => ParquetData(i, s"part-$p")) - .saveAsParquetFile(partDir.getCanonicalPath) - } - - sql(s""" +val partitionedTableDir = File.createTempFile("parquettests", "sparksql") +partitionedTableDir.delete() +partitionedTableDir.mkdir() + +(1 to 10).foreach { p => + val partDir = new File(partitionedTableDir, s"p=$p") + sparkContext.makeRDD(1 to 10) +.map(i => ParquetData(i, s"part-$p")) +.saveAsParquetFile(partDir.getCanonicalPath) +} + +sql(s""" create external table partitioned_parquet ( intField INT, @@ -70,7 +58,7 @@ class ParquetMetastoreSuite extends QueryTest with BeforeAndAfterAll { location '${partitionedTableDir.getCanonicalPath}' """) - sql(s""" +sql(s""" create external table normal_parquet ( intField INT, @@ -83,8 +71,15 @@ class ParquetMetastoreSuite extends QueryTest with BeforeAndAfterAll { location '${new File(partitionedTableDir, "p=1").getCanonicalPath}' """) - (1 to 10).foreach { p => -sql(s"ALTER TABLE partitioned_parquet ADD PARTITION (p=$p)") +(1 to 10).foreach { p => + sql(s"ALTER TABLE partitioned_parquet ADD PARTITION (p=$p)") +} + +setConf("spark.sql.hive.convertMetastoreParquet", "true") + } + + override def afterAll(): Unit = { +setConf("spark.sql.hive.convertMetastoreParquet", "false") } test("project the partitioning column") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
git commit: [SPARK-3500] [SQL] use JavaSchemaRDD as SchemaRDD._jschema_rdd
Repository: spark Updated Branches: refs/heads/branch-1.1 6cbf83c05 -> 9c06c7230 [SPARK-3500] [SQL] use JavaSchemaRDD as SchemaRDD._jschema_rdd Currently, SchemaRDD._jschema_rdd is SchemaRDD, the Scala API (coalesce(), repartition()) can not been called in Python easily, there is no way to specify the implicit parameter `ord`. The _jrdd is an JavaRDD, so _jschema_rdd should also be JavaSchemaRDD. In this patch, change _schema_rdd to JavaSchemaRDD, also added an assert for it. If some methods are missing from JavaSchemaRDD, then it's called by _schema_rdd.baseSchemaRDD().xxx(). BTW, Do we need JavaSQLContext? Author: Davies Liu Closes #2369 from davies/fix_schemardd and squashes the following commits: abee159 [Davies Liu] use JavaSchemaRDD as SchemaRDD._jschema_rdd (cherry picked from commit 885d1621bc06bc1f009c9707c3452eac26baf828) Signed-off-by: Josh Rosen Conflicts: python/pyspark/tests.py Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9c06c723 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9c06c723 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9c06c723 Branch: refs/heads/branch-1.1 Commit: 9c06c723018d4ef96ff31eb947226a6273ed8080 Parents: 6cbf83c Author: Davies Liu Authored: Fri Sep 12 19:05:39 2014 -0700 Committer: Josh Rosen Committed: Fri Sep 12 19:28:45 2014 -0700 -- python/pyspark/sql.py | 38 ++ python/pyspark/tests.py | 37 + 2 files changed, 55 insertions(+), 20 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9c06c723/python/pyspark/sql.py -- diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 0ff6a54..07b39c9 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -1121,7 +1121,7 @@ class SQLContext: batched = isinstance(rdd._jrdd_deserializer, BatchedSerializer) jrdd = self._pythonToJava(rdd._jrdd, batched) srdd = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), str(schema)) -return SchemaRDD(srdd, self) +return SchemaRDD(srdd.toJavaSchemaRDD(), self) def registerRDDAsTable(self, rdd, tableName): """Registers the given RDD as a temporary table in the catalog. @@ -1133,8 +1133,8 @@ class SQLContext: >>> sqlCtx.registerRDDAsTable(srdd, "table1") """ if (rdd.__class__ is SchemaRDD): -jschema_rdd = rdd._jschema_rdd -self._ssql_ctx.registerRDDAsTable(jschema_rdd, tableName) +srdd = rdd._jschema_rdd.baseSchemaRDD() +self._ssql_ctx.registerRDDAsTable(srdd, tableName) else: raise ValueError("Can only register SchemaRDD as table") @@ -1150,7 +1150,7 @@ class SQLContext: >>> sorted(srdd.collect()) == sorted(srdd2.collect()) True """ -jschema_rdd = self._ssql_ctx.parquetFile(path) +jschema_rdd = self._ssql_ctx.parquetFile(path).toJavaSchemaRDD() return SchemaRDD(jschema_rdd, self) def jsonFile(self, path, schema=None): @@ -1206,11 +1206,11 @@ class SQLContext: [Row(f1=u'row1', f2=None, f3=None)...Row(f1=u'row3', f2=[], f3=None)] """ if schema is None: -jschema_rdd = self._ssql_ctx.jsonFile(path) +srdd = self._ssql_ctx.jsonFile(path) else: scala_datatype = self._ssql_ctx.parseDataType(str(schema)) -jschema_rdd = self._ssql_ctx.jsonFile(path, scala_datatype) -return SchemaRDD(jschema_rdd, self) +srdd = self._ssql_ctx.jsonFile(path, scala_datatype) +return SchemaRDD(srdd.toJavaSchemaRDD(), self) def jsonRDD(self, rdd, schema=None): """Loads an RDD storing one JSON object per string as a L{SchemaRDD}. 
@@ -1274,11 +1274,11 @@ class SQLContext: keyed._bypass_serializer = True jrdd = keyed._jrdd.map(self._jvm.BytesToString()) if schema is None: -jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd()) +srdd = self._ssql_ctx.jsonRDD(jrdd.rdd()) else: scala_datatype = self._ssql_ctx.parseDataType(str(schema)) -jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype) -return SchemaRDD(jschema_rdd, self) +srdd = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype) +return SchemaRDD(srdd.toJavaSchemaRDD(), self) def sql(self, sqlQuery): """Return a L{SchemaRDD} representing the result of the given query. @@ -1289,7 +1289,7 @@ class SQLContext: >>> srdd2.collect() [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')] """ -return SchemaRDD(self._ssql_ctx.sql(sqlQuer
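On the JVM side, the patch relies on the existing two-way conversion between the Scala and Java wrappers; PySpark now keeps the Java side in `_jschema_rdd` and unwraps it only when a Scala-only method is needed. A sketch of the two conversions the diff uses:

```scala
import org.apache.spark.sql.SchemaRDD
import org.apache.spark.sql.api.java.JavaSchemaRDD

// What _jschema_rdd now holds on the Python side:
def toJava(srdd: SchemaRDD): JavaSchemaRDD = srdd.toJavaSchemaRDD
// Used when a Scala-side API (e.g. registerRDDAsTable) is required:
def toScala(jsrdd: JavaSchemaRDD): SchemaRDD = jsrdd.baseSchemaRDD
```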
git commit: [SPARK-3500] [SQL] use JavaSchemaRDD as SchemaRDD._jschema_rdd
Repository: spark Updated Branches: refs/heads/master 71af030b4 -> 885d1621b [SPARK-3500] [SQL] use JavaSchemaRDD as SchemaRDD._jschema_rdd Currently, SchemaRDD._jschema_rdd is SchemaRDD, the Scala API (coalesce(), repartition()) can not been called in Python easily, there is no way to specify the implicit parameter `ord`. The _jrdd is an JavaRDD, so _jschema_rdd should also be JavaSchemaRDD. In this patch, change _schema_rdd to JavaSchemaRDD, also added an assert for it. If some methods are missing from JavaSchemaRDD, then it's called by _schema_rdd.baseSchemaRDD().xxx(). BTW, Do we need JavaSQLContext? Author: Davies Liu Closes #2369 from davies/fix_schemardd and squashes the following commits: abee159 [Davies Liu] use JavaSchemaRDD as SchemaRDD._jschema_rdd Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/885d1621 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/885d1621 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/885d1621 Branch: refs/heads/master Commit: 885d1621bc06bc1f009c9707c3452eac26baf828 Parents: 71af030 Author: Davies Liu Authored: Fri Sep 12 19:05:39 2014 -0700 Committer: Josh Rosen Committed: Fri Sep 12 19:05:39 2014 -0700 -- python/pyspark/sql.py | 38 ++ python/pyspark/tests.py | 28 2 files changed, 46 insertions(+), 20 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/885d1621/python/pyspark/sql.py -- diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 53eea6d..fc9310f 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -1122,7 +1122,7 @@ class SQLContext(object): batched = isinstance(rdd._jrdd_deserializer, BatchedSerializer) jrdd = self._pythonToJava(rdd._jrdd, batched) srdd = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), str(schema)) -return SchemaRDD(srdd, self) +return SchemaRDD(srdd.toJavaSchemaRDD(), self) def registerRDDAsTable(self, rdd, tableName): """Registers the given RDD as a temporary table in the catalog. @@ -1134,8 +1134,8 @@ class SQLContext(object): >>> sqlCtx.registerRDDAsTable(srdd, "table1") """ if (rdd.__class__ is SchemaRDD): -jschema_rdd = rdd._jschema_rdd -self._ssql_ctx.registerRDDAsTable(jschema_rdd, tableName) +srdd = rdd._jschema_rdd.baseSchemaRDD() +self._ssql_ctx.registerRDDAsTable(srdd, tableName) else: raise ValueError("Can only register SchemaRDD as table") @@ -1151,7 +1151,7 @@ class SQLContext(object): >>> sorted(srdd.collect()) == sorted(srdd2.collect()) True """ -jschema_rdd = self._ssql_ctx.parquetFile(path) +jschema_rdd = self._ssql_ctx.parquetFile(path).toJavaSchemaRDD() return SchemaRDD(jschema_rdd, self) def jsonFile(self, path, schema=None): @@ -1207,11 +1207,11 @@ class SQLContext(object): [Row(f1=u'row1', f2=None, f3=None)...Row(f1=u'row3', f2=[], f3=None)] """ if schema is None: -jschema_rdd = self._ssql_ctx.jsonFile(path) +srdd = self._ssql_ctx.jsonFile(path) else: scala_datatype = self._ssql_ctx.parseDataType(str(schema)) -jschema_rdd = self._ssql_ctx.jsonFile(path, scala_datatype) -return SchemaRDD(jschema_rdd, self) +srdd = self._ssql_ctx.jsonFile(path, scala_datatype) +return SchemaRDD(srdd.toJavaSchemaRDD(), self) def jsonRDD(self, rdd, schema=None): """Loads an RDD storing one JSON object per string as a L{SchemaRDD}. 
@@ -1275,11 +1275,11 @@ class SQLContext(object): keyed._bypass_serializer = True jrdd = keyed._jrdd.map(self._jvm.BytesToString()) if schema is None: -jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd()) +srdd = self._ssql_ctx.jsonRDD(jrdd.rdd()) else: scala_datatype = self._ssql_ctx.parseDataType(str(schema)) -jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype) -return SchemaRDD(jschema_rdd, self) +srdd = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype) +return SchemaRDD(srdd.toJavaSchemaRDD(), self) def sql(self, sqlQuery): """Return a L{SchemaRDD} representing the result of the given query. @@ -1290,7 +1290,7 @@ class SQLContext(object): >>> srdd2.collect() [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')] """ -return SchemaRDD(self._ssql_ctx.sql(sqlQuery), self) +return SchemaRDD(self._ssql_ctx.sql(sqlQuery).toJavaSchemaRDD(), self) def table(sel
git commit: [SPARK-3094] [PySpark] compatitable with PyPy
Repository: spark Updated Branches: refs/heads/master 25311c2c5 -> 71af030b4 [SPARK-3094] [PySpark] compatitable with PyPy After this patch, we can run PySpark in PyPy (testing with PyPy 2.3.1 in Mac 10.9), for example: ``` PYSPARK_PYTHON=pypy ./bin/spark-submit wordcount.py ``` The performance speed up will depend on work load (from 20% to 3000%). Here are some benchmarks: Job | CPython 2.7 | PyPy 2.3.1 | Speed up --- | | - | --- Word Count | 41s | 15s | 2.7x Sort | 46s | 44s | 1.05x Stats | 174s | 3.6s | 48x Here is the code used for benchmark: ```python rdd = sc.textFile("text") def wordcount(): rdd.flatMap(lambda x:x.split('/'))\ .map(lambda x:(x,1)).reduceByKey(lambda x,y:x+y).collectAsMap() def sort(): rdd.sortBy(lambda x:x, 1).count() def stats(): sc.parallelize(range(1024), 20).flatMap(lambda x: xrange(5024)).stats() ``` Author: Davies Liu Closes #2144 from davies/pypy and squashes the following commits: 9aed6c5 [Davies Liu] use protocol 2 in CloudPickle 4bc1f04 [Davies Liu] refactor b20ab3a [Davies Liu] pickle sys.stdout and stderr in portable way 3ca2351 [Davies Liu] Merge branch 'master' into pypy fae8b19 [Davies Liu] improve attrgetter, add tests 591f830 [Davies Liu] try to run tests with PyPy in run-tests c8d62ba [Davies Liu] cleanup f651fd0 [Davies Liu] fix tests using array with PyPy 1b98fb3 [Davies Liu] serialize itemgetter/attrgetter in portable ways 3c1dbfe [Davies Liu] Merge branch 'master' into pypy 42fb5fa [Davies Liu] Merge branch 'master' into pypy cb2d724 [Davies Liu] fix tests 9986692 [Davies Liu] Merge branch 'master' into pypy 25b4ca7 [Davies Liu] support PyPy Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71af030b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71af030b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71af030b Branch: refs/heads/master Commit: 71af030b46a89aaa9a87f18f56b9e1f1cd8ce2e7 Parents: 25311c2 Author: Davies Liu Authored: Fri Sep 12 18:42:50 2014 -0700 Committer: Josh Rosen Committed: Fri Sep 12 18:42:50 2014 -0700 -- python/pyspark/cloudpickle.py | 168 +++-- python/pyspark/daemon.py | 6 +- python/pyspark/serializers.py | 10 ++- python/pyspark/tests.py | 85 +-- python/run-tests | 21 + 5 files changed, 172 insertions(+), 118 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/71af030b/python/pyspark/cloudpickle.py -- diff --git a/python/pyspark/cloudpickle.py b/python/pyspark/cloudpickle.py index 80e51d1..32dda38 100644 --- a/python/pyspark/cloudpickle.py +++ b/python/pyspark/cloudpickle.py @@ -52,35 +52,19 @@ from functools import partial import itertools from copy_reg import _extension_registry, _inverted_registry, _extension_cache import new -import dis import traceback +import platform -#relevant opcodes -STORE_GLOBAL = chr(dis.opname.index('STORE_GLOBAL')) -DELETE_GLOBAL = chr(dis.opname.index('DELETE_GLOBAL')) -LOAD_GLOBAL = chr(dis.opname.index('LOAD_GLOBAL')) -GLOBAL_OPS = [STORE_GLOBAL, DELETE_GLOBAL, LOAD_GLOBAL] +PyImp = platform.python_implementation() -HAVE_ARGUMENT = chr(dis.HAVE_ARGUMENT) -EXTENDED_ARG = chr(dis.EXTENDED_ARG) import logging cloudLog = logging.getLogger("Cloud.Transport") -try: -import ctypes -except (MemoryError, ImportError): -logging.warning('Exception raised on importing ctypes. Likely python bug.. 
some functionality will be disabled', exc_info = True) -ctypes = None -PyObject_HEAD = None -else: - -# for reading internal structures -PyObject_HEAD = [ -('ob_refcnt', ctypes.c_size_t), -('ob_type', ctypes.c_void_p), -] +if PyImp == "PyPy": +# register builtin type in `new` +new.method = types.MethodType try: from cStringIO import StringIO @@ -225,6 +209,8 @@ class CloudPickler(pickle.Pickler): if themodule: self.modules.add(themodule) +if getattr(themodule, name, None) is obj: +return self.save_global(obj, name) if not self.savedDjangoEnv: #hack for django - if we detect the settings module, we transport it @@ -306,44 +292,28 @@ class CloudPickler(pickle.Pickler): # create a skeleton function object and memoize it save(_make_skel_func) -save((code, len(closure), base_globals)) +save((code, closure, base_globals)) write(pickle.REDUCE) self.memoize(func) # save the rest of the func data needed by _fill_function save(f_globals) save(defaults) -save(closure) save(dct) write(pickle.TUPLE) writ
git commit: [SPARK-3456] YarnAllocator on alpha can lose container requests to RM
Repository: spark Updated Branches: refs/heads/master af2583826 -> 25311c2c5 [SPARK-3456] YarnAllocator on alpha can lose container requests to RM Author: Thomas Graves Closes #2373 from tgravescs/SPARK-3456 and squashes the following commits: 77e9532 [Thomas Graves] [SPARK-3456] YarnAllocator on alpha can lose container requests to RM Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/25311c2c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/25311c2c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/25311c2c Branch: refs/heads/master Commit: 25311c2c545a60eb9dcf704814d4600987852155 Parents: af25838 Author: Thomas Graves Authored: Fri Sep 12 20:31:11 2014 -0500 Committer: Thomas Graves Committed: Fri Sep 12 20:31:11 2014 -0500 -- .../apache/spark/deploy/yarn/YarnAllocationHandler.scala | 11 ++- .../org/apache/spark/deploy/yarn/YarnAllocator.scala | 8 ++-- .../apache/spark/deploy/yarn/YarnAllocationHandler.scala | 3 ++- 3 files changed, 14 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/25311c2c/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocationHandler.scala -- diff --git a/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocationHandler.scala b/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocationHandler.scala index 5a1b42c..6c93d85 100644 --- a/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocationHandler.scala +++ b/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocationHandler.scala @@ -48,16 +48,17 @@ private[yarn] class YarnAllocationHandler( private val lastResponseId = new AtomicInteger() private val releaseList: CopyOnWriteArrayList[ContainerId] = new CopyOnWriteArrayList() - override protected def allocateContainers(count: Int): YarnAllocateResponse = { + override protected def allocateContainers(count: Int, pending: Int): YarnAllocateResponse = { var resourceRequests: List[ResourceRequest] = null -logDebug("numExecutors: " + count) +logDebug("asking for additional executors: " + count + " with already pending: " + pending) +val totalNumAsk = count + pending if (count <= 0) { resourceRequests = List() } else if (preferredHostToCount.isEmpty) { logDebug("host preferences is empty") resourceRequests = List(createResourceRequest( - AllocationType.ANY, null, count, YarnSparkHadoopUtil.RM_REQUEST_PRIORITY)) + AllocationType.ANY, null, totalNumAsk, YarnSparkHadoopUtil.RM_REQUEST_PRIORITY)) } else { // request for all hosts in preferred nodes and for numExecutors - // candidates.size, request by default allocation policy. @@ -80,7 +81,7 @@ private[yarn] class YarnAllocationHandler( val anyContainerRequests: ResourceRequest = createResourceRequest( AllocationType.ANY, resource = null, -count, +totalNumAsk, YarnSparkHadoopUtil.RM_REQUEST_PRIORITY) val containerRequests: ArrayBuffer[ResourceRequest] = new ArrayBuffer[ResourceRequest]( @@ -103,7 +104,7 @@ private[yarn] class YarnAllocationHandler( req.addAllReleases(releasedContainerList) if (count > 0) { - logInfo("Allocating %d executor containers with %d of memory each.".format(count, + logInfo("Allocating %d executor containers with %d of memory each.".format(totalNumAsk, executorMemory + memoryOverhead)) } else { logDebug("Empty allocation req .. 
release : " + releasedContainerList) http://git-wip-us.apache.org/repos/asf/spark/blob/25311c2c/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala -- diff --git a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 0b8744f..299e38a 100644 --- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -112,6 +112,9 @@ private[yarn] abstract class YarnAllocator( def allocateResources() = { val missing = maxExecutors - numPendingAllocate.get() - numExecutorsRunning.get() +// this is needed by alpha, do it here since we add numPending right after this +val executorsPending = numPendingAllocate.get() + if (missing > 0) { numPendingAllocate.addAndGet(missing) logInfo("Will Allocate %d executor containers, each with %d memory".format( @@ -121,7 +124,7 @@ private[yarn] abstract class YarnAllocator( logDebug("Empty allocation req
git commit: [SPARK-3217] Add Guava to classpath when SPARK_PREPEND_CLASSES is set.
Repository: spark Updated Branches: refs/heads/master 1d767967e -> af2583826 [SPARK-3217] Add Guava to classpath when SPARK_PREPEND_CLASSES is set. When that option is used, the compiled classes from the build directory are prepended to the classpath. Now that we avoid packaging Guava, that means we have classes referencing the original Guava location in the app's classpath, so errors happen. For that case, add Guava manually to the classpath. Note: if Spark is compiled with "-Phadoop-provided", it's tricky to make things work with SPARK_PREPEND_CLASSES, because you need to add the Hadoop classpath using SPARK_CLASSPATH and that means the older Hadoop Guava overrides the newer one Spark needs. So someone using SPARK_PREPEND_CLASSES needs to remember to not use that profile. Author: Marcelo Vanzin Closes #2141 from vanzin/SPARK-3217 and squashes the following commits: b967324 [Marcelo Vanzin] [SPARK-3217] Add Guava to classpath when SPARK_PREPEND_CLASSES is set. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/af258382 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/af258382 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/af258382 Branch: refs/heads/master Commit: af2583826c15d2a4e2732017ea20feeff0fb79f6 Parents: 1d76796 Author: Marcelo Vanzin Authored: Fri Sep 12 14:54:42 2014 -0700 Committer: Patrick Wendell Committed: Fri Sep 12 14:54:42 2014 -0700 -- bin/compute-classpath.sh | 1 + core/pom.xml | 27 +++ 2 files changed, 28 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/af258382/bin/compute-classpath.sh -- diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh index 15c6779..0f63e36 100755 --- a/bin/compute-classpath.sh +++ b/bin/compute-classpath.sh @@ -43,6 +43,7 @@ if [ -n "$SPARK_PREPEND_CLASSES" ]; then echo "NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark"\ "classes ahead of assembly." >&2 CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/classes" + CLASSPATH="$CLASSPATH:$FWDIR/core/target/jars/*" CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SCALA_VERSION/classes" CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SCALA_VERSION/classes" CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SCALA_VERSION/classes" http://git-wip-us.apache.org/repos/asf/spark/blob/af258382/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index b2b788a..2a81f6d 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -351,6 +351,33 @@ + + +org.apache.maven.plugins +maven-dependency-plugin + + +copy-dependencies +package + + copy-dependencies + + + ${project.build.directory} + false + false + true + true + guava + true + + + + - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
git commit: SPARK-3014. Log a more informative messages in a couple failure scenario...
Repository: spark Updated Branches: refs/heads/master 15a564598 -> 1d767967e SPARK-3014. Log a more informative messages in a couple failure scenario... ...s Author: Sandy Ryza Closes #1934 from sryza/sandy-spark-3014 and squashes the following commits: ae19cc1 [Sandy Ryza] SPARK-3014. Log a more informative messages in a couple failure scenarios Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1d767967 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1d767967 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1d767967 Branch: refs/heads/master Commit: 1d767967e925f1d727957c2d43383ef6ad2c5d5e Parents: 15a5645 Author: Sandy Ryza Authored: Fri Sep 12 16:48:28 2014 -0500 Committer: Thomas Graves Committed: Fri Sep 12 16:48:28 2014 -0500 -- core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala | 6 -- .../scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala | 6 ++ 2 files changed, 6 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1d767967/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 0fdb5ae..5ed3575 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -18,7 +18,7 @@ package org.apache.spark.deploy import java.io.{File, PrintStream} -import java.lang.reflect.InvocationTargetException +import java.lang.reflect.{Modifier, InvocationTargetException} import java.net.URL import scala.collection.mutable.{ArrayBuffer, HashMap, Map} @@ -323,7 +323,9 @@ object SparkSubmit { } val mainMethod = mainClass.getMethod("main", new Array[String](0).getClass) - +if (!Modifier.isStatic(mainMethod.getModifiers)) { + throw new IllegalStateException("The main method in the given main class must be static") +} try { mainMethod.invoke(null, childArgs.toArray) } catch { http://git-wip-us.apache.org/repos/asf/spark/blob/1d767967/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala -- diff --git a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 878b6db..735d772 100644 --- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -283,11 +283,9 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments, } val sparkContext = sparkContextRef.get() -assert(sparkContext != null || count >= numTries) if (sparkContext == null) { - logError( -"Unable to retrieve sparkContext inspite of waiting for %d, numTries = %d".format( - count * waitTime, numTries)) + logError(("SparkContext did not initialize after waiting for %d ms. Please check earlier" ++ " log output for errors. Failing the application.").format(numTries * waitTime)) } sparkContext } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
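The SparkSubmit half of this change guards against a common Scala pitfall: a `main` defined in a class compiles, but the generated method is not static, so a reflection-based launch used to fail obscurely. A short illustration:

```scala
// Compiles, but `main` is an instance method, so spark-submit now rejects it
// with "The main method in the given main class must be static".
class BrokenApp {
  def main(args: Array[String]): Unit = ()
}

// The correct form: a top-level object yields a static main on the generated class.
object WorkingApp {
  def main(args: Array[String]): Unit = ()
}
```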
git commit: [SPARK-3427] [GraphX] Avoid active vertex tracking in static PageRank
Repository: spark Updated Branches: refs/heads/master eae81b0bf -> 15a564598 [SPARK-3427] [GraphX] Avoid active vertex tracking in static PageRank GraphX's current implementation of static (fixed iteration count) PageRank uses the Pregel API. This unnecessarily tracks active vertices, even though in static PageRank all vertices are always active. Active vertex tracking incurs the following costs: 1. A shuffle per iteration to ship the active sets to the edge partitions. 2. A hash table creation per iteration at each partition to index the active sets for lookup. 3. A hash lookup per edge to check whether the source vertex is active. I reimplemented static PageRank using the lower-level GraphX API instead of the Pregel API. In benchmarks on a 16-node m2.4xlarge cluster, this provided a 23% speedup (from 514 s to 397 s, mean over 3 trials) for 10 iterations of PageRank on a synthetic graph with 10M vertices and 1.27B edges. Author: Ankur Dave Closes #2308 from ankurdave/SPARK-3427 and squashes the following commits: 449996a [Ankur Dave] Avoid unnecessary active vertex tracking in static PageRank Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/15a56459 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/15a56459 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/15a56459 Branch: refs/heads/master Commit: 15a564598fe63003652b1e24527c432080b5976c Parents: eae81b0 Author: Ankur Dave Authored: Fri Sep 12 14:08:38 2014 -0700 Committer: Reynold Xin Committed: Fri Sep 12 14:08:38 2014 -0700 -- .../org/apache/spark/graphx/lib/PageRank.scala | 45 +--- 1 file changed, 29 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/15a56459/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala -- diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala index 614555a..257e2f3 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala @@ -79,30 +79,43 @@ object PageRank extends Logging { def run[VD: ClassTag, ED: ClassTag]( graph: Graph[VD, ED], numIter: Int, resetProb: Double = 0.15): Graph[Double, Double] = { -// Initialize the pagerankGraph with each edge attribute having +// Initialize the PageRank graph with each edge attribute having // weight 1/outDegree and each vertex with attribute 1.0. 
-val pagerankGraph: Graph[Double, Double] = graph +var rankGraph: Graph[Double, Double] = graph // Associate the degree with each vertex .outerJoinVertices(graph.outDegrees) { (vid, vdata, deg) => deg.getOrElse(0) } // Set the weight on the edges based on the degree .mapTriplets( e => 1.0 / e.srcAttr ) // Set the vertex attributes to the initial pagerank values - .mapVertices( (id, attr) => 1.0 ) - .cache() + .mapVertices( (id, attr) => resetProb ) -// Define the three functions needed to implement PageRank in the GraphX -// version of Pregel -def vertexProgram(id: VertexId, attr: Double, msgSum: Double): Double = - resetProb + (1.0 - resetProb) * msgSum -def sendMessage(edge: EdgeTriplet[Double, Double]) = - Iterator((edge.dstId, edge.srcAttr * edge.attr)) -def messageCombiner(a: Double, b: Double): Double = a + b -// The initial message received by all vertices in PageRank -val initialMessage = 0.0 +var iteration = 0 +var prevRankGraph: Graph[Double, Double] = null +while (iteration < numIter) { + rankGraph.cache() -// Execute pregel for a fixed number of iterations. -Pregel(pagerankGraph, initialMessage, numIter, activeDirection = EdgeDirection.Out)( - vertexProgram, sendMessage, messageCombiner) + // Compute the outgoing rank contributions of each vertex, perform local preaggregation, and + // do the final aggregation at the receiving vertices. Requires a shuffle for aggregation. + val rankUpdates = rankGraph.mapReduceTriplets[Double]( +e => Iterator((e.dstId, e.srcAttr * e.attr)), _ + _) + + // Apply the final rank updates to get the new ranks, using join to preserve ranks of vertices + // that didn't receive a message. Requires a shuffle for broadcasting updated ranks to the + // edge partitions. + prevRankGraph = rankGraph + rankGraph = rankGraph.joinVertices(rankUpdates) { +(id, oldRank, msgSum) => resetProb + (1.0 - resetProb) * msgSum + }.cache() + + rankGraph.edges.foreachPartition(x => {}) // also materializes rankGraph.vertices + logInfo(s"PageRank finish
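The public entry point is unchanged by this rewrite; only its internals drop the Pregel active-set machinery. A sketch of invoking it, assuming an existing SparkContext `sc` and a hypothetical edge-list path:

```scala
import org.apache.spark.graphx.GraphLoader
import org.apache.spark.graphx.lib.PageRank

val graph = GraphLoader.edgeListFile(sc, "hdfs:///path/to/edges")  // hypothetical input
val ranks = PageRank.run(graph, numIter = 10).vertices             // same signature as before
```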
git commit: MAINTENANCE: Automated closing of pull requests.
Repository: spark Updated Branches: refs/heads/master 8194fc662 -> eae81b0bf MAINTENANCE: Automated closing of pull requests. This commit exists to close the following pull requests on Github: Closes #930 (close requested by 'andrewor14') Closes #867 (close requested by 'marmbrus') Closes #1829 (close requested by 'marmbrus') Closes #1131 (close requested by 'JoshRosen') Closes #1571 (close requested by 'andrewor14') Closes #2359 (close requested by 'andrewor14') Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eae81b0b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eae81b0b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eae81b0b Branch: refs/heads/master Commit: eae81b0bfdf3159be90f507a03853800aec1874a Parents: 8194fc6 Author: Patrick Wendell Authored: Fri Sep 12 13:43:29 2014 -0700 Committer: Patrick Wendell Committed: Fri Sep 12 13:43:29 2014 -0700 -- -- - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r1624639 - in /spark: downloads.md js/downloads.js site/downloads.html site/js/downloads.js
Author: pwendell Date: Fri Sep 12 20:36:11 2014 New Revision: 1624639 URL: http://svn.apache.org/r1624639 Log: Adding pretty titles for download links. Modified: spark/downloads.md spark/js/downloads.js spark/site/downloads.html spark/site/js/downloads.js Modified: spark/downloads.md URL: http://svn.apache.org/viewvc/spark/downloads.md?rev=1624639&r1=1624638&r2=1624639&view=diff == --- spark/downloads.md (original) +++ spark/downloads.md Fri Sep 12 20:36:11 2014 @@ -24,10 +24,7 @@ The latest release of Spark is Spark 1.1 2. Chose a package type: - - Note: Spark can be - built from source for many other Hadoop versions. - + 3. Chose a download type: Modified: spark/js/downloads.js URL: http://svn.apache.org/viewvc/spark/js/downloads.js?rev=1624639&r1=1624638&r2=1624639&view=diff == --- spark/js/downloads.js (original) +++ spark/js/downloads.js Fri Sep 12 20:36:11 2014 @@ -7,15 +7,24 @@ function addRelease(version, releaseDate releases[version] = {released: releaseDate, packages: packages, downloadable: downloadable}; } -var sources = ["sources"]; +var sources = {pretty: "Source Code [can build several Hadoop versions]", tag: "sources"}; +var hadoop1 = {pretty: "Pre-built for Hadoop 1.X", tag: "hadoop1"}; +var cdh4 = {pretty: "Pre-built for CDH 4", tag: "cdh4"}; +var hadoop2 = {pretty: "Pre-built for Hadoop 2.2", tag: "hadoop2"}; +var hadoop2p3 = {pretty: "Pre-built for Hadoop 2.3", tag: "hadoop2.3"}; +var hadoop2p4 = {pretty: "Pre-built for Hadoop 2.4", tag: "hadoop2.4"}; +var mapr3 = {pretty: "Pre-built for MapR 3.X", tag: "mapr3"}; +var mapr4 = {pretty: "Pre-built for MapR 4.X", tag: "mapr4"}; + +var sourcePackage = [sources]; // 0.7+ -var packagesV1 = sources.concat(["hadoop1", "cdh4"]); +var packagesV1 = sourcePackage.concat([hadoop1, cdh4]); // 0.8.1+ -var packagesV2 = packagesV1.concat(["hadoop2"]); +var packagesV2 = packagesV1.concat([hadoop2]); // 1.0.1+ -var packagesV3 = packagesV2.concat(["mapr3", "mapr4"]); +var packagesV3 = packagesV2.concat([mapr3, mapr4]); // 1.1.0+ -var packagesV4 = packagesV1.concat(["hadoop2.3", "hadoop2.4", "mapr3", "mapr4"]); +var packagesV4 = packagesV1.concat([hadoop2p3, hadoop2p4, mapr3, mapr4]); addRelease("1.1.0", new Date("9/11/2014"), packagesV4, true); addRelease("1.0.2", new Date("8/5/2014"), packagesV3, true); @@ -87,7 +96,10 @@ function onVersionSelect() { var version = getSelectedValue(versionSelect); var packages = releases[version]["packages"]; for (var idx in packages) { -append(packageSelect, "" + packages[idx] + "") +var option = " $pretty " + .replace(/\$tag/, packages[idx].tag) + .replace(/\$pretty/, packages[idx].pretty); +append(packageSelect, option); } var href = "http://www.apache.org/dist/spark/spark-"; + version + "/"; Modified: spark/site/downloads.html URL: http://svn.apache.org/viewvc/spark/site/downloads.html?rev=1624639&r1=1624638&r2=1624639&view=diff == --- spark/site/downloads.html (original) +++ spark/site/downloads.html Fri Sep 12 20:36:11 2014 @@ -182,10 +182,7 @@ $(document).ready(function() { Chose a package type: - - Note: Spark can be - built from source for many other Hadoop versions. 
- + Chose a download type: Modified: spark/site/js/downloads.js URL: http://svn.apache.org/viewvc/spark/site/js/downloads.js?rev=1624639&r1=1624638&r2=1624639&view=diff == --- spark/site/js/downloads.js (original) +++ spark/site/js/downloads.js Fri Sep 12 20:36:11 2014 @@ -7,15 +7,24 @@ function addRelease(version, releaseDate releases[version] = {released: releaseDate, packages: packages, downloadable: downloadable}; } -var sources = ["sources"]; +var sources = {pretty: "Source Code [can build several Hadoop versions]", tag: "sources"}; +var hadoop1 = {pretty: "Pre-built for Hadoop 1.X", tag: "hadoop1"}; +var cdh4 = {pretty: "Pre-built for CDH 4", tag: "cdh4"}; +var hadoop2 = {pretty: "Pre-built for Hadoop 2.2", tag: "hadoop2"}; +var hadoop2p3 = {pretty: "Pre-built for Hadoop 2.3", tag: "hadoop2.3"}; +var hadoop2p4 = {pretty: "Pre-built for Hadoop 2.4", tag: "hadoop2.4"}; +var mapr3 = {pretty: "Pre-built for MapR 3.X", tag: "mapr3"}; +var mapr4 = {pretty: "Pre-built for MapR 4.X", tag: "mapr4"}; + +var sourcePackage = [sources]; // 0.7+ -var packagesV1 = sources.concat(["hadoop1", "cdh4"]); +var packagesV1 = sourcePackage.concat([hadoop1, cdh4]); // 0.8.1+ -var packagesV2 = packagesV1.concat(["hadoop2"]); +var packagesV2 = packagesV1.concat([hadoop2]); // 1.0.1+ -var packagesV3 = packagesV2.concat(["mapr3", "mapr4"]); +var packagesV3 = packagesV2.concat([mapr3, mapr4]); // 1.1.0+ -var packagesV4 = pa
git commit: [SPARK-3481] [SQL] Eliminate the error log in local Hive comparison test
Repository: spark Updated Branches: refs/heads/branch-1.0 ae6f5545a -> 4f991c9ff [SPARK-3481] [SQL] Eliminate the error log in local Hive comparison test Logically, we should remove the Hive Table/Database first and then reset the Hive configuration, repoint to the new data warehouse directory etc. Otherwise it raised exceptions like "Database doesn't not exists: default" in the local testing. Author: Cheng Hao Closes #2352 from chenghao-intel/test_hive and squashes the following commits: 74fd76b [Cheng Hao] eliminate the error log (cherry picked from commit 8194fc662c08eb445444c207264e22361def54ea) Signed-off-by: Michael Armbrust Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4f991c9f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4f991c9f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4f991c9f Branch: refs/heads/branch-1.0 Commit: 4f991c9fff169557b8d98923064d5ce547fa5569 Parents: ae6f554 Author: Cheng Hao Authored: Fri Sep 12 11:29:30 2014 -0700 Committer: Michael Armbrust Committed: Fri Sep 12 11:30:11 2014 -0700 -- .../scala/org/apache/spark/sql/hive/TestHive.scala | 17 - 1 file changed, 8 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4f991c9f/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala index 9386008..5ddaa09 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala @@ -292,15 +292,6 @@ class TestHiveContext(sc: SparkContext) extends LocalHiveContext(sc) { logger.asInstanceOf[org.apache.log4j.Logger].setLevel(org.apache.log4j.Level.WARN) } - // It is important that we RESET first as broken hooks that might have been set could break - // other sql exec here. - runSqlHive("RESET") - // For some reason, RESET does not reset the following variables... - runSqlHive("set datanucleus.cache.collections=true") - runSqlHive("set datanucleus.cache.collections.lazy=true") - // Lots of tests fail if we do not change the partition whitelist from the default. - runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*") - loadedTables.clear() catalog.client.getAllTables("default").foreach { t => logger.debug(s"Deleting table $t") @@ -326,6 +317,14 @@ class TestHiveContext(sc: SparkContext) extends LocalHiveContext(sc) { FunctionRegistry.unregisterTemporaryUDF(udfName) } + // It is important that we RESET first as broken hooks that might have been set could break + // other sql exec here. + runSqlHive("RESET") + // For some reason, RESET does not reset the following variables... + runSqlHive("set datanucleus.cache.collections=true") + runSqlHive("set datanucleus.cache.collections.lazy=true") + // Lots of tests fail if we do not change the partition whitelist from the default. + runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*") configure() runSqlHive("USE default") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
git commit: [SPARK-3481] [SQL] Eliminate the error log in local Hive comparison test
Repository: spark Updated Branches: refs/heads/branch-1.1 f17b7957a -> 6cbf83c05 [SPARK-3481] [SQL] Eliminate the error log in local Hive comparison test Logically, we should remove the Hive Table/Database first and then reset the Hive configuration, repoint to the new data warehouse directory etc. Otherwise it raised exceptions like "Database doesn't not exists: default" in the local testing. Author: Cheng Hao Closes #2352 from chenghao-intel/test_hive and squashes the following commits: 74fd76b [Cheng Hao] eliminate the error log (cherry picked from commit 8194fc662c08eb445444c207264e22361def54ea) Signed-off-by: Michael Armbrust Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6cbf83c0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6cbf83c0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6cbf83c0 Branch: refs/heads/branch-1.1 Commit: 6cbf83c05c7a073d4df81b59a1663fea38ce65f6 Parents: f17b795 Author: Cheng Hao Authored: Fri Sep 12 11:29:30 2014 -0700 Committer: Michael Armbrust Committed: Fri Sep 12 11:29:44 2014 -0700 -- .../scala/org/apache/spark/sql/hive/TestHive.scala | 17 - 1 file changed, 8 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6cbf83c0/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala index a013f3f..8bb2216 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala @@ -309,15 +309,6 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { log.asInstanceOf[org.apache.log4j.Logger].setLevel(org.apache.log4j.Level.WARN) } - // It is important that we RESET first as broken hooks that might have been set could break - // other sql exec here. - runSqlHive("RESET") - // For some reason, RESET does not reset the following variables... - runSqlHive("set datanucleus.cache.collections=true") - runSqlHive("set datanucleus.cache.collections.lazy=true") - // Lots of tests fail if we do not change the partition whitelist from the default. - runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*") - loadedTables.clear() catalog.client.getAllTables("default").foreach { t => logDebug(s"Deleting table $t") @@ -343,6 +334,14 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { FunctionRegistry.unregisterTemporaryUDF(udfName) } + // It is important that we RESET first as broken hooks that might have been set could break + // other sql exec here. + runSqlHive("RESET") + // For some reason, RESET does not reset the following variables... + runSqlHive("set datanucleus.cache.collections=true") + runSqlHive("set datanucleus.cache.collections.lazy=true") + // Lots of tests fail if we do not change the partition whitelist from the default. + runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*") configure() runSqlHive("USE default") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
git commit: [SPARK-3481] [SQL] Eliminate the error log in local Hive comparison test
Repository: spark Updated Branches: refs/heads/master 533377621 -> 8194fc662 [SPARK-3481] [SQL] Eliminate the error log in local Hive comparison test Logically, we should remove the Hive Table/Database first and then reset the Hive configuration, repoint to the new data warehouse directory etc. Otherwise it raised exceptions like "Database doesn't not exists: default" in the local testing. Author: Cheng Hao Closes #2352 from chenghao-intel/test_hive and squashes the following commits: 74fd76b [Cheng Hao] eliminate the error log Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8194fc66 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8194fc66 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8194fc66 Branch: refs/heads/master Commit: 8194fc662c08eb445444c207264e22361def54ea Parents: 5333776 Author: Cheng Hao Authored: Fri Sep 12 11:29:30 2014 -0700 Committer: Michael Armbrust Committed: Fri Sep 12 11:29:30 2014 -0700 -- .../scala/org/apache/spark/sql/hive/TestHive.scala | 17 - 1 file changed, 8 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8194fc66/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala index 6974f3e..a3bfd3a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala @@ -376,15 +376,6 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { log.asInstanceOf[org.apache.log4j.Logger].setLevel(org.apache.log4j.Level.WARN) } - // It is important that we RESET first as broken hooks that might have been set could break - // other sql exec here. - runSqlHive("RESET") - // For some reason, RESET does not reset the following variables... - runSqlHive("set datanucleus.cache.collections=true") - runSqlHive("set datanucleus.cache.collections.lazy=true") - // Lots of tests fail if we do not change the partition whitelist from the default. - runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*") - loadedTables.clear() catalog.client.getAllTables("default").foreach { t => logDebug(s"Deleting table $t") @@ -410,6 +401,14 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { FunctionRegistry.unregisterTemporaryUDF(udfName) } + // It is important that we RESET first as broken hooks that might have been set could break + // other sql exec here. + runSqlHive("RESET") + // For some reason, RESET does not reset the following variables... + runSqlHive("set datanucleus.cache.collections=true") + runSqlHive("set datanucleus.cache.collections.lazy=true") + // Lots of tests fail if we do not change the partition whitelist from the default. + runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*") configure() runSqlHive("USE default") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
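The SPARK-3481 fix above is purely an ordering change: RESET re-points the Hive configuration (warehouse directory, metastore settings), so running it before the test tables are dropped leaves the cleanup looking for a "default" database that no longer resolves, which is what produced the spurious error log in the comparison tests. A minimal Scala sketch of the corrected ordering follows; runSqlHive, allTestTables and configure are stubbed stand-ins for the real TestHiveContext members, not the actual implementation.

// Illustrative sketch only; the members below are stubs, not TestHiveContext.
object ResetOrderingSketch {
  def runSqlHive(sql: String): Unit = println(s"hive> $sql")   // stub
  def allTestTables: Seq[String] = Seq("src", "srcpart")       // stub: tables loaded for tests
  def configure(): Unit = ()                                   // stub: re-points warehouse/metastore dirs

  def reset(): Unit = {
    // 1. Drop test tables (and temporary UDFs, omitted here) while the old
    //    metastore configuration, and thus the old "default" database, is
    //    still in effect.
    allTestTables.foreach(t => runSqlHive(s"DROP TABLE IF EXISTS $t"))

    // 2. Only afterwards RESET the Hive configuration and re-apply the
    //    settings that RESET does not restore, as in the patch above.
    runSqlHive("RESET")
    runSqlHive("set datanucleus.cache.collections=true")
    runSqlHive("set datanucleus.cache.collections.lazy=true")
    runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*")

    configure()
    runSqlHive("USE default")
  }

  def main(args: Array[String]): Unit = reset()
}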
git commit: Revert "[Spark-3490] Disable SparkUI for tests"
Repository: spark Updated Branches: refs/heads/branch-1.1 e69deb818 -> f17b7957a Revert "[Spark-3490] Disable SparkUI for tests" This reverts commit 2ffc7980c6818eec05e32141c52e335bc71daed9. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f17b7957 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f17b7957 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f17b7957 Branch: refs/heads/branch-1.1 Commit: f17b7957a4283952021d9e4106c5bd9994148128 Parents: e69deb8 Author: Andrew Or Authored: Fri Sep 12 10:40:03 2014 -0700 Committer: Andrew Or Committed: Fri Sep 12 10:40:03 2014 -0700 -- .../scala/org/apache/spark/SparkContext.scala | 12 +- .../cluster/CoarseGrainedSchedulerBackend.scala | 2 +- .../cluster/SimrSchedulerBackend.scala | 6 +- .../cluster/SparkDeploySchedulerBackend.scala | 4 +- .../scala/org/apache/spark/ui/UISuite.scala | 44 +- pom.xml | 1 - project/SparkBuild.scala| 2 +- .../spark/streaming/StreamingContext.scala | 11 +- .../spark/streaming/StreamingSource.scala | 2 +- .../spark/streaming/ui/StreamingTab.scala | 25 +- .../org/apache/spark/streaming/UISuite.scala| 16 +- .../spark/deploy/yarn/ApplicationMaster.scala | 443 --- .../cluster/YarnClientSchedulerBackend.scala| 6 +- 13 files changed, 37 insertions(+), 537 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f17b7957/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index abb6a83..0470fbe 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -220,14 +220,8 @@ class SparkContext(config: SparkConf) extends Logging { new MetadataCleaner(MetadataCleanerType.SPARK_CONTEXT, this.cleanup, conf) // Initialize the Spark UI, registering all associated listeners - private[spark] val ui: Option[SparkUI] = -if (conf.getBoolean("spark.ui.enabled", true)) { - Some(new SparkUI(this)) -} else { - // For tests, do not enable the UI - None -} - ui.foreach(_.bind()) + private[spark] val ui = new SparkUI(this) + ui.bind() /** A default Hadoop Configuration for the Hadoop code (e.g. file systems) that we reuse. */ val hadoopConfiguration: Configuration = { @@ -1014,7 +1008,7 @@ class SparkContext(config: SparkConf) extends Logging { /** Shut down the SparkContext. */ def stop() { postApplicationEnd() -ui.foreach(_.stop()) +ui.stop() // Do this only if not stopped already - best case effort. // prevent NPE if stopped more than once. val dagSchedulerCopy = dagScheduler http://git-wip-us.apache.org/repos/asf/spark/blob/f17b7957/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 04046e2..2a3711a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -292,7 +292,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, actorSystem: A logInfo(s"Add WebUI Filter. 
$filterName, $filterParams, $proxyBase") conf.set("spark.ui.filters", filterName) conf.set(s"spark.$filterName.params", filterParams) - scheduler.sc.ui.foreach { ui => JettyUtils.addFilters(ui.getHandlers, conf) } + JettyUtils.addFilters(scheduler.sc.ui.getHandlers, conf) } } } http://git-wip-us.apache.org/repos/asf/spark/blob/f17b7957/core/src/main/scala/org/apache/spark/scheduler/cluster/SimrSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/SimrSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/SimrSchedulerBackend.scala index b781842..4f7133c 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/SimrSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/SimrSchedulerBackend.scala @@ -17,6 +17,7 @@ package org.apache.spark.scheduler.cluster +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{Path, FileSystem} import org.ap
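Most of the churn in the (truncated) revert diff above comes from SPARK-3490 having made SparkContext.ui an Option[SparkUI], which forced every call site to unwrap it; the revert restores the plain field. A toy contrast of the two call-site shapes, using a placeholder SparkUIStub rather than the real org.apache.spark.ui.SparkUI:

// Placeholder UI type for illustration; not the real org.apache.spark.ui.SparkUI.
class SparkUIStub {
  def bind(): Unit = ()
  def stop(): Unit = ()
}

// Shape introduced by SPARK-3490 (now reverted): the UI may be absent in
// tests, so every caller has to go through the Option.
class ContextWithOptionalUi(uiEnabled: Boolean) {
  val ui: Option[SparkUIStub] = if (uiEnabled) Some(new SparkUIStub) else None
  ui.foreach(_.bind())
  def stop(): Unit = ui.foreach(_.stop())
}

// Shape after the revert: the UI is always created and used directly.
class ContextWithDirectUi {
  val ui = new SparkUIStub
  ui.bind()
  def stop(): Unit = ui.stop()
}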
git commit: [PySpark] Add blank line so that Python RDD.top() docstring renders correctly
Repository: spark Updated Branches: refs/heads/master f116f76bf -> 533377621 [PySpark] Add blank line so that Python RDD.top() docstring renders correctly Author: RJ Nowling Closes #2370 from rnowling/python_rdd_docstrings and squashes the following commits: 5230574 [RJ Nowling] Add blank line so that Python RDD.top() docstring renders correctly Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/53337762 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/53337762 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/53337762 Branch: refs/heads/master Commit: 533377621f1e178e18fa0b79d653a11b66e4e250 Parents: f116f76 Author: RJ Nowling Authored: Fri Sep 12 09:46:21 2014 -0700 Committer: Josh Rosen Committed: Fri Sep 12 09:46:21 2014 -0700 -- python/pyspark/rdd.py | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/53337762/python/pyspark/rdd.py -- diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 5667154..6ad5ab2 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -1060,6 +1060,7 @@ class RDD(object): Get the top N elements from a RDD. Note: It returns the list sorted in descending order. + >>> sc.parallelize([10, 4, 2, 12, 3]).top(1) [12] >>> sc.parallelize([2, 3, 4, 5, 6], 2).top(2) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r1624579 - in /spark: releases/_posts/2014-09-11-spark-release-1-1-0.md site/releases/spark-release-1-1-0.html
Author: pwendell Date: Fri Sep 12 16:04:01 2014 New Revision: 1624579 URL: http://svn.apache.org/r1624579 Log: Adding missing release credit Modified: spark/releases/_posts/2014-09-11-spark-release-1-1-0.md spark/site/releases/spark-release-1-1-0.html Modified: spark/releases/_posts/2014-09-11-spark-release-1-1-0.md URL: http://svn.apache.org/viewvc/spark/releases/_posts/2014-09-11-spark-release-1-1-0.md?rev=1624579&r1=1624578&r2=1624579&view=diff == --- spark/releases/_posts/2014-09-11-spark-release-1-1-0.md (original) +++ spark/releases/_posts/2014-09-11-spark-release-1-1-0.md Fri Sep 12 16:04:01 2014 @@ -63,6 +63,7 @@ Spark 1.1.0 is backwards compatible with * Anant -- Python and doc fixes * Anatoli Fomenko -- MLlib doc fix * Andrew Ash -- doc improvements and bug fixes + * Andrew Xia -- external sorting feature * Andrew Or -- external spilling feature, bug fixes, and optimizations * Andrew Schumacher -- Parquet support in Spark SQL * Ankit Bhardwaj -- bug fix Modified: spark/site/releases/spark-release-1-1-0.html URL: http://svn.apache.org/viewvc/spark/site/releases/spark-release-1-1-0.html?rev=1624579&r1=1624578&r2=1624579&view=diff == --- spark/site/releases/spark-release-1-1-0.html (original) +++ spark/site/releases/spark-release-1-1-0.html Fri Sep 12 16:04:01 2014 @@ -224,6 +224,7 @@ Anant – Python and doc fixes Anatoli Fomenko – MLlib doc fix Andrew Ash – doc improvements and bug fixes + Andrew Xia – external sorting feature Andrew Or – external spilling feature, bug fixes, and optimizations Andrew Schumacher – Parquet support in Spark SQL Ankit Bhardwaj – bug fix - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r1624578 - in /spark: releases/_posts/2014-09-11-spark-release-1-1-0.md site/releases/spark-release-1-1-0.html
Author: pwendell Date: Fri Sep 12 16:03:14 2014 New Revision: 1624578 URL: http://svn.apache.org/r1624578 Log: Adding missing release credit Modified: spark/releases/_posts/2014-09-11-spark-release-1-1-0.md spark/site/releases/spark-release-1-1-0.html Modified: spark/releases/_posts/2014-09-11-spark-release-1-1-0.md URL: http://svn.apache.org/viewvc/spark/releases/_posts/2014-09-11-spark-release-1-1-0.md?rev=1624578&r1=1624577&r2=1624578&view=diff == --- spark/releases/_posts/2014-09-11-spark-release-1-1-0.md (original) +++ spark/releases/_posts/2014-09-11-spark-release-1-1-0.md Fri Sep 12 16:03:14 2014 @@ -141,6 +141,7 @@ Spark 1.1.0 is backwards compatible with * Lu Luorta -- graphX fix * Luogan Kun -- SQL test fix * Ly Lai -- bug fix in Spark SQL + * Madhu Siddalingaiah -- external sorting feature * Manish Amde -- Multiclass support for decision trees * Manuel Laflamme -- streaming fix * Marcelo Vanzin -- fixes and improvements in YARN code Modified: spark/site/releases/spark-release-1-1-0.html URL: http://svn.apache.org/viewvc/spark/site/releases/spark-release-1-1-0.html?rev=1624578&r1=1624577&r2=1624578&view=diff == --- spark/site/releases/spark-release-1-1-0.html (original) +++ spark/site/releases/spark-release-1-1-0.html Fri Sep 12 16:03:14 2014 @@ -302,6 +302,7 @@ Lu Luorta – graphX fix Luogan Kun – SQL test fix Ly Lai – bug fix in Spark SQL + Madhu Siddalingaiah – external sorting feature Manish Amde – Multiclass support for decision trees Manuel Laflamme – streaming fix Marcelo Vanzin – fixes and improvements in YARN code - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
git commit: [SPARK-2558][DOCS] Add --queue example to YARN doc
Repository: spark Updated Branches: refs/heads/master b8634df1f -> f116f76bf [SPARK-2558][DOCS] Add --queue example to YARN doc Put original YARN queue spark-submit arg description in running-on-yarn html table and example command line Author: Mark G. Whitney Closes #2218 from kramimus/2258-yarndoc and squashes the following commits: 4b5d808 [Mark G. Whitney] remove yarn queue config f8cda0d [Mark G. Whitney] [SPARK-2558][DOCS] Add spark.yarn.queue description to YARN doc Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f116f76b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f116f76b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f116f76b Branch: refs/heads/master Commit: f116f76bf1f1610905ca094c8edc53151a78d2f4 Parents: b8634df Author: Mark G. Whitney Authored: Fri Sep 12 08:08:58 2014 -0500 Committer: Thomas Graves Committed: Fri Sep 12 08:08:58 2014 -0500 -- docs/running-on-yarn.md | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f116f76b/docs/running-on-yarn.md -- diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index d8b22f3..212248b 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -155,6 +155,7 @@ For example: --driver-memory 4g \ --executor-memory 2g \ --executor-cores 1 \ +--queue thequeue \ lib/spark-examples*.jar \ 10 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
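The --queue flag added to the example above corresponds to the spark.yarn.queue property mentioned in the commit message, so the same queue can also be selected through Spark configuration. A small, hedged sketch (the app name and queue value are illustrative):

// Hedged illustration of choosing the YARN queue via configuration instead of
// the --queue flag; spark.yarn.queue is the property the commit message refers to.
import org.apache.spark.SparkConf

object QueueConfSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("queue-example")                // illustrative name
      .set("spark.yarn.queue", "thequeue")        // same queue as --queue thequeue above
    println(conf.get("spark.yarn.queue"))
  }
}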
git commit: [SPARK-3160] [SPARK-3494] [mllib] DecisionTree: eliminate pre-allocated nodes, parentImpurities arrays. Memory calc bug fix.
Repository: spark Updated Branches: refs/heads/master 42904b8d0 -> b8634df1f [SPARK-3160] [SPARK-3494] [mllib] DecisionTree: eliminate pre-allocated nodes, parentImpurities arrays. Memory calc bug fix. This PR includes some code simplifications and re-organization which will be helpful for implementing random forests. The main changes are that the nodes and parentImpurities arrays are no longer pre-allocated in the main train() method. Also added 2 bug fixes: * maxMemoryUsage calculation * over-allocation of space for bins in DTStatsAggregator for unordered features. Relation to RFs: * Since RFs will be deeper and will therefore be more likely sparse (not full trees), it could be a cost savings to avoid pre-allocating a full tree. * The associated re-organization also reduces bookkeeping, which will make RFs easier to implement. * The return code doneTraining may be generalized to include cases such as nodes ready for local training. Details: No longer pre-allocate parentImpurities array in main train() method. * parentImpurities values are now stored in individual nodes (in Node.stats.impurity). * These were not really needed. They were used in calculateGainForSplit(), but they can be calculated anyways using parentNodeAgg. No longer using Node.build since tree structure is constructed on-the-fly. * Did not eliminate since it is public (Developer) API. Marked as deprecated. Eliminated pre-allocated nodes array in main train() method. * Nodes are constructed and added to the tree structure as needed during training. * Moved tree construction from main train() method into findBestSplitsPerGroup() since there is no need to keep the (split, gain) array for an entire level of nodes. Only one element of that array is needed at a time, so we do not the array. findBestSplits() now returns 2 items: * rootNode (newly created root node on first iteration, same root node on later iterations) * doneTraining (indicating if all nodes at that level were leafs) Updated DecisionTreeSuite. Notes: * Improved test "Second level node building with vs. without groups" ** generateOrderedLabeledPoints() modified so that it really does require 2 levels of internal nodes. * Related update: Added Node.deepCopy (private[tree]), used for test suite CC: mengxr Author: Joseph K. Bradley Closes #2341 from jkbradley/dt-spark-3160 and squashes the following commits: 07dd1ee [Joseph K. Bradley] Fixed overflow bug with computing maxMemoryUsage in DecisionTree. Also fixed bug with over-allocating space in DTStatsAggregator for unordered features. debe072 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into dt-spark-3160 5c4ac33 [Joseph K. Bradley] Added check in Strategy to make sure minInstancesPerNode >= 1 0dd4d87 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into dt-spark-3160 306120f [Joseph K. Bradley] Fixed typo in DecisionTreeModel.scala doc eaa1dcf [Joseph K. Bradley] Added topNode doc in DecisionTree and scalastyle fix d4d7864 [Joseph K. Bradley] Marked Node.build as deprecated d4dbb99 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into dt-spark-3160 1a8f0ad [Joseph K. Bradley] Eliminated pre-allocated nodes array in main train() method. * Nodes are constructed and added to the tree structure as needed during training. 2ab763b [Joseph K. 
Bradley] Simplifications to DecisionTree code: Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b8634df1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b8634df1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b8634df1 Branch: refs/heads/master Commit: b8634df1f1eb6ce909bec779522c9c9912c7d06a Parents: 42904b8 Author: Joseph K. Bradley Authored: Fri Sep 12 01:37:59 2014 -0700 Committer: Xiangrui Meng Committed: Fri Sep 12 01:37:59 2014 -0700 -- .../apache/spark/mllib/tree/DecisionTree.scala | 191 ++--- .../mllib/tree/configuration/Strategy.scala | 3 + .../mllib/tree/impl/DTStatsAggregator.scala | 11 +- .../mllib/tree/impl/DecisionTreeMetadata.scala | 3 +- .../mllib/tree/model/DecisionTreeModel.scala| 2 +- .../apache/spark/mllib/tree/model/Node.scala| 37 +++ .../spark/mllib/tree/DecisionTreeSuite.scala| 277 ++- 7 files changed, 268 insertions(+), 256 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b8634df1/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala index 9859656..56bb881 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.sca
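The diff above is cut off, but the structural change the commit message describes, dropping the pre-allocated nodes/parentImpurities arrays and having findBestSplits return the root node together with a doneTraining flag, can be sketched independently of the real MLlib code. Everything below (the Node shape, the toy stopping rule, the level loop) is a simplified illustration, not the actual implementation:

// Hypothetical, simplified stand-ins; not the MLlib Node or DecisionTree API.
case class Node(id: Int, var isLeaf: Boolean = false,
                var leftNode: Option[Node] = None,
                var rightNode: Option[Node] = None)

object LevelWiseTrainingSketch {
  // Toy split search: returns the (possibly newly created) root node and a
  // flag saying whether every node at this level ended up a leaf.
  def findBestSplits(level: Int, root: Option[Node]): (Node, Boolean) = {
    val rootNode = root.getOrElse(Node(id = 1))   // created only on the first level
    val doneTraining = level >= 2                 // toy stopping rule
    (rootNode, doneTraining)
  }

  def train(maxDepth: Int): Node = {
    var root: Option[Node] = None                 // no pre-allocated array of nodes
    var level = 0
    var done = false
    while (level < maxDepth && !done) {
      // The root is created on the first iteration and threaded through the
      // rest; the real code grows children off it as splits are found.
      val (rootNode, doneTraining) = findBestSplits(level, root)
      root = Some(rootNode)
      done = doneTraining
      level += 1
    }
    root.getOrElse(Node(id = 1, isLeaf = true))
  }

  def main(args: Array[String]): Unit = println(train(maxDepth = 5))
}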