git commit: Proper indent for the previous commit.
Repository: spark Updated Branches: refs/heads/master feaa3706f -> b4dded40f Proper indent for the previous commit. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b4dded40 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b4dded40 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b4dded40 Branch: refs/heads/master Commit: b4dded40fbecb485f1ddfd8316b44d42a1554d64 Parents: feaa370 Author: Reynold Xin Authored: Fri Sep 12 22:51:25 2014 -0700 Committer: Reynold Xin Committed: Fri Sep 12 22:51:25 2014 -0700 -- .../main/scala/org/apache/spark/api/java/JavaSparkContext.scala| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b4dded40/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala index 23f7e6b..791d853 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala @@ -42,7 +42,7 @@ import org.apache.spark.rdd.{EmptyRDD, HadoopRDD, NewHadoopRDD, RDD} * [[org.apache.spark.api.java.JavaRDD]]s and works with Java collections instead of Scala ones. */ class JavaSparkContext(val sc: SparkContext) -extends JavaSparkContextVarargsWorkaround with Closeable { + extends JavaSparkContextVarargsWorkaround with Closeable { /** * Create a JavaSparkContext that loads settings from system properties (for instance, when - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
git commit: SPARK-3470 [CORE] [STREAMING] Add Closeable / close() to Java context objects
Repository: spark Updated Branches: refs/heads/master e11eeb71f -> feaa3706f SPARK-3470 [CORE] [STREAMING] Add Closeable / close() to Java context objects ... that expose a stop() lifecycle method. This doesn't add `AutoCloseable`, which is Java 7+ only. But it should be possible to use try-with-resources on a `Closeable` in Java 7, as long as the `close()` does not throw a checked exception, and these don't. Q.E.D. Author: Sean Owen Closes #2346 from srowen/SPARK-3470 and squashes the following commits: 612c21d [Sean Owen] Add Closeable / close() to Java context objects that expose a stop() lifecycle method Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/feaa3706 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/feaa3706 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/feaa3706 Branch: refs/heads/master Commit: feaa3706f17e44efcdac9f0a543a5b91232771ce Parents: e11eeb7 Author: Sean Owen Authored: Fri Sep 12 22:50:37 2014 -0700 Committer: Reynold Xin Committed: Fri Sep 12 22:50:37 2014 -0700 -- .../scala/org/apache/spark/api/java/JavaSparkContext.scala| 7 ++- .../spark/streaming/api/java/JavaStreamingContext.scala | 7 +-- 2 files changed, 11 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/feaa3706/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala index 8e178bc..23f7e6b 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala @@ -17,6 +17,7 @@ package org.apache.spark.api.java +import java.io.Closeable import java.util import java.util.{Map => JMap} @@ -40,7 +41,9 @@ import org.apache.spark.rdd.{EmptyRDD, HadoopRDD, NewHadoopRDD, RDD} * A Java-friendly version of [[org.apache.spark.SparkContext]] that returns * [[org.apache.spark.api.java.JavaRDD]]s and works with Java collections instead of Scala ones. */ -class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWorkaround { +class JavaSparkContext(val sc: SparkContext) +extends JavaSparkContextVarargsWorkaround with Closeable { + /** * Create a JavaSparkContext that loads settings from system properties (for instance, when * launching with ./bin/spark-submit). 
@@ -534,6 +537,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork sc.stop() } + override def close(): Unit = stop() + /** * Get Spark's home location from either a value set through the constructor, * or the spark.home Java property, or the SPARK_HOME environment variable http://git-wip-us.apache.org/repos/asf/spark/blob/feaa3706/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala index 18605ca..9dc26dc 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala @@ -21,7 +21,7 @@ package org.apache.spark.streaming.api.java import scala.collection.JavaConversions._ import scala.reflect.ClassTag -import java.io.InputStream +import java.io.{Closeable, InputStream} import java.util.{List => JList, Map => JMap} import akka.actor.{Props, SupervisorStrategy} @@ -49,7 +49,7 @@ import org.apache.spark.streaming.receiver.Receiver * respectively. `context.awaitTransformation()` allows the current thread to wait for the * termination of a context by `stop()` or by an exception. */ -class JavaStreamingContext(val ssc: StreamingContext) { +class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { /** * Create a StreamingContext. @@ -540,6 +540,9 @@ class JavaStreamingContext(val ssc: StreamingContext) { def stop(stopSparkContext: Boolean, stopGracefully: Boolean) = { ssc.stop(stopSparkContext, stopGracefully) } + + override def close(): Unit = stop() + } /** - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
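For context, the new `close()` on both context classes simply delegates to `stop()`, so either context can be managed with any Closeable-based idiom (including Java 7 try-with-resources). A minimal Scala sketch of the intended usage, where the app name and master are placeholders:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext

// Placeholder configuration; any valid app name/master works.
val jsc = new JavaSparkContext(new SparkConf().setAppName("demo").setMaster("local[*]"))
try {
  // ... run jobs ...
} finally {
  jsc.close()  // equivalent to jsc.stop(), per the override added in this commit
}
```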
git commit: [SQL][Docs] Update SQL programming guide to show the correct default value of containsNull in an ArrayType
Repository: spark Updated Branches: refs/heads/master 2584ea5b2 -> e11eeb71f [SQL][Docs] Update SQL programming guide to show the correct default value of containsNull in an ArrayType After #1889, the default value of `containsNull` in an `ArrayType` is `true`. Author: Yin Huai Closes #2374 from yhuai/containsNull and squashes the following commits: dc609a3 [Yin Huai] Update the SQL programming guide to show the correct default value of containsNull in an ArrayType (the default value is true instead of false). Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e11eeb71 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e11eeb71 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e11eeb71 Branch: refs/heads/master Commit: e11eeb71fa3a5fe7ddacb94d5b93b173d4d901a8 Parents: 2584ea5 Author: Yin Huai Authored: Fri Sep 12 21:58:02 2014 -0700 Committer: Reynold Xin Committed: Fri Sep 12 21:58:02 2014 -0700 -- docs/sql-programming-guide.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e11eeb71/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index d83efa4..3159d52 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1232,7 +1232,7 @@ import org.apache.spark.sql._ scala.collection.Seq ArrayType(elementType, [containsNull]) - Note: The default value of containsNull is false. + Note: The default value of containsNull is true. @@ -1358,7 +1358,7 @@ please use factory methods provided in java.util.List DataType.createArrayType(elementType) - Note: The value of containsNull will be false + Note: The value of containsNull will be true DataType.createArrayType(elementType, containsNull). @@ -1505,7 +1505,7 @@ from pyspark.sql import * list, tuple, or array ArrayType(elementType, [containsNull]) - Note: The default value of containsNull is False. + Note: The default value of containsNull is True. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
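A quick Scala illustration of the documented default, using the same `import org.apache.spark.sql._` the guide assumes:

```scala
import org.apache.spark.sql._

// Omitting containsNull now means array elements may be null.
val withNulls    = ArrayType(IntegerType)         // containsNull = true (the new documented default)
val withoutNulls = ArrayType(IntegerType, false)  // element nulls explicitly disallowed
```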
git commit: [SPARK-3469] Make sure all TaskCompletionListener are called even with failures
Repository: spark Updated Branches: refs/heads/master 6d887db78 -> 2584ea5b2 [SPARK-3469] Make sure all TaskCompletionListener are called even with failures This is necessary because we rely on this callback interface to clean resources up. The old behavior would lead to resource leaks. Note that this also changes the fault semantics of TaskCompletionListener. Previously failures in TaskCompletionListeners would result in the task being reported immediately. With this change, we report the exception at the end, and the reported exception is a TaskCompletionListenerException that contains all the exception messages. Author: Reynold Xin Closes #2343 from rxin/taskcontext-callback and squashes the following commits: a3845b2 [Reynold Xin] Mark TaskCompletionListenerException as private[spark]. ac5baea [Reynold Xin] Removed obsolete comment. aa68ea4 [Reynold Xin] Throw an exception if task completion callback fails. 29b6162 [Reynold Xin] oops compilation failed. 1cb444d [Reynold Xin] [SPARK-3469] Call all TaskCompletionListeners even if some fail. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2584ea5b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2584ea5b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2584ea5b Branch: refs/heads/master Commit: 2584ea5b23b1c5a4df9549b94bfc9b8e0900532e Parents: 6d887db Author: Reynold Xin Authored: Fri Sep 12 21:55:39 2014 -0700 Committer: Reynold Xin Committed: Fri Sep 12 21:55:39 2014 -0700 -- .../scala/org/apache/spark/TaskContext.scala| 18 +-- .../util/TaskCompletionListenerException.scala | 34 .../spark/scheduler/TaskContextSuite.scala | 22 +++-- 3 files changed, 69 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2584ea5b/core/src/main/scala/org/apache/spark/TaskContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/TaskContext.scala b/core/src/main/scala/org/apache/spark/TaskContext.scala index 2b99b8a..51b3e4d 100644 --- a/core/src/main/scala/org/apache/spark/TaskContext.scala +++ b/core/src/main/scala/org/apache/spark/TaskContext.scala @@ -21,7 +21,7 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.annotation.DeveloperApi import org.apache.spark.executor.TaskMetrics -import org.apache.spark.util.TaskCompletionListener +import org.apache.spark.util.{TaskCompletionListenerException, TaskCompletionListener} /** @@ -41,7 +41,7 @@ class TaskContext( val attemptId: Long, val runningLocally: Boolean = false, private[spark] val taskMetrics: TaskMetrics = TaskMetrics.empty) - extends Serializable { + extends Serializable with Logging { @deprecated("use partitionId", "0.8.1") def splitId = partitionId @@ -103,8 +103,20 @@ class TaskContext( /** Marks the task as completed and triggers the listeners. */ private[spark] def markTaskCompleted(): Unit = { completed = true +val errorMsgs = new ArrayBuffer[String](2) // Process complete callbacks in the reverse order of registration -onCompleteCallbacks.reverse.foreach { _.onTaskCompletion(this) } +onCompleteCallbacks.reverse.foreach { listener => + try { +listener.onTaskCompletion(this) + } catch { +case e: Throwable => + errorMsgs += e.getMessage + logError("Error in TaskCompletionListener", e) + } +} +if (errorMsgs.nonEmpty) { + throw new TaskCompletionListenerException(errorMsgs) +} } /** Marks the task for interruption, i.e. cancellation. 
*/ http://git-wip-us.apache.org/repos/asf/spark/blob/2584ea5b/core/src/main/scala/org/apache/spark/util/TaskCompletionListenerException.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/TaskCompletionListenerException.scala b/core/src/main/scala/org/apache/spark/util/TaskCompletionListenerException.scala new file mode 100644 index 000..f64e069 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/util/TaskCompletionListenerException.scala @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is d
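To make the new fault semantics concrete, here is a sketch of registering a cleanup callback; it assumes the `addTaskCompletionListener` registration method that accompanies `TaskCompletionListener` on `TaskContext` in this release:

```scala
import org.apache.spark.TaskContext
import org.apache.spark.util.TaskCompletionListener

// Somewhere inside a task, with `context` being the running TaskContext.
def registerCleanup(context: TaskContext): Unit = {
  context.addTaskCompletionListener(new TaskCompletionListener {
    override def onTaskCompletion(ctx: TaskContext): Unit = {
      // release buffers, close readers, delete spill files, etc.
    }
  })
}
```

With this change, a listener that throws no longer prevents the remaining listeners from running; the error messages are collected and rethrown at the end as a single TaskCompletionListenerException.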
git commit: [SPARK-3515][SQL] Moves test suite setup code to beforeAll rather than in constructor
Repository: spark Updated Branches: refs/heads/branch-1.1 9c06c7230 -> 44e534eb2 [SPARK-3515][SQL] Moves test suite setup code to beforeAll rather than in constructor Please refer to the JIRA ticket for details. **NOTE** We should check all test suites that do similar initialization-like side effects in their constructors. This PR only fixes `ParquetMetastoreSuite` because it breaks our Jenkins Maven build. Author: Cheng Lian Closes #2375 from liancheng/say-no-to-constructor and squashes the following commits: 0ceb75b [Cheng Lian] Moves test suite setup code to beforeAll rather than in constructor (cherry picked from commit 6d887db7891be643f0131b136e82191b5f6eb407) Signed-off-by: Michael Armbrust Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/44e534eb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/44e534eb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/44e534eb Branch: refs/heads/branch-1.1 Commit: 44e534eb286381030ae068ca89573ff84fb2a579 Parents: 9c06c72 Author: Cheng Lian Authored: Fri Sep 12 20:14:09 2014 -0700 Committer: Michael Armbrust Committed: Fri Sep 12 20:14:48 2014 -0700 -- .../sql/parquet/ParquetMetastoreSuite.scala | 53 +--- 1 file changed, 24 insertions(+), 29 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/44e534eb/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala index 0723be7..e380280 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala @@ -20,14 +20,10 @@ package org.apache.spark.sql.parquet import java.io.File -import org.apache.spark.sql.hive.execution.HiveTableScan import org.scalatest.BeforeAndAfterAll -import scala.reflect.ClassTag - -import org.apache.spark.sql.{SQLConf, QueryTest} -import org.apache.spark.sql.execution.{BroadcastHashJoin, ShuffledHashJoin} -import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.hive.execution.HiveTableScan import org.apache.spark.sql.hive.test.TestHive._ case class ParquetData(intField: Int, stringField: String) @@ -36,27 +32,19 @@ case class ParquetData(intField: Int, stringField: String) * Tests for our SerDe -> Native parquet scan conversion. 
*/ class ParquetMetastoreSuite extends QueryTest with BeforeAndAfterAll { - override def beforeAll(): Unit = { -setConf("spark.sql.hive.convertMetastoreParquet", "true") - } - - override def afterAll(): Unit = { -setConf("spark.sql.hive.convertMetastoreParquet", "false") - } - - val partitionedTableDir = File.createTempFile("parquettests", "sparksql") - partitionedTableDir.delete() - partitionedTableDir.mkdir() - - (1 to 10).foreach { p => -val partDir = new File(partitionedTableDir, s"p=$p") -sparkContext.makeRDD(1 to 10) - .map(i => ParquetData(i, s"part-$p")) - .saveAsParquetFile(partDir.getCanonicalPath) - } - - sql(s""" +val partitionedTableDir = File.createTempFile("parquettests", "sparksql") +partitionedTableDir.delete() +partitionedTableDir.mkdir() + +(1 to 10).foreach { p => + val partDir = new File(partitionedTableDir, s"p=$p") + sparkContext.makeRDD(1 to 10) +.map(i => ParquetData(i, s"part-$p")) +.saveAsParquetFile(partDir.getCanonicalPath) +} + +sql(s""" create external table partitioned_parquet ( intField INT, @@ -70,7 +58,7 @@ class ParquetMetastoreSuite extends QueryTest with BeforeAndAfterAll { location '${partitionedTableDir.getCanonicalPath}' """) - sql(s""" +sql(s""" create external table normal_parquet ( intField INT, @@ -83,8 +71,15 @@ class ParquetMetastoreSuite extends QueryTest with BeforeAndAfterAll { location '${new File(partitionedTableDir, "p=1").getCanonicalPath}' """) - (1 to 10).foreach { p => -sql(s"ALTER TABLE partitioned_parquet ADD PARTITION (p=$p)") +(1 to 10).foreach { p => + sql(s"ALTER TABLE partitioned_parquet ADD PARTITION (p=$p)") +} + +setConf("spark.sql.hive.convertMetastoreParquet", "true") + } + + override def afterAll(): Unit = { +setConf("spark.sql.hive.convertMetastoreParquet", "false") } test("project the partitioning column") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@s
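The general pattern this commit enforces, as a minimal ScalaTest sketch (the suite name and helper comments are illustrative only):

```scala
import org.scalatest.{BeforeAndAfterAll, FunSuite}

// Side-effecting setup belongs in beforeAll/afterAll rather than the constructor
// body, so merely instantiating the suite (as build-tool test discovery may do)
// performs no work and cannot fail.
class ExampleSuite extends FunSuite with BeforeAndAfterAll {
  override def beforeAll(): Unit = {
    // create temp directories, register test tables, set configuration ...
  }

  override def afterAll(): Unit = {
    // restore configuration, delete temporary data ...
  }

  test("example") {
    // assertions against the data prepared in beforeAll
  }
}
```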
git commit: [SPARK-3515][SQL] Moves test suite setup code to beforeAll rather than in constructor
Repository: spark Updated Branches: refs/heads/master 885d1621b -> 6d887db78 [SPARK-3515][SQL] Moves test suite setup code to beforeAll rather than in constructor Please refer to the JIRA ticket for details. **NOTE** We should check all test suites that do similar initialization-like side effects in their constructors. This PR only fixes `ParquetMetastoreSuite` because it breaks our Jenkins Maven build. Author: Cheng Lian Closes #2375 from liancheng/say-no-to-constructor and squashes the following commits: 0ceb75b [Cheng Lian] Moves test suite setup code to beforeAll rather than in constructor Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6d887db7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6d887db7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6d887db7 Branch: refs/heads/master Commit: 6d887db7891be643f0131b136e82191b5f6eb407 Parents: 885d162 Author: Cheng Lian Authored: Fri Sep 12 20:14:09 2014 -0700 Committer: Michael Armbrust Committed: Fri Sep 12 20:14:09 2014 -0700 -- .../sql/parquet/ParquetMetastoreSuite.scala | 53 +--- 1 file changed, 24 insertions(+), 29 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6d887db7/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala index 0723be7..e380280 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala @@ -20,14 +20,10 @@ package org.apache.spark.sql.parquet import java.io.File -import org.apache.spark.sql.hive.execution.HiveTableScan import org.scalatest.BeforeAndAfterAll -import scala.reflect.ClassTag - -import org.apache.spark.sql.{SQLConf, QueryTest} -import org.apache.spark.sql.execution.{BroadcastHashJoin, ShuffledHashJoin} -import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.hive.execution.HiveTableScan import org.apache.spark.sql.hive.test.TestHive._ case class ParquetData(intField: Int, stringField: String) @@ -36,27 +32,19 @@ case class ParquetData(intField: Int, stringField: String) * Tests for our SerDe -> Native parquet scan conversion. 
*/ class ParquetMetastoreSuite extends QueryTest with BeforeAndAfterAll { - override def beforeAll(): Unit = { -setConf("spark.sql.hive.convertMetastoreParquet", "true") - } - - override def afterAll(): Unit = { -setConf("spark.sql.hive.convertMetastoreParquet", "false") - } - - val partitionedTableDir = File.createTempFile("parquettests", "sparksql") - partitionedTableDir.delete() - partitionedTableDir.mkdir() - - (1 to 10).foreach { p => -val partDir = new File(partitionedTableDir, s"p=$p") -sparkContext.makeRDD(1 to 10) - .map(i => ParquetData(i, s"part-$p")) - .saveAsParquetFile(partDir.getCanonicalPath) - } - - sql(s""" +val partitionedTableDir = File.createTempFile("parquettests", "sparksql") +partitionedTableDir.delete() +partitionedTableDir.mkdir() + +(1 to 10).foreach { p => + val partDir = new File(partitionedTableDir, s"p=$p") + sparkContext.makeRDD(1 to 10) +.map(i => ParquetData(i, s"part-$p")) +.saveAsParquetFile(partDir.getCanonicalPath) +} + +sql(s""" create external table partitioned_parquet ( intField INT, @@ -70,7 +58,7 @@ class ParquetMetastoreSuite extends QueryTest with BeforeAndAfterAll { location '${partitionedTableDir.getCanonicalPath}' """) - sql(s""" +sql(s""" create external table normal_parquet ( intField INT, @@ -83,8 +71,15 @@ class ParquetMetastoreSuite extends QueryTest with BeforeAndAfterAll { location '${new File(partitionedTableDir, "p=1").getCanonicalPath}' """) - (1 to 10).foreach { p => -sql(s"ALTER TABLE partitioned_parquet ADD PARTITION (p=$p)") +(1 to 10).foreach { p => + sql(s"ALTER TABLE partitioned_parquet ADD PARTITION (p=$p)") +} + +setConf("spark.sql.hive.convertMetastoreParquet", "true") + } + + override def afterAll(): Unit = { +setConf("spark.sql.hive.convertMetastoreParquet", "false") } test("project the partitioning column") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
git commit: [SPARK-3500] [SQL] use JavaSchemaRDD as SchemaRDD._jschema_rdd
Repository: spark Updated Branches: refs/heads/branch-1.1 6cbf83c05 -> 9c06c7230 [SPARK-3500] [SQL] use JavaSchemaRDD as SchemaRDD._jschema_rdd Currently, SchemaRDD._jschema_rdd is SchemaRDD, the Scala API (coalesce(), repartition()) can not been called in Python easily, there is no way to specify the implicit parameter `ord`. The _jrdd is an JavaRDD, so _jschema_rdd should also be JavaSchemaRDD. In this patch, change _schema_rdd to JavaSchemaRDD, also added an assert for it. If some methods are missing from JavaSchemaRDD, then it's called by _schema_rdd.baseSchemaRDD().xxx(). BTW, Do we need JavaSQLContext? Author: Davies Liu Closes #2369 from davies/fix_schemardd and squashes the following commits: abee159 [Davies Liu] use JavaSchemaRDD as SchemaRDD._jschema_rdd (cherry picked from commit 885d1621bc06bc1f009c9707c3452eac26baf828) Signed-off-by: Josh Rosen Conflicts: python/pyspark/tests.py Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9c06c723 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9c06c723 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9c06c723 Branch: refs/heads/branch-1.1 Commit: 9c06c723018d4ef96ff31eb947226a6273ed8080 Parents: 6cbf83c Author: Davies Liu Authored: Fri Sep 12 19:05:39 2014 -0700 Committer: Josh Rosen Committed: Fri Sep 12 19:28:45 2014 -0700 -- python/pyspark/sql.py | 38 ++ python/pyspark/tests.py | 37 + 2 files changed, 55 insertions(+), 20 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9c06c723/python/pyspark/sql.py -- diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 0ff6a54..07b39c9 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -1121,7 +1121,7 @@ class SQLContext: batched = isinstance(rdd._jrdd_deserializer, BatchedSerializer) jrdd = self._pythonToJava(rdd._jrdd, batched) srdd = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), str(schema)) -return SchemaRDD(srdd, self) +return SchemaRDD(srdd.toJavaSchemaRDD(), self) def registerRDDAsTable(self, rdd, tableName): """Registers the given RDD as a temporary table in the catalog. @@ -1133,8 +1133,8 @@ class SQLContext: >>> sqlCtx.registerRDDAsTable(srdd, "table1") """ if (rdd.__class__ is SchemaRDD): -jschema_rdd = rdd._jschema_rdd -self._ssql_ctx.registerRDDAsTable(jschema_rdd, tableName) +srdd = rdd._jschema_rdd.baseSchemaRDD() +self._ssql_ctx.registerRDDAsTable(srdd, tableName) else: raise ValueError("Can only register SchemaRDD as table") @@ -1150,7 +1150,7 @@ class SQLContext: >>> sorted(srdd.collect()) == sorted(srdd2.collect()) True """ -jschema_rdd = self._ssql_ctx.parquetFile(path) +jschema_rdd = self._ssql_ctx.parquetFile(path).toJavaSchemaRDD() return SchemaRDD(jschema_rdd, self) def jsonFile(self, path, schema=None): @@ -1206,11 +1206,11 @@ class SQLContext: [Row(f1=u'row1', f2=None, f3=None)...Row(f1=u'row3', f2=[], f3=None)] """ if schema is None: -jschema_rdd = self._ssql_ctx.jsonFile(path) +srdd = self._ssql_ctx.jsonFile(path) else: scala_datatype = self._ssql_ctx.parseDataType(str(schema)) -jschema_rdd = self._ssql_ctx.jsonFile(path, scala_datatype) -return SchemaRDD(jschema_rdd, self) +srdd = self._ssql_ctx.jsonFile(path, scala_datatype) +return SchemaRDD(srdd.toJavaSchemaRDD(), self) def jsonRDD(self, rdd, schema=None): """Loads an RDD storing one JSON object per string as a L{SchemaRDD}. 
@@ -1274,11 +1274,11 @@ class SQLContext: keyed._bypass_serializer = True jrdd = keyed._jrdd.map(self._jvm.BytesToString()) if schema is None: -jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd()) +srdd = self._ssql_ctx.jsonRDD(jrdd.rdd()) else: scala_datatype = self._ssql_ctx.parseDataType(str(schema)) -jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype) -return SchemaRDD(jschema_rdd, self) +srdd = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype) +return SchemaRDD(srdd.toJavaSchemaRDD(), self) def sql(self, sqlQuery): """Return a L{SchemaRDD} representing the result of the given query. @@ -1289,7 +1289,7 @@ class SQLContext: >>> srdd2.collect() [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')] """ -return SchemaRDD(self._ssql_ctx.sql(sqlQuer
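On the JVM side, the patch relies on the existing two-way conversion between the Scala and Java wrappers; PySpark now keeps the Java side in `_jschema_rdd` and unwraps it only when a Scala-only method is needed. A sketch of the two conversions the diff uses:

```scala
import org.apache.spark.sql.SchemaRDD
import org.apache.spark.sql.api.java.JavaSchemaRDD

// What _jschema_rdd now holds on the Python side:
def toJava(srdd: SchemaRDD): JavaSchemaRDD = srdd.toJavaSchemaRDD
// Used when a Scala-side API (e.g. registerRDDAsTable) is required:
def toScala(jsrdd: JavaSchemaRDD): SchemaRDD = jsrdd.baseSchemaRDD
```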
git commit: [SPARK-3500] [SQL] use JavaSchemaRDD as SchemaRDD._jschema_rdd
Repository: spark Updated Branches: refs/heads/master 71af030b4 -> 885d1621b [SPARK-3500] [SQL] use JavaSchemaRDD as SchemaRDD._jschema_rdd Currently, SchemaRDD._jschema_rdd is SchemaRDD, the Scala API (coalesce(), repartition()) can not been called in Python easily, there is no way to specify the implicit parameter `ord`. The _jrdd is an JavaRDD, so _jschema_rdd should also be JavaSchemaRDD. In this patch, change _schema_rdd to JavaSchemaRDD, also added an assert for it. If some methods are missing from JavaSchemaRDD, then it's called by _schema_rdd.baseSchemaRDD().xxx(). BTW, Do we need JavaSQLContext? Author: Davies Liu Closes #2369 from davies/fix_schemardd and squashes the following commits: abee159 [Davies Liu] use JavaSchemaRDD as SchemaRDD._jschema_rdd Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/885d1621 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/885d1621 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/885d1621 Branch: refs/heads/master Commit: 885d1621bc06bc1f009c9707c3452eac26baf828 Parents: 71af030 Author: Davies Liu Authored: Fri Sep 12 19:05:39 2014 -0700 Committer: Josh Rosen Committed: Fri Sep 12 19:05:39 2014 -0700 -- python/pyspark/sql.py | 38 ++ python/pyspark/tests.py | 28 2 files changed, 46 insertions(+), 20 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/885d1621/python/pyspark/sql.py -- diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 53eea6d..fc9310f 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -1122,7 +1122,7 @@ class SQLContext(object): batched = isinstance(rdd._jrdd_deserializer, BatchedSerializer) jrdd = self._pythonToJava(rdd._jrdd, batched) srdd = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), str(schema)) -return SchemaRDD(srdd, self) +return SchemaRDD(srdd.toJavaSchemaRDD(), self) def registerRDDAsTable(self, rdd, tableName): """Registers the given RDD as a temporary table in the catalog. @@ -1134,8 +1134,8 @@ class SQLContext(object): >>> sqlCtx.registerRDDAsTable(srdd, "table1") """ if (rdd.__class__ is SchemaRDD): -jschema_rdd = rdd._jschema_rdd -self._ssql_ctx.registerRDDAsTable(jschema_rdd, tableName) +srdd = rdd._jschema_rdd.baseSchemaRDD() +self._ssql_ctx.registerRDDAsTable(srdd, tableName) else: raise ValueError("Can only register SchemaRDD as table") @@ -1151,7 +1151,7 @@ class SQLContext(object): >>> sorted(srdd.collect()) == sorted(srdd2.collect()) True """ -jschema_rdd = self._ssql_ctx.parquetFile(path) +jschema_rdd = self._ssql_ctx.parquetFile(path).toJavaSchemaRDD() return SchemaRDD(jschema_rdd, self) def jsonFile(self, path, schema=None): @@ -1207,11 +1207,11 @@ class SQLContext(object): [Row(f1=u'row1', f2=None, f3=None)...Row(f1=u'row3', f2=[], f3=None)] """ if schema is None: -jschema_rdd = self._ssql_ctx.jsonFile(path) +srdd = self._ssql_ctx.jsonFile(path) else: scala_datatype = self._ssql_ctx.parseDataType(str(schema)) -jschema_rdd = self._ssql_ctx.jsonFile(path, scala_datatype) -return SchemaRDD(jschema_rdd, self) +srdd = self._ssql_ctx.jsonFile(path, scala_datatype) +return SchemaRDD(srdd.toJavaSchemaRDD(), self) def jsonRDD(self, rdd, schema=None): """Loads an RDD storing one JSON object per string as a L{SchemaRDD}. 
@@ -1275,11 +1275,11 @@ class SQLContext(object): keyed._bypass_serializer = True jrdd = keyed._jrdd.map(self._jvm.BytesToString()) if schema is None: -jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd()) +srdd = self._ssql_ctx.jsonRDD(jrdd.rdd()) else: scala_datatype = self._ssql_ctx.parseDataType(str(schema)) -jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype) -return SchemaRDD(jschema_rdd, self) +srdd = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype) +return SchemaRDD(srdd.toJavaSchemaRDD(), self) def sql(self, sqlQuery): """Return a L{SchemaRDD} representing the result of the given query. @@ -1290,7 +1290,7 @@ class SQLContext(object): >>> srdd2.collect() [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')] """ -return SchemaRDD(self._ssql_ctx.sql(sqlQuery), self) +return SchemaRDD(self._ssql_ctx.sql(sqlQuery).toJavaSchemaRDD(), self) def table(sel
git commit: [SPARK-3094] [PySpark] compatitable with PyPy
Repository: spark Updated Branches: refs/heads/master 25311c2c5 -> 71af030b4 [SPARK-3094] [PySpark] compatitable with PyPy After this patch, we can run PySpark in PyPy (testing with PyPy 2.3.1 in Mac 10.9), for example: ``` PYSPARK_PYTHON=pypy ./bin/spark-submit wordcount.py ``` The performance speed up will depend on work load (from 20% to 3000%). Here are some benchmarks: Job | CPython 2.7 | PyPy 2.3.1 | Speed up --- | | - | --- Word Count | 41s | 15s | 2.7x Sort | 46s | 44s | 1.05x Stats | 174s | 3.6s | 48x Here is the code used for benchmark: ```python rdd = sc.textFile("text") def wordcount(): rdd.flatMap(lambda x:x.split('/'))\ .map(lambda x:(x,1)).reduceByKey(lambda x,y:x+y).collectAsMap() def sort(): rdd.sortBy(lambda x:x, 1).count() def stats(): sc.parallelize(range(1024), 20).flatMap(lambda x: xrange(5024)).stats() ``` Author: Davies Liu Closes #2144 from davies/pypy and squashes the following commits: 9aed6c5 [Davies Liu] use protocol 2 in CloudPickle 4bc1f04 [Davies Liu] refactor b20ab3a [Davies Liu] pickle sys.stdout and stderr in portable way 3ca2351 [Davies Liu] Merge branch 'master' into pypy fae8b19 [Davies Liu] improve attrgetter, add tests 591f830 [Davies Liu] try to run tests with PyPy in run-tests c8d62ba [Davies Liu] cleanup f651fd0 [Davies Liu] fix tests using array with PyPy 1b98fb3 [Davies Liu] serialize itemgetter/attrgetter in portable ways 3c1dbfe [Davies Liu] Merge branch 'master' into pypy 42fb5fa [Davies Liu] Merge branch 'master' into pypy cb2d724 [Davies Liu] fix tests 9986692 [Davies Liu] Merge branch 'master' into pypy 25b4ca7 [Davies Liu] support PyPy Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71af030b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71af030b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71af030b Branch: refs/heads/master Commit: 71af030b46a89aaa9a87f18f56b9e1f1cd8ce2e7 Parents: 25311c2 Author: Davies Liu Authored: Fri Sep 12 18:42:50 2014 -0700 Committer: Josh Rosen Committed: Fri Sep 12 18:42:50 2014 -0700 -- python/pyspark/cloudpickle.py | 168 +++-- python/pyspark/daemon.py | 6 +- python/pyspark/serializers.py | 10 ++- python/pyspark/tests.py | 85 +-- python/run-tests | 21 + 5 files changed, 172 insertions(+), 118 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/71af030b/python/pyspark/cloudpickle.py -- diff --git a/python/pyspark/cloudpickle.py b/python/pyspark/cloudpickle.py index 80e51d1..32dda38 100644 --- a/python/pyspark/cloudpickle.py +++ b/python/pyspark/cloudpickle.py @@ -52,35 +52,19 @@ from functools import partial import itertools from copy_reg import _extension_registry, _inverted_registry, _extension_cache import new -import dis import traceback +import platform -#relevant opcodes -STORE_GLOBAL = chr(dis.opname.index('STORE_GLOBAL')) -DELETE_GLOBAL = chr(dis.opname.index('DELETE_GLOBAL')) -LOAD_GLOBAL = chr(dis.opname.index('LOAD_GLOBAL')) -GLOBAL_OPS = [STORE_GLOBAL, DELETE_GLOBAL, LOAD_GLOBAL] +PyImp = platform.python_implementation() -HAVE_ARGUMENT = chr(dis.HAVE_ARGUMENT) -EXTENDED_ARG = chr(dis.EXTENDED_ARG) import logging cloudLog = logging.getLogger("Cloud.Transport") -try: -import ctypes -except (MemoryError, ImportError): -logging.warning('Exception raised on importing ctypes. Likely python bug.. 
some functionality will be disabled', exc_info = True) -ctypes = None -PyObject_HEAD = None -else: - -# for reading internal structures -PyObject_HEAD = [ -('ob_refcnt', ctypes.c_size_t), -('ob_type', ctypes.c_void_p), -] +if PyImp == "PyPy": +# register builtin type in `new` +new.method = types.MethodType try: from cStringIO import StringIO @@ -225,6 +209,8 @@ class CloudPickler(pickle.Pickler): if themodule: self.modules.add(themodule) +if getattr(themodule, name, None) is obj: +return self.save_global(obj, name) if not self.savedDjangoEnv: #hack for django - if we detect the settings module, we transport it @@ -306,44 +292,28 @@ class CloudPickler(pickle.Pickler): # create a skeleton function object and memoize it save(_make_skel_func) -save((code, len(closure), base_globals)) +save((code, closure, base_globals)) write(pickle.REDUCE) self.memoize(func) # save the rest of the func data needed by _fill_function save(f_globals) save(defaults) -save(closure) save(dct) write(pickle.TUPLE) writ
git commit: [SPARK-3456] YarnAllocator on alpha can lose container requests to RM
Repository: spark Updated Branches: refs/heads/master af2583826 -> 25311c2c5 [SPARK-3456] YarnAllocator on alpha can lose container requests to RM Author: Thomas Graves Closes #2373 from tgravescs/SPARK-3456 and squashes the following commits: 77e9532 [Thomas Graves] [SPARK-3456] YarnAllocator on alpha can lose container requests to RM Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/25311c2c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/25311c2c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/25311c2c Branch: refs/heads/master Commit: 25311c2c545a60eb9dcf704814d4600987852155 Parents: af25838 Author: Thomas Graves Authored: Fri Sep 12 20:31:11 2014 -0500 Committer: Thomas Graves Committed: Fri Sep 12 20:31:11 2014 -0500 -- .../apache/spark/deploy/yarn/YarnAllocationHandler.scala | 11 ++- .../org/apache/spark/deploy/yarn/YarnAllocator.scala | 8 ++-- .../apache/spark/deploy/yarn/YarnAllocationHandler.scala | 3 ++- 3 files changed, 14 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/25311c2c/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocationHandler.scala -- diff --git a/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocationHandler.scala b/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocationHandler.scala index 5a1b42c..6c93d85 100644 --- a/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocationHandler.scala +++ b/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocationHandler.scala @@ -48,16 +48,17 @@ private[yarn] class YarnAllocationHandler( private val lastResponseId = new AtomicInteger() private val releaseList: CopyOnWriteArrayList[ContainerId] = new CopyOnWriteArrayList() - override protected def allocateContainers(count: Int): YarnAllocateResponse = { + override protected def allocateContainers(count: Int, pending: Int): YarnAllocateResponse = { var resourceRequests: List[ResourceRequest] = null -logDebug("numExecutors: " + count) +logDebug("asking for additional executors: " + count + " with already pending: " + pending) +val totalNumAsk = count + pending if (count <= 0) { resourceRequests = List() } else if (preferredHostToCount.isEmpty) { logDebug("host preferences is empty") resourceRequests = List(createResourceRequest( - AllocationType.ANY, null, count, YarnSparkHadoopUtil.RM_REQUEST_PRIORITY)) + AllocationType.ANY, null, totalNumAsk, YarnSparkHadoopUtil.RM_REQUEST_PRIORITY)) } else { // request for all hosts in preferred nodes and for numExecutors - // candidates.size, request by default allocation policy. @@ -80,7 +81,7 @@ private[yarn] class YarnAllocationHandler( val anyContainerRequests: ResourceRequest = createResourceRequest( AllocationType.ANY, resource = null, -count, +totalNumAsk, YarnSparkHadoopUtil.RM_REQUEST_PRIORITY) val containerRequests: ArrayBuffer[ResourceRequest] = new ArrayBuffer[ResourceRequest]( @@ -103,7 +104,7 @@ private[yarn] class YarnAllocationHandler( req.addAllReleases(releasedContainerList) if (count > 0) { - logInfo("Allocating %d executor containers with %d of memory each.".format(count, + logInfo("Allocating %d executor containers with %d of memory each.".format(totalNumAsk, executorMemory + memoryOverhead)) } else { logDebug("Empty allocation req .. 
release : " + releasedContainerList) http://git-wip-us.apache.org/repos/asf/spark/blob/25311c2c/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala -- diff --git a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 0b8744f..299e38a 100644 --- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -112,6 +112,9 @@ private[yarn] abstract class YarnAllocator( def allocateResources() = { val missing = maxExecutors - numPendingAllocate.get() - numExecutorsRunning.get() +// this is needed by alpha, do it here since we add numPending right after this +val executorsPending = numPendingAllocate.get() + if (missing > 0) { numPendingAllocate.addAndGet(missing) logInfo("Will Allocate %d executor containers, each with %d memory".format( @@ -121,7 +124,7 @@ private[yarn] abstract class YarnAllocator( logDebug("Empty allocation req
git commit: [SPARK-3217] Add Guava to classpath when SPARK_PREPEND_CLASSES is set.
Repository: spark Updated Branches: refs/heads/master 1d767967e -> af2583826 [SPARK-3217] Add Guava to classpath when SPARK_PREPEND_CLASSES is set. When that option is used, the compiled classes from the build directory are prepended to the classpath. Now that we avoid packaging Guava, that means we have classes referencing the original Guava location in the app's classpath, so errors happen. For that case, add Guava manually to the classpath. Note: if Spark is compiled with "-Phadoop-provided", it's tricky to make things work with SPARK_PREPEND_CLASSES, because you need to add the Hadoop classpath using SPARK_CLASSPATH and that means the older Hadoop Guava overrides the newer one Spark needs. So someone using SPARK_PREPEND_CLASSES needs to remember to not use that profile. Author: Marcelo Vanzin Closes #2141 from vanzin/SPARK-3217 and squashes the following commits: b967324 [Marcelo Vanzin] [SPARK-3217] Add Guava to classpath when SPARK_PREPEND_CLASSES is set. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/af258382 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/af258382 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/af258382 Branch: refs/heads/master Commit: af2583826c15d2a4e2732017ea20feeff0fb79f6 Parents: 1d76796 Author: Marcelo Vanzin Authored: Fri Sep 12 14:54:42 2014 -0700 Committer: Patrick Wendell Committed: Fri Sep 12 14:54:42 2014 -0700 -- bin/compute-classpath.sh | 1 + core/pom.xml | 27 +++ 2 files changed, 28 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/af258382/bin/compute-classpath.sh -- diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh index 15c6779..0f63e36 100755 --- a/bin/compute-classpath.sh +++ b/bin/compute-classpath.sh @@ -43,6 +43,7 @@ if [ -n "$SPARK_PREPEND_CLASSES" ]; then echo "NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark"\ "classes ahead of assembly." >&2 CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/classes" + CLASSPATH="$CLASSPATH:$FWDIR/core/target/jars/*" CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SCALA_VERSION/classes" CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SCALA_VERSION/classes" CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SCALA_VERSION/classes" http://git-wip-us.apache.org/repos/asf/spark/blob/af258382/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index b2b788a..2a81f6d 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -351,6 +351,33 @@ + + +org.apache.maven.plugins +maven-dependency-plugin + + +copy-dependencies +package + + copy-dependencies + + + ${project.build.directory} + false + false + true + true + guava + true + + + + - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
git commit: SPARK-3014. Log a more informative messages in a couple failure scenario...
Repository: spark Updated Branches: refs/heads/master 15a564598 -> 1d767967e SPARK-3014. Log a more informative messages in a couple failure scenario... ...s Author: Sandy Ryza Closes #1934 from sryza/sandy-spark-3014 and squashes the following commits: ae19cc1 [Sandy Ryza] SPARK-3014. Log a more informative messages in a couple failure scenarios Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1d767967 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1d767967 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1d767967 Branch: refs/heads/master Commit: 1d767967e925f1d727957c2d43383ef6ad2c5d5e Parents: 15a5645 Author: Sandy Ryza Authored: Fri Sep 12 16:48:28 2014 -0500 Committer: Thomas Graves Committed: Fri Sep 12 16:48:28 2014 -0500 -- core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala | 6 -- .../scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala | 6 ++ 2 files changed, 6 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1d767967/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 0fdb5ae..5ed3575 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -18,7 +18,7 @@ package org.apache.spark.deploy import java.io.{File, PrintStream} -import java.lang.reflect.InvocationTargetException +import java.lang.reflect.{Modifier, InvocationTargetException} import java.net.URL import scala.collection.mutable.{ArrayBuffer, HashMap, Map} @@ -323,7 +323,9 @@ object SparkSubmit { } val mainMethod = mainClass.getMethod("main", new Array[String](0).getClass) - +if (!Modifier.isStatic(mainMethod.getModifiers)) { + throw new IllegalStateException("The main method in the given main class must be static") +} try { mainMethod.invoke(null, childArgs.toArray) } catch { http://git-wip-us.apache.org/repos/asf/spark/blob/1d767967/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala -- diff --git a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 878b6db..735d772 100644 --- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -283,11 +283,9 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments, } val sparkContext = sparkContextRef.get() -assert(sparkContext != null || count >= numTries) if (sparkContext == null) { - logError( -"Unable to retrieve sparkContext inspite of waiting for %d, numTries = %d".format( - count * waitTime, numTries)) + logError(("SparkContext did not initialize after waiting for %d ms. Please check earlier" ++ " log output for errors. Failing the application.").format(numTries * waitTime)) } sparkContext } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
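The SparkSubmit half of this change guards against a common Scala pitfall: a `main` defined in a class compiles, but the generated method is not static, so a reflection-based launch used to fail obscurely. A short illustration:

```scala
// Compiles, but `main` is an instance method, so spark-submit now rejects it
// with "The main method in the given main class must be static".
class BrokenApp {
  def main(args: Array[String]): Unit = ()
}

// The correct form: a top-level object yields a static main on the generated class.
object WorkingApp {
  def main(args: Array[String]): Unit = ()
}
```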
git commit: [SPARK-3427] [GraphX] Avoid active vertex tracking in static PageRank
Repository: spark Updated Branches: refs/heads/master eae81b0bf -> 15a564598 [SPARK-3427] [GraphX] Avoid active vertex tracking in static PageRank GraphX's current implementation of static (fixed iteration count) PageRank uses the Pregel API. This unnecessarily tracks active vertices, even though in static PageRank all vertices are always active. Active vertex tracking incurs the following costs: 1. A shuffle per iteration to ship the active sets to the edge partitions. 2. A hash table creation per iteration at each partition to index the active sets for lookup. 3. A hash lookup per edge to check whether the source vertex is active. I reimplemented static PageRank using the lower-level GraphX API instead of the Pregel API. In benchmarks on a 16-node m2.4xlarge cluster, this provided a 23% speedup (from 514 s to 397 s, mean over 3 trials) for 10 iterations of PageRank on a synthetic graph with 10M vertices and 1.27B edges. Author: Ankur Dave Closes #2308 from ankurdave/SPARK-3427 and squashes the following commits: 449996a [Ankur Dave] Avoid unnecessary active vertex tracking in static PageRank Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/15a56459 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/15a56459 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/15a56459 Branch: refs/heads/master Commit: 15a564598fe63003652b1e24527c432080b5976c Parents: eae81b0 Author: Ankur Dave Authored: Fri Sep 12 14:08:38 2014 -0700 Committer: Reynold Xin Committed: Fri Sep 12 14:08:38 2014 -0700 -- .../org/apache/spark/graphx/lib/PageRank.scala | 45 +--- 1 file changed, 29 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/15a56459/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala -- diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala index 614555a..257e2f3 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala @@ -79,30 +79,43 @@ object PageRank extends Logging { def run[VD: ClassTag, ED: ClassTag]( graph: Graph[VD, ED], numIter: Int, resetProb: Double = 0.15): Graph[Double, Double] = { -// Initialize the pagerankGraph with each edge attribute having +// Initialize the PageRank graph with each edge attribute having // weight 1/outDegree and each vertex with attribute 1.0. 
-val pagerankGraph: Graph[Double, Double] = graph +var rankGraph: Graph[Double, Double] = graph // Associate the degree with each vertex .outerJoinVertices(graph.outDegrees) { (vid, vdata, deg) => deg.getOrElse(0) } // Set the weight on the edges based on the degree .mapTriplets( e => 1.0 / e.srcAttr ) // Set the vertex attributes to the initial pagerank values - .mapVertices( (id, attr) => 1.0 ) - .cache() + .mapVertices( (id, attr) => resetProb ) -// Define the three functions needed to implement PageRank in the GraphX -// version of Pregel -def vertexProgram(id: VertexId, attr: Double, msgSum: Double): Double = - resetProb + (1.0 - resetProb) * msgSum -def sendMessage(edge: EdgeTriplet[Double, Double]) = - Iterator((edge.dstId, edge.srcAttr * edge.attr)) -def messageCombiner(a: Double, b: Double): Double = a + b -// The initial message received by all vertices in PageRank -val initialMessage = 0.0 +var iteration = 0 +var prevRankGraph: Graph[Double, Double] = null +while (iteration < numIter) { + rankGraph.cache() -// Execute pregel for a fixed number of iterations. -Pregel(pagerankGraph, initialMessage, numIter, activeDirection = EdgeDirection.Out)( - vertexProgram, sendMessage, messageCombiner) + // Compute the outgoing rank contributions of each vertex, perform local preaggregation, and + // do the final aggregation at the receiving vertices. Requires a shuffle for aggregation. + val rankUpdates = rankGraph.mapReduceTriplets[Double]( +e => Iterator((e.dstId, e.srcAttr * e.attr)), _ + _) + + // Apply the final rank updates to get the new ranks, using join to preserve ranks of vertices + // that didn't receive a message. Requires a shuffle for broadcasting updated ranks to the + // edge partitions. + prevRankGraph = rankGraph + rankGraph = rankGraph.joinVertices(rankUpdates) { +(id, oldRank, msgSum) => resetProb + (1.0 - resetProb) * msgSum + }.cache() + + rankGraph.edges.foreachPartition(x => {}) // also materializes rankGraph.vertices + logInfo(s"PageRank finish
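The public entry point is unchanged by this rewrite; only its internals drop the Pregel active-set machinery. A sketch of invoking it, assuming an existing SparkContext `sc` and a hypothetical edge-list path:

```scala
import org.apache.spark.graphx.GraphLoader
import org.apache.spark.graphx.lib.PageRank

val graph = GraphLoader.edgeListFile(sc, "hdfs:///path/to/edges")  // hypothetical input
val ranks = PageRank.run(graph, numIter = 10).vertices             // same signature as before
```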
git commit: MAINTENANCE: Automated closing of pull requests.
Repository: spark Updated Branches: refs/heads/master 8194fc662 -> eae81b0bf MAINTENANCE: Automated closing of pull requests. This commit exists to close the following pull requests on Github: Closes #930 (close requested by 'andrewor14') Closes #867 (close requested by 'marmbrus') Closes #1829 (close requested by 'marmbrus') Closes #1131 (close requested by 'JoshRosen') Closes #1571 (close requested by 'andrewor14') Closes #2359 (close requested by 'andrewor14') Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eae81b0b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eae81b0b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eae81b0b Branch: refs/heads/master Commit: eae81b0bfdf3159be90f507a03853800aec1874a Parents: 8194fc6 Author: Patrick Wendell Authored: Fri Sep 12 13:43:29 2014 -0700 Committer: Patrick Wendell Committed: Fri Sep 12 13:43:29 2014 -0700 -- -- - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r1624639 - in /spark: downloads.md js/downloads.js site/downloads.html site/js/downloads.js
Author: pwendell Date: Fri Sep 12 20:36:11 2014 New Revision: 1624639 URL: http://svn.apache.org/r1624639 Log: Adding pretty titles for download links. Modified: spark/downloads.md spark/js/downloads.js spark/site/downloads.html spark/site/js/downloads.js Modified: spark/downloads.md URL: http://svn.apache.org/viewvc/spark/downloads.md?rev=1624639&r1=1624638&r2=1624639&view=diff == --- spark/downloads.md (original) +++ spark/downloads.md Fri Sep 12 20:36:11 2014 @@ -24,10 +24,7 @@ The latest release of Spark is Spark 1.1 2. Chose a package type: - - Note: Spark can be - built from source for many other Hadoop versions. - + 3. Chose a download type: Modified: spark/js/downloads.js URL: http://svn.apache.org/viewvc/spark/js/downloads.js?rev=1624639&r1=1624638&r2=1624639&view=diff == --- spark/js/downloads.js (original) +++ spark/js/downloads.js Fri Sep 12 20:36:11 2014 @@ -7,15 +7,24 @@ function addRelease(version, releaseDate releases[version] = {released: releaseDate, packages: packages, downloadable: downloadable}; } -var sources = ["sources"]; +var sources = {pretty: "Source Code [can build several Hadoop versions]", tag: "sources"}; +var hadoop1 = {pretty: "Pre-built for Hadoop 1.X", tag: "hadoop1"}; +var cdh4 = {pretty: "Pre-built for CDH 4", tag: "cdh4"}; +var hadoop2 = {pretty: "Pre-built for Hadoop 2.2", tag: "hadoop2"}; +var hadoop2p3 = {pretty: "Pre-built for Hadoop 2.3", tag: "hadoop2.3"}; +var hadoop2p4 = {pretty: "Pre-built for Hadoop 2.4", tag: "hadoop2.4"}; +var mapr3 = {pretty: "Pre-built for MapR 3.X", tag: "mapr3"}; +var mapr4 = {pretty: "Pre-built for MapR 4.X", tag: "mapr4"}; + +var sourcePackage = [sources]; // 0.7+ -var packagesV1 = sources.concat(["hadoop1", "cdh4"]); +var packagesV1 = sourcePackage.concat([hadoop1, cdh4]); // 0.8.1+ -var packagesV2 = packagesV1.concat(["hadoop2"]); +var packagesV2 = packagesV1.concat([hadoop2]); // 1.0.1+ -var packagesV3 = packagesV2.concat(["mapr3", "mapr4"]); +var packagesV3 = packagesV2.concat([mapr3, mapr4]); // 1.1.0+ -var packagesV4 = packagesV1.concat(["hadoop2.3", "hadoop2.4", "mapr3", "mapr4"]); +var packagesV4 = packagesV1.concat([hadoop2p3, hadoop2p4, mapr3, mapr4]); addRelease("1.1.0", new Date("9/11/2014"), packagesV4, true); addRelease("1.0.2", new Date("8/5/2014"), packagesV3, true); @@ -87,7 +96,10 @@ function onVersionSelect() { var version = getSelectedValue(versionSelect); var packages = releases[version]["packages"]; for (var idx in packages) { -append(packageSelect, "" + packages[idx] + "") +var option = " $pretty " + .replace(/\$tag/, packages[idx].tag) + .replace(/\$pretty/, packages[idx].pretty); +append(packageSelect, option); } var href = "http://www.apache.org/dist/spark/spark-"; + version + "/"; Modified: spark/site/downloads.html URL: http://svn.apache.org/viewvc/spark/site/downloads.html?rev=1624639&r1=1624638&r2=1624639&view=diff == --- spark/site/downloads.html (original) +++ spark/site/downloads.html Fri Sep 12 20:36:11 2014 @@ -182,10 +182,7 @@ $(document).ready(function() { Chose a package type: - - Note: Spark can be - built from source for many other Hadoop versions. 
- + Chose a download type: Modified: spark/site/js/downloads.js URL: http://svn.apache.org/viewvc/spark/site/js/downloads.js?rev=1624639&r1=1624638&r2=1624639&view=diff == --- spark/site/js/downloads.js (original) +++ spark/site/js/downloads.js Fri Sep 12 20:36:11 2014 @@ -7,15 +7,24 @@ function addRelease(version, releaseDate releases[version] = {released: releaseDate, packages: packages, downloadable: downloadable}; } -var sources = ["sources"]; +var sources = {pretty: "Source Code [can build several Hadoop versions]", tag: "sources"}; +var hadoop1 = {pretty: "Pre-built for Hadoop 1.X", tag: "hadoop1"}; +var cdh4 = {pretty: "Pre-built for CDH 4", tag: "cdh4"}; +var hadoop2 = {pretty: "Pre-built for Hadoop 2.2", tag: "hadoop2"}; +var hadoop2p3 = {pretty: "Pre-built for Hadoop 2.3", tag: "hadoop2.3"}; +var hadoop2p4 = {pretty: "Pre-built for Hadoop 2.4", tag: "hadoop2.4"}; +var mapr3 = {pretty: "Pre-built for MapR 3.X", tag: "mapr3"}; +var mapr4 = {pretty: "Pre-built for MapR 4.X", tag: "mapr4"}; + +var sourcePackage = [sources]; // 0.7+ -var packagesV1 = sources.concat(["hadoop1", "cdh4"]); +var packagesV1 = sourcePackage.concat([hadoop1, cdh4]); // 0.8.1+ -var packagesV2 = packagesV1.concat(["hadoop2"]); +var packagesV2 = packagesV1.concat([hadoop2]); // 1.0.1+ -var packagesV3 = packagesV2.concat(["mapr3", "mapr4"]); +var packagesV3 = packagesV2.concat([mapr3, mapr4]); // 1.1.0+ -var packagesV4 = pa
git commit: [SPARK-3481] [SQL] Eliminate the error log in local Hive comparison test
Repository: spark Updated Branches: refs/heads/branch-1.0 ae6f5545a -> 4f991c9ff [SPARK-3481] [SQL] Eliminate the error log in local Hive comparison test Logically, we should remove the Hive Table/Database first and then reset the Hive configuration, repoint to the new data warehouse directory etc. Otherwise it raised exceptions like "Database doesn't not exists: default" in the local testing. Author: Cheng Hao Closes #2352 from chenghao-intel/test_hive and squashes the following commits: 74fd76b [Cheng Hao] eliminate the error log (cherry picked from commit 8194fc662c08eb445444c207264e22361def54ea) Signed-off-by: Michael Armbrust Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4f991c9f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4f991c9f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4f991c9f Branch: refs/heads/branch-1.0 Commit: 4f991c9fff169557b8d98923064d5ce547fa5569 Parents: ae6f554 Author: Cheng Hao Authored: Fri Sep 12 11:29:30 2014 -0700 Committer: Michael Armbrust Committed: Fri Sep 12 11:30:11 2014 -0700 -- .../scala/org/apache/spark/sql/hive/TestHive.scala | 17 - 1 file changed, 8 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4f991c9f/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala index 9386008..5ddaa09 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala @@ -292,15 +292,6 @@ class TestHiveContext(sc: SparkContext) extends LocalHiveContext(sc) { logger.asInstanceOf[org.apache.log4j.Logger].setLevel(org.apache.log4j.Level.WARN) } - // It is important that we RESET first as broken hooks that might have been set could break - // other sql exec here. - runSqlHive("RESET") - // For some reason, RESET does not reset the following variables... - runSqlHive("set datanucleus.cache.collections=true") - runSqlHive("set datanucleus.cache.collections.lazy=true") - // Lots of tests fail if we do not change the partition whitelist from the default. - runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*") - loadedTables.clear() catalog.client.getAllTables("default").foreach { t => logger.debug(s"Deleting table $t") @@ -326,6 +317,14 @@ class TestHiveContext(sc: SparkContext) extends LocalHiveContext(sc) { FunctionRegistry.unregisterTemporaryUDF(udfName) } + // It is important that we RESET first as broken hooks that might have been set could break + // other sql exec here. + runSqlHive("RESET") + // For some reason, RESET does not reset the following variables... + runSqlHive("set datanucleus.cache.collections=true") + runSqlHive("set datanucleus.cache.collections.lazy=true") + // Lots of tests fail if we do not change the partition whitelist from the default. + runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*") configure() runSqlHive("USE default") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
git commit: [SPARK-3481] [SQL] Eliminate the error log in local Hive comparison test
Repository: spark Updated Branches: refs/heads/branch-1.1 f17b7957a -> 6cbf83c05 [SPARK-3481] [SQL] Eliminate the error log in local Hive comparison test Logically, we should remove the Hive Table/Database first and then reset the Hive configuration, repoint to the new data warehouse directory etc. Otherwise it raised exceptions like "Database doesn't not exists: default" in the local testing. Author: Cheng Hao Closes #2352 from chenghao-intel/test_hive and squashes the following commits: 74fd76b [Cheng Hao] eliminate the error log (cherry picked from commit 8194fc662c08eb445444c207264e22361def54ea) Signed-off-by: Michael Armbrust Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6cbf83c0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6cbf83c0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6cbf83c0 Branch: refs/heads/branch-1.1 Commit: 6cbf83c05c7a073d4df81b59a1663fea38ce65f6 Parents: f17b795 Author: Cheng Hao Authored: Fri Sep 12 11:29:30 2014 -0700 Committer: Michael Armbrust Committed: Fri Sep 12 11:29:44 2014 -0700 -- .../scala/org/apache/spark/sql/hive/TestHive.scala | 17 - 1 file changed, 8 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6cbf83c0/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala index a013f3f..8bb2216 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala @@ -309,15 +309,6 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { log.asInstanceOf[org.apache.log4j.Logger].setLevel(org.apache.log4j.Level.WARN) } - // It is important that we RESET first as broken hooks that might have been set could break - // other sql exec here. - runSqlHive("RESET") - // For some reason, RESET does not reset the following variables... - runSqlHive("set datanucleus.cache.collections=true") - runSqlHive("set datanucleus.cache.collections.lazy=true") - // Lots of tests fail if we do not change the partition whitelist from the default. - runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*") - loadedTables.clear() catalog.client.getAllTables("default").foreach { t => logDebug(s"Deleting table $t") @@ -343,6 +334,14 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { FunctionRegistry.unregisterTemporaryUDF(udfName) } + // It is important that we RESET first as broken hooks that might have been set could break + // other sql exec here. + runSqlHive("RESET") + // For some reason, RESET does not reset the following variables... + runSqlHive("set datanucleus.cache.collections=true") + runSqlHive("set datanucleus.cache.collections.lazy=true") + // Lots of tests fail if we do not change the partition whitelist from the default. + runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*") configure() runSqlHive("USE default") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
git commit: [SPARK-3481] [SQL] Eliminate the error log in local Hive comparison test
Repository: spark Updated Branches: refs/heads/master 533377621 -> 8194fc662 [SPARK-3481] [SQL] Eliminate the error log in local Hive comparison test Logically, we should remove the Hive Table/Database first and then reset the Hive configuration, repoint to the new data warehouse directory etc. Otherwise it raised exceptions like "Database doesn't not exists: default" in the local testing. Author: Cheng Hao Closes #2352 from chenghao-intel/test_hive and squashes the following commits: 74fd76b [Cheng Hao] eliminate the error log Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8194fc66 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8194fc66 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8194fc66 Branch: refs/heads/master Commit: 8194fc662c08eb445444c207264e22361def54ea Parents: 5333776 Author: Cheng Hao Authored: Fri Sep 12 11:29:30 2014 -0700 Committer: Michael Armbrust Committed: Fri Sep 12 11:29:30 2014 -0700 -- .../scala/org/apache/spark/sql/hive/TestHive.scala | 17 - 1 file changed, 8 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8194fc66/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala index 6974f3e..a3bfd3a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala @@ -376,15 +376,6 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { log.asInstanceOf[org.apache.log4j.Logger].setLevel(org.apache.log4j.Level.WARN) } - // It is important that we RESET first as broken hooks that might have been set could break - // other sql exec here. - runSqlHive("RESET") - // For some reason, RESET does not reset the following variables... - runSqlHive("set datanucleus.cache.collections=true") - runSqlHive("set datanucleus.cache.collections.lazy=true") - // Lots of tests fail if we do not change the partition whitelist from the default. - runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*") - loadedTables.clear() catalog.client.getAllTables("default").foreach { t => logDebug(s"Deleting table $t") @@ -410,6 +401,14 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { FunctionRegistry.unregisterTemporaryUDF(udfName) } + // It is important that we RESET first as broken hooks that might have been set could break + // other sql exec here. + runSqlHive("RESET") + // For some reason, RESET does not reset the following variables... + runSqlHive("set datanucleus.cache.collections=true") + runSqlHive("set datanucleus.cache.collections.lazy=true") + // Lots of tests fail if we do not change the partition whitelist from the default. + runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*") configure() runSqlHive("USE default") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
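The SPARK-3481 fix above is purely an ordering change: RESET re-points the Hive configuration (warehouse directory, metastore settings), so running it before the test tables are dropped leaves the cleanup looking for a "default" database that no longer resolves, which is what produced the spurious error log in the comparison tests. A minimal Scala sketch of the corrected ordering follows; runSqlHive, allTestTables and configure are stubbed stand-ins for the real TestHiveContext members, not the actual implementation.

// Illustrative sketch only; the members below are stubs, not TestHiveContext.
object ResetOrderingSketch {
  def runSqlHive(sql: String): Unit = println(s"hive> $sql")   // stub
  def allTestTables: Seq[String] = Seq("src", "srcpart")       // stub: tables loaded for tests
  def configure(): Unit = ()                                   // stub: re-points warehouse/metastore dirs

  def reset(): Unit = {
    // 1. Drop test tables (and temporary UDFs, omitted here) while the old
    //    metastore configuration, and thus the old "default" database, is
    //    still in effect.
    allTestTables.foreach(t => runSqlHive(s"DROP TABLE IF EXISTS $t"))

    // 2. Only afterwards RESET the Hive configuration and re-apply the
    //    settings that RESET does not restore, as in the patch above.
    runSqlHive("RESET")
    runSqlHive("set datanucleus.cache.collections=true")
    runSqlHive("set datanucleus.cache.collections.lazy=true")
    runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*")

    configure()
    runSqlHive("USE default")
  }

  def main(args: Array[String]): Unit = reset()
}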
git commit: Revert "[Spark-3490] Disable SparkUI for tests"
Repository: spark Updated Branches: refs/heads/branch-1.1 e69deb818 -> f17b7957a Revert "[Spark-3490] Disable SparkUI for tests" This reverts commit 2ffc7980c6818eec05e32141c52e335bc71daed9. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f17b7957 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f17b7957 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f17b7957 Branch: refs/heads/branch-1.1 Commit: f17b7957a4283952021d9e4106c5bd9994148128 Parents: e69deb8 Author: Andrew Or Authored: Fri Sep 12 10:40:03 2014 -0700 Committer: Andrew Or Committed: Fri Sep 12 10:40:03 2014 -0700 -- .../scala/org/apache/spark/SparkContext.scala | 12 +- .../cluster/CoarseGrainedSchedulerBackend.scala | 2 +- .../cluster/SimrSchedulerBackend.scala | 6 +- .../cluster/SparkDeploySchedulerBackend.scala | 4 +- .../scala/org/apache/spark/ui/UISuite.scala | 44 +- pom.xml | 1 - project/SparkBuild.scala| 2 +- .../spark/streaming/StreamingContext.scala | 11 +- .../spark/streaming/StreamingSource.scala | 2 +- .../spark/streaming/ui/StreamingTab.scala | 25 +- .../org/apache/spark/streaming/UISuite.scala| 16 +- .../spark/deploy/yarn/ApplicationMaster.scala | 443 --- .../cluster/YarnClientSchedulerBackend.scala| 6 +- 13 files changed, 37 insertions(+), 537 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f17b7957/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index abb6a83..0470fbe 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -220,14 +220,8 @@ class SparkContext(config: SparkConf) extends Logging { new MetadataCleaner(MetadataCleanerType.SPARK_CONTEXT, this.cleanup, conf) // Initialize the Spark UI, registering all associated listeners - private[spark] val ui: Option[SparkUI] = -if (conf.getBoolean("spark.ui.enabled", true)) { - Some(new SparkUI(this)) -} else { - // For tests, do not enable the UI - None -} - ui.foreach(_.bind()) + private[spark] val ui = new SparkUI(this) + ui.bind() /** A default Hadoop Configuration for the Hadoop code (e.g. file systems) that we reuse. */ val hadoopConfiguration: Configuration = { @@ -1014,7 +1008,7 @@ class SparkContext(config: SparkConf) extends Logging { /** Shut down the SparkContext. */ def stop() { postApplicationEnd() -ui.foreach(_.stop()) +ui.stop() // Do this only if not stopped already - best case effort. // prevent NPE if stopped more than once. val dagSchedulerCopy = dagScheduler http://git-wip-us.apache.org/repos/asf/spark/blob/f17b7957/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 04046e2..2a3711a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -292,7 +292,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, actorSystem: A logInfo(s"Add WebUI Filter. 
$filterName, $filterParams, $proxyBase") conf.set("spark.ui.filters", filterName) conf.set(s"spark.$filterName.params", filterParams) - scheduler.sc.ui.foreach { ui => JettyUtils.addFilters(ui.getHandlers, conf) } + JettyUtils.addFilters(scheduler.sc.ui.getHandlers, conf) } } } http://git-wip-us.apache.org/repos/asf/spark/blob/f17b7957/core/src/main/scala/org/apache/spark/scheduler/cluster/SimrSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/SimrSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/SimrSchedulerBackend.scala index b781842..4f7133c 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/SimrSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/SimrSchedulerBackend.scala @@ -17,6 +17,7 @@ package org.apache.spark.scheduler.cluster +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{Path, FileSystem} import org.ap
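Most of the churn in the (truncated) revert diff above comes from SPARK-3490 having made SparkContext.ui an Option[SparkUI], which forced every call site to unwrap it; the revert restores the plain field. A toy contrast of the two call-site shapes, using a placeholder SparkUIStub rather than the real org.apache.spark.ui.SparkUI:

// Placeholder UI type for illustration; not the real org.apache.spark.ui.SparkUI.
class SparkUIStub {
  def bind(): Unit = ()
  def stop(): Unit = ()
}

// Shape introduced by SPARK-3490 (now reverted): the UI may be absent in
// tests, so every caller has to go through the Option.
class ContextWithOptionalUi(uiEnabled: Boolean) {
  val ui: Option[SparkUIStub] = if (uiEnabled) Some(new SparkUIStub) else None
  ui.foreach(_.bind())
  def stop(): Unit = ui.foreach(_.stop())
}

// Shape after the revert: the UI is always created and used directly.
class ContextWithDirectUi {
  val ui = new SparkUIStub
  ui.bind()
  def stop(): Unit = ui.stop()
}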
git commit: [PySpark] Add blank line so that Python RDD.top() docstring renders correctly
Repository: spark Updated Branches: refs/heads/master f116f76bf -> 533377621 [PySpark] Add blank line so that Python RDD.top() docstring renders correctly Author: RJ Nowling Closes #2370 from rnowling/python_rdd_docstrings and squashes the following commits: 5230574 [RJ Nowling] Add blank line so that Python RDD.top() docstring renders correctly Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/53337762 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/53337762 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/53337762 Branch: refs/heads/master Commit: 533377621f1e178e18fa0b79d653a11b66e4e250 Parents: f116f76 Author: RJ Nowling Authored: Fri Sep 12 09:46:21 2014 -0700 Committer: Josh Rosen Committed: Fri Sep 12 09:46:21 2014 -0700 -- python/pyspark/rdd.py | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/53337762/python/pyspark/rdd.py -- diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 5667154..6ad5ab2 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -1060,6 +1060,7 @@ class RDD(object): Get the top N elements from a RDD. Note: It returns the list sorted in descending order. + >>> sc.parallelize([10, 4, 2, 12, 3]).top(1) [12] >>> sc.parallelize([2, 3, 4, 5, 6], 2).top(2) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r1624579 - in /spark: releases/_posts/2014-09-11-spark-release-1-1-0.md site/releases/spark-release-1-1-0.html
Author: pwendell Date: Fri Sep 12 16:04:01 2014 New Revision: 1624579 URL: http://svn.apache.org/r1624579 Log: Adding missing release credit Modified: spark/releases/_posts/2014-09-11-spark-release-1-1-0.md spark/site/releases/spark-release-1-1-0.html Modified: spark/releases/_posts/2014-09-11-spark-release-1-1-0.md URL: http://svn.apache.org/viewvc/spark/releases/_posts/2014-09-11-spark-release-1-1-0.md?rev=1624579&r1=1624578&r2=1624579&view=diff == --- spark/releases/_posts/2014-09-11-spark-release-1-1-0.md (original) +++ spark/releases/_posts/2014-09-11-spark-release-1-1-0.md Fri Sep 12 16:04:01 2014 @@ -63,6 +63,7 @@ Spark 1.1.0 is backwards compatible with * Anant -- Python and doc fixes * Anatoli Fomenko -- MLlib doc fix * Andrew Ash -- doc improvements and bug fixes + * Andrew Xia -- external sorting feature * Andrew Or -- external spilling feature, bug fixes, and optimizations * Andrew Schumacher -- Parquet support in Spark SQL * Ankit Bhardwaj -- bug fix Modified: spark/site/releases/spark-release-1-1-0.html URL: http://svn.apache.org/viewvc/spark/site/releases/spark-release-1-1-0.html?rev=1624579&r1=1624578&r2=1624579&view=diff == --- spark/site/releases/spark-release-1-1-0.html (original) +++ spark/site/releases/spark-release-1-1-0.html Fri Sep 12 16:04:01 2014 @@ -224,6 +224,7 @@ Anant – Python and doc fixes Anatoli Fomenko – MLlib doc fix Andrew Ash – doc improvements and bug fixes + Andrew Xia – external sorting feature Andrew Or – external spilling feature, bug fixes, and optimizations Andrew Schumacher – Parquet support in Spark SQL Ankit Bhardwaj – bug fix - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r1624578 - in /spark: releases/_posts/2014-09-11-spark-release-1-1-0.md site/releases/spark-release-1-1-0.html
Author: pwendell Date: Fri Sep 12 16:03:14 2014 New Revision: 1624578 URL: http://svn.apache.org/r1624578 Log: Adding missing release credit Modified: spark/releases/_posts/2014-09-11-spark-release-1-1-0.md spark/site/releases/spark-release-1-1-0.html Modified: spark/releases/_posts/2014-09-11-spark-release-1-1-0.md URL: http://svn.apache.org/viewvc/spark/releases/_posts/2014-09-11-spark-release-1-1-0.md?rev=1624578&r1=1624577&r2=1624578&view=diff == --- spark/releases/_posts/2014-09-11-spark-release-1-1-0.md (original) +++ spark/releases/_posts/2014-09-11-spark-release-1-1-0.md Fri Sep 12 16:03:14 2014 @@ -141,6 +141,7 @@ Spark 1.1.0 is backwards compatible with * Lu Luorta -- graphX fix * Luogan Kun -- SQL test fix * Ly Lai -- bug fix in Spark SQL + * Madhu Siddalingaiah -- external sorting feature * Manish Amde -- Multiclass support for decision trees * Manuel Laflamme -- streaming fix * Marcelo Vanzin -- fixes and improvements in YARN code Modified: spark/site/releases/spark-release-1-1-0.html URL: http://svn.apache.org/viewvc/spark/site/releases/spark-release-1-1-0.html?rev=1624578&r1=1624577&r2=1624578&view=diff == --- spark/site/releases/spark-release-1-1-0.html (original) +++ spark/site/releases/spark-release-1-1-0.html Fri Sep 12 16:03:14 2014 @@ -302,6 +302,7 @@ Lu Luorta – graphX fix Luogan Kun – SQL test fix Ly Lai – bug fix in Spark SQL + Madhu Siddalingaiah – external sorting feature Manish Amde – Multiclass support for decision trees Manuel Laflamme – streaming fix Marcelo Vanzin – fixes and improvements in YARN code - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
git commit: [SPARK-2558][DOCS] Add --queue example to YARN doc
Repository: spark Updated Branches: refs/heads/master b8634df1f -> f116f76bf [SPARK-2558][DOCS] Add --queue example to YARN doc Put original YARN queue spark-submit arg description in running-on-yarn html table and example command line Author: Mark G. Whitney Closes #2218 from kramimus/2258-yarndoc and squashes the following commits: 4b5d808 [Mark G. Whitney] remove yarn queue config f8cda0d [Mark G. Whitney] [SPARK-2558][DOCS] Add spark.yarn.queue description to YARN doc Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f116f76b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f116f76b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f116f76b Branch: refs/heads/master Commit: f116f76bf1f1610905ca094c8edc53151a78d2f4 Parents: b8634df Author: Mark G. Whitney Authored: Fri Sep 12 08:08:58 2014 -0500 Committer: Thomas Graves Committed: Fri Sep 12 08:08:58 2014 -0500 -- docs/running-on-yarn.md | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f116f76b/docs/running-on-yarn.md -- diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index d8b22f3..212248b 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -155,6 +155,7 @@ For example: --driver-memory 4g \ --executor-memory 2g \ --executor-cores 1 \ +--queue thequeue \ lib/spark-examples*.jar \ 10 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
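The --queue flag added to the example above corresponds to the spark.yarn.queue property mentioned in the commit message, so the same queue can also be selected through Spark configuration. A small, hedged sketch (the app name and queue value are illustrative):

// Hedged illustration of choosing the YARN queue via configuration instead of
// the --queue flag; spark.yarn.queue is the property the commit message refers to.
import org.apache.spark.SparkConf

object QueueConfSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("queue-example")                // illustrative name
      .set("spark.yarn.queue", "thequeue")        // same queue as --queue thequeue above
    println(conf.get("spark.yarn.queue"))
  }
}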
git commit: [SPARK-3160] [SPARK-3494] [mllib] DecisionTree: eliminate pre-allocated nodes, parentImpurities arrays. Memory calc bug fix.
Repository: spark Updated Branches: refs/heads/master 42904b8d0 -> b8634df1f [SPARK-3160] [SPARK-3494] [mllib] DecisionTree: eliminate pre-allocated nodes, parentImpurities arrays. Memory calc bug fix. This PR includes some code simplifications and re-organization which will be helpful for implementing random forests. The main changes are that the nodes and parentImpurities arrays are no longer pre-allocated in the main train() method. Also added 2 bug fixes: * maxMemoryUsage calculation * over-allocation of space for bins in DTStatsAggregator for unordered features. Relation to RFs: * Since RFs will be deeper and will therefore be more likely sparse (not full trees), it could be a cost savings to avoid pre-allocating a full tree. * The associated re-organization also reduces bookkeeping, which will make RFs easier to implement. * The return code doneTraining may be generalized to include cases such as nodes ready for local training. Details: No longer pre-allocate parentImpurities array in main train() method. * parentImpurities values are now stored in individual nodes (in Node.stats.impurity). * These were not really needed. They were used in calculateGainForSplit(), but they can be calculated anyways using parentNodeAgg. No longer using Node.build since tree structure is constructed on-the-fly. * Did not eliminate since it is public (Developer) API. Marked as deprecated. Eliminated pre-allocated nodes array in main train() method. * Nodes are constructed and added to the tree structure as needed during training. * Moved tree construction from main train() method into findBestSplitsPerGroup() since there is no need to keep the (split, gain) array for an entire level of nodes. Only one element of that array is needed at a time, so we do not the array. findBestSplits() now returns 2 items: * rootNode (newly created root node on first iteration, same root node on later iterations) * doneTraining (indicating if all nodes at that level were leafs) Updated DecisionTreeSuite. Notes: * Improved test "Second level node building with vs. without groups" ** generateOrderedLabeledPoints() modified so that it really does require 2 levels of internal nodes. * Related update: Added Node.deepCopy (private[tree]), used for test suite CC: mengxr Author: Joseph K. Bradley Closes #2341 from jkbradley/dt-spark-3160 and squashes the following commits: 07dd1ee [Joseph K. Bradley] Fixed overflow bug with computing maxMemoryUsage in DecisionTree. Also fixed bug with over-allocating space in DTStatsAggregator for unordered features. debe072 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into dt-spark-3160 5c4ac33 [Joseph K. Bradley] Added check in Strategy to make sure minInstancesPerNode >= 1 0dd4d87 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into dt-spark-3160 306120f [Joseph K. Bradley] Fixed typo in DecisionTreeModel.scala doc eaa1dcf [Joseph K. Bradley] Added topNode doc in DecisionTree and scalastyle fix d4d7864 [Joseph K. Bradley] Marked Node.build as deprecated d4dbb99 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into dt-spark-3160 1a8f0ad [Joseph K. Bradley] Eliminated pre-allocated nodes array in main train() method. * Nodes are constructed and added to the tree structure as needed during training. 2ab763b [Joseph K. 
Bradley] Simplifications to DecisionTree code: Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b8634df1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b8634df1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b8634df1 Branch: refs/heads/master Commit: b8634df1f1eb6ce909bec779522c9c9912c7d06a Parents: 42904b8 Author: Joseph K. Bradley Authored: Fri Sep 12 01:37:59 2014 -0700 Committer: Xiangrui Meng Committed: Fri Sep 12 01:37:59 2014 -0700 -- .../apache/spark/mllib/tree/DecisionTree.scala | 191 ++--- .../mllib/tree/configuration/Strategy.scala | 3 + .../mllib/tree/impl/DTStatsAggregator.scala | 11 +- .../mllib/tree/impl/DecisionTreeMetadata.scala | 3 +- .../mllib/tree/model/DecisionTreeModel.scala| 2 +- .../apache/spark/mllib/tree/model/Node.scala| 37 +++ .../spark/mllib/tree/DecisionTreeSuite.scala| 277 ++- 7 files changed, 268 insertions(+), 256 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b8634df1/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala index 9859656..56bb881 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.sca
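The diff above is cut off, but the structural change the commit message describes, dropping the pre-allocated nodes/parentImpurities arrays and having findBestSplits return the root node together with a doneTraining flag, can be sketched independently of the real MLlib code. Everything below (the Node shape, the toy stopping rule, the level loop) is a simplified illustration, not the actual implementation:

// Hypothetical, simplified stand-ins; not the MLlib Node or DecisionTree API.
case class Node(id: Int, var isLeaf: Boolean = false,
                var leftNode: Option[Node] = None,
                var rightNode: Option[Node] = None)

object LevelWiseTrainingSketch {
  // Toy split search: returns the (possibly newly created) root node and a
  // flag saying whether every node at this level ended up a leaf.
  def findBestSplits(level: Int, root: Option[Node]): (Node, Boolean) = {
    val rootNode = root.getOrElse(Node(id = 1))   // created only on the first level
    val doneTraining = level >= 2                 // toy stopping rule
    (rootNode, doneTraining)
  }

  def train(maxDepth: Int): Node = {
    var root: Option[Node] = None                 // no pre-allocated array of nodes
    var level = 0
    var done = false
    while (level < maxDepth && !done) {
      // The root is created on the first iteration and threaded through the
      // rest; the real code grows children off it as splits are found.
      val (rootNode, doneTraining) = findBestSplits(level, root)
      root = Some(rootNode)
      done = doneTraining
      level += 1
    }
    root.getOrElse(Node(id = 1, isLeaf = true))
  }

  def main(args: Array[String]): Unit = println(train(maxDepth = 5))
}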