git commit: Spark 1384 - Fix spark-shell on yarn access to secure hdfs - branch-0.9 only

2014-06-09 Thread meng
Repository: spark
Updated Branches:
  refs/heads/branch-0.9 1d3aab961 -> cc95d978a


Spark 1384 - Fix spark-shell on yarn access to secure hdfs - branch-0.9 only

Author: Thomas Graves 

Closes #287 from tgravescs/SPARK-1384 and squashes the following commits:

ae9162a [Thomas Graves] SPARK-1384 - fix spark-shell on yarn access to secure HDFS


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cc95d978
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cc95d978
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cc95d978

Branch: refs/heads/branch-0.9
Commit: cc95d978a1b2dab6ed3dd8f73ccf8c299b16fdc0
Parents: 1d3aab9
Author: Thomas Graves 
Authored: Mon Jun 9 23:07:25 2014 -0700
Committer: Xiangrui Meng 
Committed: Mon Jun 9 23:07:25 2014 -0700

--
 .../org/apache/spark/repl/SparkILoop.scala  | 21 
 1 file changed, 13 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/cc95d978/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala
--
diff --git a/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala
index f262faa..5274932 100644
--- a/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala
+++ b/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala
@@ -880,6 +880,7 @@ class SparkILoop(in0: Option[BufferedReader], protected val out: JPrintWriter,
 
   def process(settings: Settings): Boolean = savingContextLoader {
 this.settings = settings
+if (getMaster() == "yarn-client") System.setProperty("SPARK_YARN_MODE", "true")
 createInterpreter()
 
 // sets in to some kind of reader depending on environmental cues
@@ -937,16 +938,9 @@ class SparkILoop(in0: Option[BufferedReader], protected val out: JPrintWriter,
 
   def createSparkContext(): SparkContext = {
 val execUri = System.getenv("SPARK_EXECUTOR_URI")
-val master = this.master match {
-  case Some(m) => m
-  case None => {
-val prop = System.getenv("MASTER")
-if (prop != null) prop else "local"
-  }
-}
 val jars = SparkILoop.getAddedJars.map(new java.io.File(_).getAbsolutePath)
 val conf = new SparkConf()
-  .setMaster(master)
+  .setMaster(getMaster())
   .setAppName("Spark shell")
   .setJars(jars)
   .set("spark.repl.class.uri", intp.classServer.uri)
@@ -961,6 +955,17 @@ class SparkILoop(in0: Option[BufferedReader], protected val out: JPrintWriter,
 sparkContext
   }
 
+  private def getMaster(): String = {
+ val master = this.master match {
+   case Some(m) => m
+   case None => {
+ val prop = System.getenv("MASTER")
+ if (prop != null) prop else "local"
+   }
+ }
+ master
+   }
+
   /** process command-line arguments and do as they request */
   def process(args: Array[String]): Boolean = {
 val command = new SparkCommandLine(args.toList, msg => echo(msg))



git commit: [SPARK-1870] Made deployment with --jars work in yarn-standalone mode.

2014-06-09 Thread meng
Repository: spark
Updated Branches:
  refs/heads/branch-0.9 51f677eb9 -> 1d3aab961


[SPARK-1870] Made deployment with --jars work in yarn-standalone mode.

Ported from the 1.0 branch to the 0.9 branch. Sends secondary jars to the distributed
cache of all containers and adds the cached jars to the classpath before executors start.
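
Condensed, the change has two halves: the client records the distributed-cache
link names of the --jars entries under a new conf key, and the container-side
classpath code reads that key back. A hedged sketch of the idea (helper and
object names are invented; the real logic is in the Client.scala diffs below):

```scala
import scala.collection.mutable

// Hypothetical condensed version of the two halves of the change.
object SecondaryJarsSketch {
  val ConfKey = "spark.yarn.secondary.jars" // conf key introduced by the patch

  // Client side: remember the cache link name of every --jars entry.
  def recordSecondaryJars(addJars: Seq[String], conf: mutable.Map[String, String]): Unit = {
    conf(ConfKey) = addJars.map(jar => new java.io.File(jar).getName).mkString(",")
  }

  // Container side: read the key back, dropping the empty entry that
  // "".split(",") would otherwise contribute to the classpath.
  def secondaryJarClasspath(conf: collection.Map[String, String]): Seq[String] =
    conf.getOrElse(ConfKey, "").split(",").filter(_.nonEmpty).toSeq

  def main(args: Array[String]): Unit = {
    val conf = mutable.Map.empty[String, String]
    recordSecondaryJars(Seq("/tmp/libA.jar", "/tmp/libB.jar"), conf)
    println(secondaryJarClasspath(conf)) // List(libA.jar, libB.jar)
  }
}
```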

Author: DB Tsai 

Closes #1013 from dbtsai/branch-0.9 and squashes the following commits:

c5696f4 [DB Tsai] fix line too long
b085f10 [DB Tsai] Make sure that empty string is filtered out when we get secondary jars
3cc1085 [DB Tsai] changed from var to val
ab94aa1 [DB Tsai] Code formatting.
0956af9 [DB Tsai] Ported SPARK-1870 from 1.0 branch to 0.9 branch


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1d3aab96
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1d3aab96
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1d3aab96

Branch: refs/heads/branch-0.9
Commit: 1d3aab96120c6770399e78a72b5692cf8f61a144
Parents: 51f677e
Author: DB Tsai 
Authored: Mon Jun 9 22:56:24 2014 -0700
Committer: Xiangrui Meng 
Committed: Mon Jun 9 22:56:24 2014 -0700

--
 .../org/apache/spark/deploy/yarn/Client.scala   | 24 
 .../org/apache/spark/deploy/yarn/Client.scala   | 29 ++--
 2 files changed, 39 insertions(+), 14 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/1d3aab96/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
--
diff --git 
a/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/Client.scala 
b/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index 6956cd6..9e5e2d5 100644
--- a/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ b/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -21,8 +21,7 @@ import java.net.{InetAddress, UnknownHostException, URI}
 import java.nio.ByteBuffer
 
 import scala.collection.JavaConversions._
-import scala.collection.mutable.HashMap
-import scala.collection.mutable.Map
+import scala.collection.mutable.{HashMap, ListBuffer, Map}
 
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileContext, FileStatus, FileSystem, Path, 
FileUtil}
@@ -264,6 +263,7 @@ class Client(args: ClientArguments, conf: Configuration, 
sparkConf: SparkConf)
 }
 
 // handle any add jars
+val cachedSecondaryJarLinks = ListBuffer.empty[String]
 if ((args.addJars != null) && (!args.addJars.isEmpty())){
   args.addJars.split(',').foreach { case file: String =>
 val localURI = new URI(file.trim())
@@ -271,9 +271,11 @@ class Client(args: ClientArguments, conf: Configuration, 
sparkConf: SparkConf)
 val linkname = 
Option(localURI.getFragment()).getOrElse(localPath.getName())
 val destPath = copyRemoteFile(dst, localPath, replication)
 distCacheMgr.addResource(fs, conf, destPath, localResources, 
LocalResourceType.FILE,
-  linkname, statCache, true)
+  linkname, statCache)
+cachedSecondaryJarLinks += linkname
   }
 }
+sparkConf.set(Client.CONF_SPARK_YARN_SECONDARY_JARS, 
cachedSecondaryJarLinks.mkString(","))
 
 // handle any distributed cache files
 if ((args.files != null) && (!args.files.isEmpty())){
@@ -462,9 +464,10 @@ class Client(args: ClientArguments, conf: Configuration, 
sparkConf: SparkConf)
 }
 
 object Client {
-  val SPARK_JAR: String = "spark.jar"
-  val APP_JAR: String = "app.jar"
+  val SPARK_JAR: String = "__spark__.jar"
+  val APP_JAR: String = "__app__.jar"
   val LOG4J_PROP: String = "log4j.properties"
+  val CONF_SPARK_YARN_SECONDARY_JARS = "spark.yarn.secondary.jars"
 
   def main(argStrings: Array[String]) {
 // Set an env variable indicating we are running in YARN mode.
@@ -491,11 +494,19 @@ object Client {
   Apps.addToEnvironment(env, Environment.CLASSPATH.name, 
Environment.PWD.$() +
 Path.SEPARATOR + LOG4J_PROP)
 }
+
+val cachedSecondaryJarLinks =
+  
sparkConf.getOption(CONF_SPARK_YARN_SECONDARY_JARS).getOrElse("").split(",")
+.filter(_.nonEmpty)
+
 // Normally the users app.jar is last in case conflicts with spark jars
 val userClasspathFirst = sparkConf.get("spark.yarn.user.classpath.first", 
"false").toBoolean
 if (userClasspathFirst) {
   Apps.addToEnvironment(env, Environment.CLASSPATH.name, 
Environment.PWD.$() +
 Path.SEPARATOR + APP_JAR)
+  cachedSecondaryJarLinks.foreach(jarLink =>
+Apps.addToEnvironment(env, Environment.CLASSPATH.name, 
Environment.PWD.$() +
+  Path.SEPARATOR + jarLink))
 }
 Apps.addToEnvironment(env, Environment.CLASSPATH.name, Environment.PWD.$() 
+
   Path.SEPARATOR + SPARK_JAR)
@@ -504,6 +515,9 @@ object Client {
 if (!us

git commit: Make sure that empty string is filtered out when we get the secondary jars from conf

2014-06-09 Thread meng
Repository: spark
Updated Branches:
  refs/heads/master a9ec033c8 -> 6f2db8c2f


Make sure that empty string is filtered out when we get the secondary jars from conf
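
The one-line rationale, shown as a tiny sketch (not from the patch): in Scala,
splitting an empty string on "," yields one empty element, so without the filter
an empty spark.yarn.secondary.jars value would put a bogus entry on the classpath.

```scala
object SplitFilterDemo extends App {
  val raw = "" // no secondary jars configured
  println(raw.split(",").length)                    // 1 -- a single empty element
  println(raw.split(",").filter(_.nonEmpty).length) // 0 -- what we actually want
}
```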

Author: DB Tsai 

Closes #1027 from dbtsai/dbtsai-classloader and squashes the following commits:

9ac6be3 [DB Tsai] Fixed line too long
c9c7ad7 [DB Tsai] Make sure that empty string is filtered out when we get the secondary jars from conf.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f2db8c2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f2db8c2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f2db8c2

Branch: refs/heads/master
Commit: 6f2db8c2f51911f88a601ec5bf1509ea0e8173ed
Parents: a9ec033
Author: DB Tsai 
Authored: Mon Jun 9 22:18:50 2014 -0700
Committer: Xiangrui Meng 
Committed: Mon Jun 9 22:18:50 2014 -0700

--
 .../main/scala/org/apache/spark/deploy/yarn/ClientBase.scala   | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6f2db8c2/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
--
diff --git 
a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala 
b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
index 4b5e0ef..801e8b3 100644
--- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
+++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
@@ -221,7 +221,7 @@ trait ClientBase extends Logging {
   }
 }
 
-var cachedSecondaryJarLinks = ListBuffer.empty[String]
+val cachedSecondaryJarLinks = ListBuffer.empty[String]
 val fileLists = List( (args.addJars, LocalResourceType.FILE, true),
   (args.files, LocalResourceType.FILE, false),
   (args.archives, LocalResourceType.ARCHIVE, false) )
@@ -502,12 +502,14 @@ object ClientBase extends Logging {
 def addClasspathEntry(path: String) = 
YarnSparkHadoopUtil.addToEnvironment(env,
   Environment.CLASSPATH.name, path, File.pathSeparator)
 /** Add entry to the classpath. Interpreted as a path relative to the 
working directory. */
-def addPwdClasspathEntry(entry: String) = 
addClasspathEntry(Environment.PWD.$() + Path.SEPARATOR + entry)
+def addPwdClasspathEntry(entry: String) =
+  addClasspathEntry(Environment.PWD.$() + Path.SEPARATOR + entry)
 
 extraClassPath.foreach(addClasspathEntry)
 
 val cachedSecondaryJarLinks =
   
sparkConf.getOption(CONF_SPARK_YARN_SECONDARY_JARS).getOrElse("").split(",")
+.filter(_.nonEmpty)
 // Normally the users app.jar is last in case conflicts with spark jars
 if (sparkConf.get("spark.yarn.user.classpath.first", "false").toBoolean) {
   addPwdClasspathEntry(APP_JAR)



git commit: [SPARK-1704][SQL] Fully support EXPLAIN commands as SchemaRDD.

2014-06-09 Thread marmbrus
Repository: spark
Updated Branches:
  refs/heads/branch-1.0 65fa7bcac -> 5a79ba13e


[SPARK-1704][SQL] Fully support EXPLAIN commands as SchemaRDD.

This PR attempts to resolve [SPARK-1704](https://issues.apache.org/jira/browse/SPARK-1704)
by introducing a physical plan for EXPLAIN commands, which simply prints out the debug
string (containing SparkSQL's various plans) of the corresponding QueryExecution for
the actual query.
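
In other words, EXPLAIN now behaves like any other query: it can be run through
the normal entry points and its plan text collected as rows. A hedged usage sketch
in the same spirit as the spark-shell examples further down in this digest
(assumes a Hive-enabled shell where hql and the src table are available):

```scala
scala> hql("EXPLAIN SELECT key, value FROM src").collect().foreach(println)
// prints the debug string of the query's plans; the result is a SchemaRDD
// with a single string column (named "plan" in the diff below)
```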

Author: Zongheng Yang 

Closes #1003 from concretevitamin/explain-cmd and squashes the following 
commits:

5b7911f [Zongheng Yang] Add a regression test.
1bfa379 [Zongheng Yang] Modify output().
719ada9 [Zongheng Yang] Override otherCopyArgs for ExplainCommandPhysical.
4318fd7 [Zongheng Yang] Make all output one Row.
439c6ab [Zongheng Yang] Minor cleanups.
408f574 [Zongheng Yang] SPARK-1704: Add CommandStrategy and 
ExplainCommandPhysical.

(cherry picked from commit a9ec033c8cf489898cc47e2043bd9e86b7df1ff8)
Signed-off-by: Michael Armbrust 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5a79ba13
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5a79ba13
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5a79ba13

Branch: refs/heads/branch-1.0
Commit: 5a79ba13ea75838fe53d99ca5aa289d81a58cdb3
Parents: 65fa7bc
Author: Zongheng Yang 
Authored: Mon Jun 9 16:47:44 2014 -0700
Committer: Michael Armbrust 
Committed: Mon Jun 9 16:47:56 2014 -0700

--
 .../catalyst/plans/logical/LogicalPlan.scala|  8 +++--
 .../scala/org/apache/spark/sql/SQLContext.scala |  6 
 .../spark/sql/execution/SparkStrategies.scala   | 11 +++
 .../apache/spark/sql/execution/commands.scala   | 32 
 .../org/apache/spark/sql/hive/HiveContext.scala |  3 +-
 .../sql/hive/execution/HiveQuerySuite.scala | 12 
 6 files changed, 68 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5a79ba13/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
index 2b8fbdc..4f641cd 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.QueryPlan
-import org.apache.spark.sql.catalyst.types.StructType
+import org.apache.spark.sql.catalyst.types.{StringType, StructType}
 import org.apache.spark.sql.catalyst.trees
 
 abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
@@ -102,7 +102,7 @@ abstract class LeafNode extends LogicalPlan with 
trees.LeafNode[LogicalPlan] {
  */
 abstract class Command extends LeafNode {
   self: Product =>
-  def output = Seq.empty
+  def output: Seq[Attribute] = Seq.empty
 }
 
 /**
@@ -115,7 +115,9 @@ case class NativeCommand(cmd: String) extends Command
  * Returned by a parser when the users only wants to see what query plan would 
be executed, without
  * actually performing the execution.
  */
-case class ExplainCommand(plan: LogicalPlan) extends Command
+case class ExplainCommand(plan: LogicalPlan) extends Command {
+  override def output = Seq(AttributeReference("plan", StringType, nullable = 
false)())
+}
 
 /**
  * A logical plan node with single child.

http://git-wip-us.apache.org/repos/asf/spark/blob/5a79ba13/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 0109dd6..9fccbd6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -191,6 +191,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
 val sparkContext = self.sparkContext
 
 val strategies: Seq[Strategy] =
+  CommandStrategy(self) ::
   TakeOrdered ::
   PartialAggregation ::
   LeftSemiJoin ::
@@ -252,6 +253,11 @@ class SQLContext(@transient val sparkContext: SparkContext)
   Batch("Prepare Expressions", Once, new BindReferences[SparkPlan]) :: Nil
   }
 
+  // TODO: or should we make QueryExecution protected[sql]?
+  protected[sql] def mkQueryExecution(plan: LogicalPlan) = new QueryExecution {
+val logical = plan
+  }
+
  

git commit: [SPARK-1704][SQL] Fully support EXPLAIN commands as SchemaRDD.

2014-06-09 Thread marmbrus
Repository: spark
Updated Branches:
  refs/heads/master c6e041d17 -> a9ec033c8


[SPARK-1704][SQL] Fully support EXPLAIN commands as SchemaRDD.

This PR attempts to resolve [SPARK-1704](https://issues.apache.org/jira/browse/SPARK-1704)
by introducing a physical plan for EXPLAIN commands, which simply prints out the debug
string (containing SparkSQL's various plans) of the corresponding QueryExecution for
the actual query.

Author: Zongheng Yang 

Closes #1003 from concretevitamin/explain-cmd and squashes the following 
commits:

5b7911f [Zongheng Yang] Add a regression test.
1bfa379 [Zongheng Yang] Modify output().
719ada9 [Zongheng Yang] Override otherCopyArgs for ExplainCommandPhysical.
4318fd7 [Zongheng Yang] Make all output one Row.
439c6ab [Zongheng Yang] Minor cleanups.
408f574 [Zongheng Yang] SPARK-1704: Add CommandStrategy and 
ExplainCommandPhysical.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a9ec033c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a9ec033c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a9ec033c

Branch: refs/heads/master
Commit: a9ec033c8cf489898cc47e2043bd9e86b7df1ff8
Parents: c6e041d
Author: Zongheng Yang 
Authored: Mon Jun 9 16:47:44 2014 -0700
Committer: Michael Armbrust 
Committed: Mon Jun 9 16:47:44 2014 -0700

--
 .../catalyst/plans/logical/LogicalPlan.scala|  8 +++--
 .../scala/org/apache/spark/sql/SQLContext.scala |  6 
 .../spark/sql/execution/SparkStrategies.scala   | 11 +++
 .../apache/spark/sql/execution/commands.scala   | 32 
 .../org/apache/spark/sql/hive/HiveContext.scala |  3 +-
 .../sql/hive/execution/HiveQuerySuite.scala | 12 
 6 files changed, 68 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a9ec033c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
index 2b8fbdc..4f641cd 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.QueryPlan
-import org.apache.spark.sql.catalyst.types.StructType
+import org.apache.spark.sql.catalyst.types.{StringType, StructType}
 import org.apache.spark.sql.catalyst.trees
 
 abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
@@ -102,7 +102,7 @@ abstract class LeafNode extends LogicalPlan with 
trees.LeafNode[LogicalPlan] {
  */
 abstract class Command extends LeafNode {
   self: Product =>
-  def output = Seq.empty
+  def output: Seq[Attribute] = Seq.empty
 }
 
 /**
@@ -115,7 +115,9 @@ case class NativeCommand(cmd: String) extends Command
  * Returned by a parser when the users only wants to see what query plan would 
be executed, without
  * actually performing the execution.
  */
-case class ExplainCommand(plan: LogicalPlan) extends Command
+case class ExplainCommand(plan: LogicalPlan) extends Command {
+  override def output = Seq(AttributeReference("plan", StringType, nullable = 
false)())
+}
 
 /**
  * A logical plan node with single child.

http://git-wip-us.apache.org/repos/asf/spark/blob/a9ec033c/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 5626f0d..fde4c48 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -191,6 +191,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
 val sparkContext = self.sparkContext
 
 val strategies: Seq[Strategy] =
+  CommandStrategy(self) ::
   TakeOrdered ::
   PartialAggregation ::
   LeftSemiJoin ::
@@ -256,6 +257,11 @@ class SQLContext(@transient val sparkContext: SparkContext)
   Batch("Prepare Expressions", Once, new BindReferences[SparkPlan]) :: Nil
   }
 
+  // TODO: or should we make QueryExecution protected[sql]?
+  protected[sql] def mkQueryExecution(plan: LogicalPlan) = new QueryExecution {
+val logical = plan
+  }
+
   /**
* The primary workflow for executing relational queries using Spark.  
Designed to allow easy
* ac

git commit: [SQL] Simple framework for debugging query execution

2014-06-09 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/branch-1.0 73cd1f822 -> 65fa7bcac


[SQL] Simple framework for debugging query execution

Only records number of tuples and unique dataTypes output right now...

Example:
```scala
scala> import org.apache.spark.sql.execution.debug._
scala> hql("SELECT value FROM src WHERE key > 10").debug(sparkContext)

Results returned: 489
== Project [value#1:0] ==
Tuples output: 489
 value StringType: {java.lang.String}
== Filter (key#0:1 > 10) ==
Tuples output: 489
 value StringType: {java.lang.String}
 key IntegerType: {java.lang.Integer}
== HiveTableScan [value#1,key#0], (MetastoreRelation default, src, None), None 
==
Tuples output: 500
 value StringType: {java.lang.String}
 key IntegerType: {java.lang.Integer}
```

Author: Michael Armbrust 

Closes #1005 from marmbrus/debug and squashes the following commits:

dcc3ca6 [Michael Armbrust] Add comments.
c9dded2 [Michael Armbrust] Simple framework for debugging query execution

(cherry picked from commit c6e041d171e3d9882ab15e2bd7a7217dc19647f6)
Signed-off-by: Reynold Xin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/65fa7bca
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/65fa7bca
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/65fa7bca

Branch: refs/heads/branch-1.0
Commit: 65fa7bcac81fc2a7a6c578775f72929cb201c20a
Parents: 73cd1f8
Author: Michael Armbrust 
Authored: Mon Jun 9 14:24:19 2014 -0700
Committer: Reynold Xin 
Committed: Mon Jun 9 14:24:52 2014 -0700

--
 .../scala/org/apache/spark/sql/SQLContext.scala |   5 -
 .../org/apache/spark/sql/execution/debug.scala  |  45 ---
 .../spark/sql/execution/debug/package.scala | 119 +++
 3 files changed, 119 insertions(+), 50 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/65fa7bca/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 5ebdffa..0109dd6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -281,11 +281,6 @@ class SQLContext(@transient val sparkContext: SparkContext)
  |== Physical Plan ==
  |${stringOrError(executedPlan)}
   """.stripMargin.trim
-
-/**
- * Runs the query after interposing operators that print the result of 
each intermediate step.
- */
-def debugExec() = DebugQuery(executedPlan).execute().collect()
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/spark/blob/65fa7bca/sql/core/src/main/scala/org/apache/spark/sql/execution/debug.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug.scala
deleted file mode 100644
index a0d2910..000
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug.scala
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.execution
-
-private[sql] object DebugQuery {
-  def apply(plan: SparkPlan): SparkPlan = {
-val visited = new collection.mutable.HashSet[Long]()
-plan transform {
-  case s: SparkPlan if !visited.contains(s.id) =>
-visited += s.id
-DebugNode(s)
-}
-  }
-}
-
-private[sql] case class DebugNode(child: SparkPlan) extends UnaryNode {
-  def references = Set.empty
-  def output = child.output
-  def execute() = {
-val childRdd = child.execute()
-println(
-  s"""
-|=
-|${child.simpleString}
-|=
-  """.stripMargin)
-childRdd.foreach(println(_))
-childRdd
-  }
-}

http://git-wip-us.apache.org/repos/asf/spark/blob/65fa7bca/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/p

git commit: [SQL] Simple framework for debugging query execution

2014-06-09 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/master e27344768 -> c6e041d17


[SQL] Simple framework for debugging query execution

Only records number of tuples and unique dataTypes output right now...

Example:
```scala
scala> import org.apache.spark.sql.execution.debug._
scala> hql("SELECT value FROM src WHERE key > 10").debug(sparkContext)

Results returned: 489
== Project [value#1:0] ==
Tuples output: 489
 value StringType: {java.lang.String}
== Filter (key#0:1 > 10) ==
Tuples output: 489
 value StringType: {java.lang.String}
 key IntegerType: {java.lang.Integer}
== HiveTableScan [value#1,key#0], (MetastoreRelation default, src, None), None 
==
Tuples output: 500
 value StringType: {java.lang.String}
 key IntegerType: {java.lang.Integer}
```

Author: Michael Armbrust 

Closes #1005 from marmbrus/debug and squashes the following commits:

dcc3ca6 [Michael Armbrust] Add comments.
c9dded2 [Michael Armbrust] Simple framework for debugging query execution


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c6e041d1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c6e041d1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c6e041d1

Branch: refs/heads/master
Commit: c6e041d171e3d9882ab15e2bd7a7217dc19647f6
Parents: e273447
Author: Michael Armbrust 
Authored: Mon Jun 9 14:24:19 2014 -0700
Committer: Reynold Xin 
Committed: Mon Jun 9 14:24:19 2014 -0700

--
 .../scala/org/apache/spark/sql/SQLContext.scala |   5 -
 .../org/apache/spark/sql/execution/debug.scala  |  45 ---
 .../spark/sql/execution/debug/package.scala | 119 +++
 3 files changed, 119 insertions(+), 50 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c6e041d1/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index e371c82..5626f0d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -285,11 +285,6 @@ class SQLContext(@transient val sparkContext: SparkContext)
  |== Physical Plan ==
  |${stringOrError(executedPlan)}
   """.stripMargin.trim
-
-/**
- * Runs the query after interposing operators that print the result of 
each intermediate step.
- */
-def debugExec() = DebugQuery(executedPlan).execute().collect()
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/spark/blob/c6e041d1/sql/core/src/main/scala/org/apache/spark/sql/execution/debug.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug.scala
deleted file mode 100644
index a0d2910..000
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug.scala
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.execution
-
-private[sql] object DebugQuery {
-  def apply(plan: SparkPlan): SparkPlan = {
-val visited = new collection.mutable.HashSet[Long]()
-plan transform {
-  case s: SparkPlan if !visited.contains(s.id) =>
-visited += s.id
-DebugNode(s)
-}
-  }
-}
-
-private[sql] case class DebugNode(child: SparkPlan) extends UnaryNode {
-  def references = Set.empty
-  def output = child.output
-  def execute() = {
-val childRdd = child.execute()
-println(
-  s"""
-|=
-|${child.simpleString}
-|=
-  """.stripMargin)
-childRdd.foreach(println(_))
-childRdd
-  }
-}

http://git-wip-us.apache.org/repos/asf/spark/blob/c6e041d1/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
--
diff --git 
a/sql/core

git commit: [SPARK-1522] : YARN ClientBase throws a NPE if there is no YARN Application CP

2014-06-09 Thread tgraves
Repository: spark
Updated Branches:
  refs/heads/master 6cf335d79 -> e27344768


[SPARK-1522] : YARN ClientBase throws a NPE if there is no YARN Application CP

The current implementation of ClientBase.getDefaultYarnApplicationClasspath inspects
the MRJobConfig class for the field DEFAULT_YARN_APPLICATION_CLASSPATH when it should
really be looking into YarnConfiguration. If the Application Configuration has no
yarn.application.classpath defined, an NPE will be thrown.
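
The safer lookup pattern is roughly the following (a standalone sketch under stated
assumptions, not the ClientBase code): resolve the constant from YarnConfiguration
and treat a missing field as "no default" rather than letting a null propagate.

```scala
import scala.util.Try

object YarnClasspathLookupSketch {
  // Illustrative only: look the field up reflectively so this still works against
  // YARN versions that do not define it, returning None instead of a null that
  // would later cause an NPE.
  def defaultYarnApplicationClasspath: Option[Seq[String]] =
    Try {
      Class.forName("org.apache.hadoop.yarn.conf.YarnConfiguration")
        .getField("DEFAULT_YARN_APPLICATION_CLASSPATH")
        .get(null)
        .asInstanceOf[Array[String]]
        .toSeq
    }.toOption

  def main(args: Array[String]): Unit =
    println(defaultYarnApplicationClasspath.getOrElse(Seq("<no default classpath defined>")))
}
```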

Additional Changes include:
* Test Suite for ClientBase added

[ticket: SPARK-1522] : https://issues.apache.org/jira/browse/SPARK-1522

Author  : bernardo.gomezpala...@gmail.com
Testing : SPARK_HADOOP_VERSION=2.3.0 SPARK_YARN=true ./sbt/sbt test

Author: Bernardo Gomez Palacio 

Closes #433 from berngp/feature/SPARK-1522 and squashes the following commits:

2c2e118 [Bernardo Gomez Palacio] [SPARK-1522]: YARN ClientBase throws a NPE if 
there is no YARN Application specific CP


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e2734476
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e2734476
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e2734476

Branch: refs/heads/master
Commit: e273447684779a18bd61d733bfe7958b78657ffd
Parents: 6cf335d
Author: Bernardo Gomez Palacio 
Authored: Mon Jun 9 16:14:54 2014 -0500
Committer: Thomas Graves 
Committed: Mon Jun 9 16:14:54 2014 -0500

--
 .../apache/spark/deploy/yarn/ClientBase.scala   |  89 +--
 .../spark/deploy/yarn/ClientBaseSuite.scala | 112 +++
 2 files changed, 167 insertions(+), 34 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e2734476/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
--
diff --git 
a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala 
b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
index aeb3f00..4b5e0ef 100644
--- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
+++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
@@ -23,6 +23,7 @@ import java.nio.ByteBuffer
 
 import scala.collection.JavaConversions._
 import scala.collection.mutable.{HashMap, ListBuffer, Map}
+import scala.util.{Try, Success, Failure}
 
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
@@ -378,7 +379,7 @@ trait ClientBase extends Logging {
   }
 }
 
-object ClientBase {
+object ClientBase extends Logging {
   val SPARK_JAR: String = "__spark__.jar"
   val APP_JAR: String = "__app__.jar"
   val LOG4J_PROP: String = "log4j.properties"
@@ -388,37 +389,47 @@ object ClientBase {
 
   def getSparkJar = 
sys.env.get("SPARK_JAR").getOrElse(SparkContext.jarOfClass(this.getClass).head)
 
-  // Based on code from org.apache.hadoop.mapreduce.v2.util.MRApps
-  def populateHadoopClasspath(conf: Configuration, env: HashMap[String, 
String]) {
-val classpathEntries = Option(conf.getStrings(
-  YarnConfiguration.YARN_APPLICATION_CLASSPATH)).getOrElse(
-getDefaultYarnApplicationClasspath())
-if (classpathEntries != null) {
-  for (c <- classpathEntries) {
-YarnSparkHadoopUtil.addToEnvironment(env, Environment.CLASSPATH.name, 
c.trim,
-  File.pathSeparator)
-  }
+  def populateHadoopClasspath(conf: Configuration, env: HashMap[String, 
String]) = {
+val classPathElementsToAdd = getYarnAppClasspath(conf) ++ 
getMRAppClasspath(conf)
+for (c <- classPathElementsToAdd.flatten) {
+  YarnSparkHadoopUtil.addToEnvironment(
+env,
+Environment.CLASSPATH.name,
+c.trim,
+File.pathSeparator)
 }
+classPathElementsToAdd
+  }
 
-val mrClasspathEntries = Option(conf.getStrings(
-  "mapreduce.application.classpath")).getOrElse(
-getDefaultMRApplicationClasspath())
-if (mrClasspathEntries != null) {
-  for (c <- mrClasspathEntries) {
-YarnSparkHadoopUtil.addToEnvironment(env, Environment.CLASSPATH.name, 
c.trim,
-  File.pathSeparator)
-  }
-}
+  private def getYarnAppClasspath(conf: Configuration): Option[Seq[String]] =
+Option(conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH)) 
match {
+  case Some(s) => Some(s.toSeq)
+  case None => getDefaultYarnApplicationClasspath
   }
 
-  def getDefaultYarnApplicationClasspath(): Array[String] = {
-try {
-  val field = 
classOf[MRJobConfig].getField("DEFAULT_YARN_APPLICATION_CLASSPATH")
-  field.get(null).asInstanceOf[Array[String]]
-} catch {
-  case err: NoSuchFieldError => null
-  case err: NoSuchFieldException => null
+  private def getMRAppClasspath(conf: Configuration): Option[Seq[String]] =
+Option(conf

git commit: Added a TaskSetManager unit test.

2014-06-09 Thread kayousterhout
Repository: spark
Updated Branches:
  refs/heads/master 0cf600280 -> 6cf335d79


Added a TaskSetManager unit test.

This test ensures that when there are no
alive executors that satisfy a particular locality level,
the TaskSetManager doesn't ever use that as the maximum
allowed locality level (this optimization ensures that a
job doesn't wait extra time in an attempt to satisfy
a scheduling locality level that is impossible).

@mateiz and @lirui-intel this unit test illustrates an issue
with #892 (it fails with that patch).

Author: Kay Ousterhout 

Closes #1024 from kayousterhout/scheduler_unit_test and squashes the following 
commits:

de6a08f [Kay Ousterhout] Added a TaskSetManager unit test.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6cf335d7
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6cf335d7
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6cf335d7

Branch: refs/heads/master
Commit: 6cf335d79a2f69ecd9a139dd0a03acff60585be4
Parents: 0cf6002
Author: Kay Ousterhout 
Authored: Mon Jun 9 13:13:53 2014 -0700
Committer: Kay Ousterhout 
Committed: Mon Jun 9 13:13:53 2014 -0700

--
 .../spark/scheduler/TaskSetManagerSuite.scala   | 16 
 1 file changed, 16 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6cf335d7/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
--
diff --git 
a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala 
b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
index c92b6dc..6f1fd25 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
@@ -141,6 +141,22 @@ class TaskSetManagerSuite extends FunSuite with 
LocalSparkContext with Logging {
 assert(sched.finishedManagers.contains(manager))
   }
 
+  test("skip unsatisfiable locality levels") {
+sc = new SparkContext("local", "test")
+val sched = new FakeTaskScheduler(sc, ("execA", "host1"), ("execC", 
"host2"))
+val taskSet = FakeTask.createTaskSet(1, Seq(TaskLocation("host1", 
"execB")))
+val clock = new FakeClock
+val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock)
+
+// An executor that is not NODE_LOCAL should be rejected.
+assert(manager.resourceOffer("execC", "host2", ANY) === None)
+
+// Because there are no alive PROCESS_LOCAL executors, the base locality 
level should be
+// NODE_LOCAL. So, we should schedule the task on this offered NODE_LOCAL 
executor before
+// any of the locality wait timers expire.
+assert(manager.resourceOffer("execA", "host1", ANY).get.index === 0)
+  }
+
   test("basic delay scheduling") {
 sc = new SparkContext("local", "test")
 val sched = new FakeTaskScheduler(sc, ("exec1", "host1"), ("exec2", 
"host2"))



git commit: [SPARK-1495][SQL]add support for left semi join

2014-06-09 Thread marmbrus
Repository: spark
Updated Branches:
  refs/heads/branch-1.0 a5848d325 -> 73cd1f822


[SPARK-1495][SQL]add support for left semi join

Just submit another solution for #395

Author: Daoyuan 
Author: Michael Armbrust 
Author: Daoyuan Wang 

Closes #837 from adrian-wang/left-semi-join-support and squashes the following 
commits:

d39cd12 [Daoyuan Wang] Merge pull request #1 from marmbrus/pr/837
6713c09 [Michael Armbrust] Better debugging for failed query tests.
035b73e [Michael Armbrust] Add test for left semi that can't be done with a 
hash join.
5ec6fa4 [Michael Armbrust] Add left semi to SQL Parser.
4c726e5 [Daoyuan] improvement according to Michael
8d4a121 [Daoyuan] add golden files for leftsemijoin
83a3c8a [Daoyuan] scala style fix
14cff80 [Daoyuan] add support for left semi join

(cherry picked from commit 0cf600280167a94faec75736223256e8f2e48085)
Signed-off-by: Michael Armbrust 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/73cd1f82
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/73cd1f82
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/73cd1f82

Branch: refs/heads/branch-1.0
Commit: 73cd1f8223a4799fd104fe48ba011315236cf4a8
Parents: a5848d3
Author: Daoyuan 
Authored: Mon Jun 9 11:31:36 2014 -0700
Committer: Michael Armbrust 
Committed: Mon Jun 9 11:32:04 2014 -0700

--
 .../apache/spark/sql/catalyst/SqlParser.scala   |   2 +
 .../spark/sql/catalyst/planning/patterns.scala  |   5 +
 .../spark/sql/catalyst/plans/joinTypes.scala|   1 +
 .../catalyst/plans/logical/basicOperators.scala |   9 +-
 .../scala/org/apache/spark/sql/SQLContext.scala |   1 +
 .../spark/sql/execution/SparkStrategies.scala   |  16 +++
 .../org/apache/spark/sql/execution/joins.scala  | 131 +++
 .../scala/org/apache/spark/sql/QueryTest.scala  |   2 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala|   7 +
 .../org/apache/spark/sql/hive/HiveContext.scala |   1 +
 .../org/apache/spark/sql/hive/HiveQl.scala  |   1 +
 ...tsemijoin-0-80b6466213face7fbcb0de044611e1f5 |   0
 ...tsemijoin-1-d1f6a3dea28a5f0fee08026bf33d9129 |   0
 ...semijoin-10-89737a8857b5b61cc909e0c797f86aea |   4 +
 ...semijoin-11-80b6466213face7fbcb0de044611e1f5 |   0
 ...semijoin-12-d1f6a3dea28a5f0fee08026bf33d9129 |   0
 ...tsemijoin-2-43d53504df013e6b35f81811138a167a |   1 +
 ...tsemijoin-3-b07d292423312aafa5e5762a579decd2 |   0
 ...tsemijoin-4-3ac2226efe7cb5d999c1c5e4ac2114be |   0
 ...tsemijoin-5-9c307c0559d735960ce77efa95b2b17b |   0
 ...tsemijoin-6-82921fc96eef547ec0f71027ee88298c |   0
 ...tsemijoin-7-b30aa3b4a45db6b64bb46b4d9bd32ff0 |   0
 ...eftsemijoin-8-73cad58a10a1483ccb15e94a857013 |   4 +
 ...tsemijoin-9-c5efa6b8771a51610d655be461670e1e |   2 +
 ...mijoin_mr-0-7087fb6281a34d00f1812d2ff4ba8b75 |   0
 ...mijoin_mr-1-aa3f07f028027ffd13ab5535dc821593 |   0
 ...ijoin_mr-10-9914f44ecb6ae7587b62e5349ff60d04 |   1 +
 ...ijoin_mr-11-2027ecb1495d5550c5d56abf6b95b0a7 |   2 +
 ...mijoin_mr-2-3f65953ae60375156367c54533978782 |   0
 ...mijoin_mr-3-645cf8b871c9b27418d6fa1d1bda9a52 |   0
 ...mijoin_mr-4-333895fe6abca27c8edb5c91bfe10d2f |   2 +
 ...mijoin_mr-5-896d0948c1df849df9764a6d8ad8fff9 |  20 +++
 ...mijoin_mr-6-b1e2ade89ae898650f0be4f796d8947b |   1 +
 ...mijoin_mr-7-8e9c2969b999557363e40f9ebb3f6d7c |   1 +
 ...mijoin_mr-8-c61b972d4409babe41d8963e841af45b |   1 +
 ...mijoin_mr-9-2027ecb1495d5550c5d56abf6b95b0a7 |   2 +
 .../hive/execution/HiveCompatibilitySuite.scala |   2 +
 37 files changed, 216 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/73cd1f82/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index a404e74..cc65012 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -131,6 +131,7 @@ class SqlParser extends StandardTokenParsers with 
PackratParsers {
   protected val OUTER = Keyword("OUTER")
   protected val RIGHT = Keyword("RIGHT")
   protected val SELECT = Keyword("SELECT")
+  protected val SEMI = Keyword("SEMI")
   protected val STRING = Keyword("STRING")
   protected val SUM = Keyword("SUM")
   protected val TRUE = Keyword("TRUE")
@@ -241,6 +242,7 @@ class SqlParser extends StandardTokenParsers with 
PackratParsers {
 
protected lazy val joinType: Parser[JoinType] =
  INNER ^^^ Inner |
+ LEFT ~ SEMI ^^^ LeftSemi |
  LEFT ~ opt(OUTER) ^^^ LeftOuter |
  RIGHT ~ opt(OUTER) ^^^ RightOuter |
  FULL ~ opt(OUTER) ^^^ FullOuter
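
With LEFT SEMI wired into joinType above, the construct can be used directly from
SQL. A minimal illustration (the table names and the surrounding spark-shell session
are assumptions, not part of the patch); a left semi join keeps each left-side row
at most once when it has a match on the right and never exposes right-side columns:

```scala
scala> // assumes registered tables people(id, name) and orders(person_id, ...)
scala> sql("SELECT p.name FROM people p LEFT SEMI JOIN orders o ON p.id = o.person_id").collect()
```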

http://git-wip-us.apache.org/repos/asf/spark/blob/73cd1f8

git commit: [SPARK-1495][SQL]add support for left semi join

2014-06-09 Thread marmbrus
Repository: spark
Updated Branches:
  refs/heads/master 35630c86f -> 0cf600280


[SPARK-1495][SQL]add support for left semi join

Just submit another solution for #395

Author: Daoyuan 
Author: Michael Armbrust 
Author: Daoyuan Wang 

Closes #837 from adrian-wang/left-semi-join-support and squashes the following 
commits:

d39cd12 [Daoyuan Wang] Merge pull request #1 from marmbrus/pr/837
6713c09 [Michael Armbrust] Better debugging for failed query tests.
035b73e [Michael Armbrust] Add test for left semi that can't be done with a 
hash join.
5ec6fa4 [Michael Armbrust] Add left semi to SQL Parser.
4c726e5 [Daoyuan] improvement according to Michael
8d4a121 [Daoyuan] add golden files for leftsemijoin
83a3c8a [Daoyuan] scala style fix
14cff80 [Daoyuan] add support for left semi join


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0cf60028
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0cf60028
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0cf60028

Branch: refs/heads/master
Commit: 0cf600280167a94faec75736223256e8f2e48085
Parents: 35630c8
Author: Daoyuan 
Authored: Mon Jun 9 11:31:36 2014 -0700
Committer: Michael Armbrust 
Committed: Mon Jun 9 11:31:36 2014 -0700

--
 .../apache/spark/sql/catalyst/SqlParser.scala   |   2 +
 .../spark/sql/catalyst/planning/patterns.scala  |   5 +
 .../spark/sql/catalyst/plans/joinTypes.scala|   1 +
 .../catalyst/plans/logical/basicOperators.scala |   9 +-
 .../scala/org/apache/spark/sql/SQLContext.scala |   1 +
 .../spark/sql/execution/SparkStrategies.scala   |  16 +++
 .../org/apache/spark/sql/execution/joins.scala  | 131 +++
 .../scala/org/apache/spark/sql/QueryTest.scala  |   2 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala|   7 +
 .../org/apache/spark/sql/hive/HiveContext.scala |   1 +
 .../org/apache/spark/sql/hive/HiveQl.scala  |   1 +
 ...tsemijoin-0-80b6466213face7fbcb0de044611e1f5 |   0
 ...tsemijoin-1-d1f6a3dea28a5f0fee08026bf33d9129 |   0
 ...semijoin-10-89737a8857b5b61cc909e0c797f86aea |   4 +
 ...semijoin-11-80b6466213face7fbcb0de044611e1f5 |   0
 ...semijoin-12-d1f6a3dea28a5f0fee08026bf33d9129 |   0
 ...tsemijoin-2-43d53504df013e6b35f81811138a167a |   1 +
 ...tsemijoin-3-b07d292423312aafa5e5762a579decd2 |   0
 ...tsemijoin-4-3ac2226efe7cb5d999c1c5e4ac2114be |   0
 ...tsemijoin-5-9c307c0559d735960ce77efa95b2b17b |   0
 ...tsemijoin-6-82921fc96eef547ec0f71027ee88298c |   0
 ...tsemijoin-7-b30aa3b4a45db6b64bb46b4d9bd32ff0 |   0
 ...eftsemijoin-8-73cad58a10a1483ccb15e94a857013 |   4 +
 ...tsemijoin-9-c5efa6b8771a51610d655be461670e1e |   2 +
 ...mijoin_mr-0-7087fb6281a34d00f1812d2ff4ba8b75 |   0
 ...mijoin_mr-1-aa3f07f028027ffd13ab5535dc821593 |   0
 ...ijoin_mr-10-9914f44ecb6ae7587b62e5349ff60d04 |   1 +
 ...ijoin_mr-11-2027ecb1495d5550c5d56abf6b95b0a7 |   2 +
 ...mijoin_mr-2-3f65953ae60375156367c54533978782 |   0
 ...mijoin_mr-3-645cf8b871c9b27418d6fa1d1bda9a52 |   0
 ...mijoin_mr-4-333895fe6abca27c8edb5c91bfe10d2f |   2 +
 ...mijoin_mr-5-896d0948c1df849df9764a6d8ad8fff9 |  20 +++
 ...mijoin_mr-6-b1e2ade89ae898650f0be4f796d8947b |   1 +
 ...mijoin_mr-7-8e9c2969b999557363e40f9ebb3f6d7c |   1 +
 ...mijoin_mr-8-c61b972d4409babe41d8963e841af45b |   1 +
 ...mijoin_mr-9-2027ecb1495d5550c5d56abf6b95b0a7 |   2 +
 .../hive/execution/HiveCompatibilitySuite.scala |   2 +
 37 files changed, 216 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0cf60028/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index a404e74..cc65012 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -131,6 +131,7 @@ class SqlParser extends StandardTokenParsers with 
PackratParsers {
   protected val OUTER = Keyword("OUTER")
   protected val RIGHT = Keyword("RIGHT")
   protected val SELECT = Keyword("SELECT")
+  protected val SEMI = Keyword("SEMI")
   protected val STRING = Keyword("STRING")
   protected val SUM = Keyword("SUM")
   protected val TRUE = Keyword("TRUE")
@@ -241,6 +242,7 @@ class SqlParser extends StandardTokenParsers with 
PackratParsers {
 
protected lazy val joinType: Parser[JoinType] =
  INNER ^^^ Inner |
+ LEFT ~ SEMI ^^^ LeftSemi |
  LEFT ~ opt(OUTER) ^^^ LeftOuter |
  RIGHT ~ opt(OUTER) ^^^ RightOuter |
  FULL ~ opt(OUTER) ^^^ FullOuter

http://git-wip-us.apache.org/repos/asf/spark/blob/0cf60028/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala
---

git commit: SPARK-1944 Document --verbose in spark-shell -h

2014-06-09 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/master 6113ac155 -> 35630c86f


SPARK-1944 Document --verbose in spark-shell -h

https://issues.apache.org/jira/browse/SPARK-1944

Author: Andrew Ash 

Closes #1020 from ash211/SPARK-1944 and squashes the following commits:

a831c4d [Andrew Ash] SPARK-1944 Document --verbose in spark-shell -h


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/35630c86
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/35630c86
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/35630c86

Branch: refs/heads/master
Commit: 35630c86ff0e27862c9d902887eb0a24d25867ae
Parents: 6113ac1
Author: Andrew Ash 
Authored: Mon Jun 9 10:21:21 2014 -0700
Committer: Reynold Xin 
Committed: Mon Jun 9 10:21:21 2014 -0700

--
 .../main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala | 3 +++
 1 file changed, 3 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/35630c86/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
--
diff --git 
a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala 
b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index 153eee3..f1032ea 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -360,6 +360,9 @@ private[spark] class SparkSubmitArguments(args: 
Seq[String]) {
 |
 |  --executor-memory MEM   Memory per executor (e.g. 1000M, 2G) 
(Default: 1G).
 |
+|  --help, -h  Show this help message and exit
+|  --verbose, -v   Print additional debug output
+|
 | Spark standalone with cluster deploy mode only:
 |  --driver-cores NUM  Cores for driver (Default: 1).
 |  --supervise If given, restarts the driver on 
failure.



git commit: SPARK-1944 Document --verbose in spark-shell -h

2014-06-09 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/branch-1.0 502a8f795 -> a5848d325


SPARK-1944 Document --verbose in spark-shell -h

https://issues.apache.org/jira/browse/SPARK-1944

Author: Andrew Ash 

Closes #1020 from ash211/SPARK-1944 and squashes the following commits:

a831c4d [Andrew Ash] SPARK-1944 Document --verbose in spark-shell -h

(cherry picked from commit 35630c86ff0e27862c9d902887eb0a24d25867ae)
Signed-off-by: Reynold Xin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a5848d32
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a5848d32
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a5848d32

Branch: refs/heads/branch-1.0
Commit: a5848d325ae0909072800cbb3ea9ad73a3708965
Parents: 502a8f7
Author: Andrew Ash 
Authored: Mon Jun 9 10:21:21 2014 -0700
Committer: Reynold Xin 
Committed: Mon Jun 9 10:21:45 2014 -0700

--
 .../main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala | 3 +++
 1 file changed, 3 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a5848d32/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
--
diff --git 
a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala 
b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index 153eee3..f1032ea 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -360,6 +360,9 @@ private[spark] class SparkSubmitArguments(args: 
Seq[String]) {
 |
 |  --executor-memory MEM   Memory per executor (e.g. 1000M, 2G) 
(Default: 1G).
 |
+|  --help, -h  Show this help message and exit
+|  --verbose, -v   Print additional debug output
+|
 | Spark standalone with cluster deploy mode only:
 |  --driver-cores NUM  Cores for driver (Default: 1).
 |  --supervise If given, restarts the driver on 
failure.



git commit: [SPARK-1308] Add getNumPartitions to pyspark RDD

2014-06-09 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/master 32ee9f066 -> 6113ac155


[SPARK-1308] Add getNumPartitions to pyspark RDD

Add getNumPartitions to the pyspark RDD to provide an intuitive way to get the number
of partitions of an RDD, as we can do in Scala today.
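
For reference, the Scala parity mentioned here is the partition count exposed on the
RDD itself; a trivial spark-shell illustration (assumes the usual sc):

```scala
scala> val rdd = sc.parallelize(1 to 4, 2)
scala> rdd.partitions.size  // 2 -- Scala counterpart of the new Python getNumPartitions()
```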

Author: Syed Hashmi 

Closes #995 from syedhashmi/master and squashes the following commits:

de0ed5e [Syed Hashmi] [SPARK-1308] Add getNumPartitions to pyspark RDD


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6113ac15
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6113ac15
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6113ac15

Branch: refs/heads/master
Commit: 6113ac1559d62c828dfbf08ef0f7f172c24cf7f5
Parents: 32ee9f0
Author: Syed Hashmi 
Authored: Mon Jun 9 00:08:40 2014 -0700
Committer: Reynold Xin 
Committed: Mon Jun 9 00:08:40 2014 -0700

--
 python/pyspark/rdd.py | 45 +++--
 1 file changed, 27 insertions(+), 18 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6113ac15/python/pyspark/rdd.py
--
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index ca0a955..9c69c79 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -250,7 +250,7 @@ class RDD(object):
 def map(self, f, preservesPartitioning=False):
 """
 Return a new RDD by applying a function to each element of this RDD.
-
+
 >>> rdd = sc.parallelize(["b", "a", "c"])
 >>> sorted(rdd.map(lambda x: (x, 1)).collect())
 [('a', 1), ('b', 1), ('c', 1)]
@@ -312,6 +312,15 @@ class RDD(object):
 "use mapPartitionsWithIndex instead", DeprecationWarning, 
stacklevel=2)
 return self.mapPartitionsWithIndex(f, preservesPartitioning)
 
+def getNumPartitions(self):
+  """
+  Returns the number of partitions in RDD
+  >>> rdd = sc.parallelize([1, 2, 3, 4], 2)
+  >>> rdd.getNumPartitions()
+  2
+  """
+  return self._jrdd.splits().size()
+
 def filter(self, f):
 """
 Return a new RDD containing only the elements that satisfy a predicate.
@@ -413,9 +422,9 @@ class RDD(object):
 
 def intersection(self, other):
 """
-Return the intersection of this RDD and another one. The output will 
not 
+Return the intersection of this RDD and another one. The output will 
not
 contain any duplicate elements, even if the input RDDs did.
-
+
 Note that this method performs a shuffle internally.
 
 >>> rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5])
@@ -571,14 +580,14 @@ class RDD(object):
 """
 Applies a function to each partition of this RDD.
 
->>> def f(iterator): 
-...  for x in iterator: 
-...   print x 
+>>> def f(iterator):
+...  for x in iterator:
+...   print x
 ...  yield None
 >>> sc.parallelize([1, 2, 3, 4, 5]).foreachPartition(f)
 """
 self.mapPartitions(f).collect()  # Force evaluation
-
+
 def collect(self):
 """
 Return a list that contains all of the elements in this RDD.
@@ -673,7 +682,7 @@ class RDD(object):
 yield acc
 
 return self.mapPartitions(func).fold(zeroValue, combOp)
-
+
 
 def max(self):
 """
@@ -692,7 +701,7 @@ class RDD(object):
 1.0
 """
 return self.reduce(min)
-
+
 def sum(self):
 """
 Add up the elements in this RDD.
@@ -786,7 +795,7 @@ class RDD(object):
 m1[k] += v
 return m1
 return self.mapPartitions(countPartition).reduce(mergeMaps)
-
+
 def top(self, num):
 """
 Get the top N elements from a RDD.
@@ -814,7 +823,7 @@ class RDD(object):
 def takeOrdered(self, num, key=None):
 """
 Get the N elements from a RDD ordered in ascending order or as 
specified
-by the optional key function. 
+by the optional key function.
 
 >>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7]).takeOrdered(6)
 [1, 2, 3, 4, 5, 6]
@@ -834,7 +843,7 @@ class RDD(object):
 if key_ != None:
 x = [i[1] for i in x]
 return x
-
+
 def merge(a, b):
 return next(topNKeyedElems(a + b))
 result = self.mapPartitions(lambda i: topNKeyedElems(i, 
key)).reduce(merge)
@@ -1169,12 +1178,12 @@ class RDD(object):
 combiners[k] = mergeCombiners(combiners[k], v)
 return combiners.iteritems()
 return shuffled.mapPartitions(_mergeCombiners)
-
+
 def foldByKey(self, zeroValue, func, numPartitions=None):
 """