git commit: SPARK-1384 - Fix spark-shell on YARN access to secure HDFS - branch-0.9 only
Repository: spark
Updated Branches:
  refs/heads/branch-0.9 1d3aab961 -> cc95d978a

SPARK-1384 - Fix spark-shell on YARN access to secure HDFS - branch-0.9 only

Author: Thomas Graves

Closes #287 from tgravescs/SPARK-1384 and squashes the following commits:

ae9162a [Thomas Graves] SPARK-1384 - fix spark-shell on yarn access to secure HDFS

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cc95d978
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cc95d978
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cc95d978

Branch: refs/heads/branch-0.9
Commit: cc95d978a1b2dab6ed3dd8f73ccf8c299b16fdc0
Parents: 1d3aab9
Author: Thomas Graves
Authored: Mon Jun 9 23:07:25 2014 -0700
Committer: Xiangrui Meng
Committed: Mon Jun 9 23:07:25 2014 -0700

----------------------------------------------------------------------
 .../org/apache/spark/repl/SparkILoop.scala | 21
 1 file changed, 13 insertions(+), 8 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/cc95d978/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala
----------------------------------------------------------------------
diff --git a/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala
index f262faa..5274932 100644
--- a/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala
+++ b/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala
@@ -880,6 +880,7 @@ class SparkILoop(in0: Option[BufferedReader], protected val out: JPrintWriter,
   def process(settings: Settings): Boolean = savingContextLoader {
     this.settings = settings
+    if (getMaster() == "yarn-client") System.setProperty("SPARK_YARN_MODE", "true")
     createInterpreter()
 
     // sets in to some kind of reader depending on environmental cues
@@ -937,16 +938,9 @@ class SparkILoop(in0: Option[BufferedReader], protected val out: JPrintWriter,
   def createSparkContext(): SparkContext = {
     val execUri = System.getenv("SPARK_EXECUTOR_URI")
-    val master = this.master match {
-      case Some(m) => m
-      case None => {
-        val prop = System.getenv("MASTER")
-        if (prop != null) prop else "local"
-      }
-    }
     val jars = SparkILoop.getAddedJars.map(new java.io.File(_).getAbsolutePath)
     val conf = new SparkConf()
-      .setMaster(master)
+      .setMaster(getMaster())
       .setAppName("Spark shell")
       .setJars(jars)
       .set("spark.repl.class.uri", intp.classServer.uri)
@@ -961,6 +955,17 @@ class SparkILoop(in0: Option[BufferedReader], protected val out: JPrintWriter,
     sparkContext
   }
 
+  private def getMaster(): String = {
+    val master = this.master match {
+      case Some(m) => m
+      case None => {
+        val prop = System.getenv("MASTER")
+        if (prop != null) prop else "local"
+      }
+    }
+    master
+  }
+
   /** process command-line arguments and do as they request */
   def process(args: Array[String]): Boolean = {
     val command = new SparkCommandLine(args.toList, msg => echo(msg))
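The essence of the fix: the REPL must know it is in yarn-client mode before the interpreter (and hence the SparkContext) is created, so that secure-HDFS credentials are set up in time. A minimal, self-contained sketch of the master-resolution order the patch factors into getMaster() — only the resolution order and the getMaster name mirror the diff; the object and main method are illustrative, not part of Spark:

```scala
// Sketch of the resolution order: an explicit master setting wins, then the
// MASTER environment variable, then the default "local".
object MasterResolution {
  def getMaster(explicit: Option[String]): String =
    explicit.getOrElse(Option(System.getenv("MASTER")).getOrElse("local"))

  def main(args: Array[String]): Unit = {
    val master = getMaster(explicit = None)
    // The fix: flag YARN mode *before* anything else starts, so that
    // delegation tokens for secure HDFS are obtained in yarn-client mode.
    if (master == "yarn-client") System.setProperty("SPARK_YARN_MODE", "true")
    println(s"master=$master SPARK_YARN_MODE=${System.getProperty("SPARK_YARN_MODE")}")
  }
}
```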
git commit: [SPARK-1870] Made deployment with --jars work in yarn-standalone mode.
Repository: spark
Updated Branches:
  refs/heads/branch-0.9 51f677eb9 -> 1d3aab961

[SPARK-1870] Made deployment with --jars work in yarn-standalone mode.

Ported from the 1.0 branch to the 0.9 branch. Sends secondary jars to the distributed cache of all containers and adds the cached jars to the classpath before the executors start.

Author: DB Tsai

Closes #1013 from dbtsai/branch-0.9 and squashes the following commits:

c5696f4 [DB Tsai] fix line too long
b085f10 [DB Tsai] Make sure that empty string is filtered out when we get secondary jars
3cc1085 [DB Tsai] changed from var to val
ab94aa1 [DB Tsai] Code formatting.
0956af9 [DB Tsai] Ported SPARK-1870 from 1.0 branch to 0.9 branch

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1d3aab96
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1d3aab96
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1d3aab96

Branch: refs/heads/branch-0.9
Commit: 1d3aab96120c6770399e78a72b5692cf8f61a144
Parents: 51f677e
Author: DB Tsai
Authored: Mon Jun 9 22:56:24 2014 -0700
Committer: Xiangrui Meng
Committed: Mon Jun 9 22:56:24 2014 -0700

----------------------------------------------------------------------
 .../org/apache/spark/deploy/yarn/Client.scala | 24
 .../org/apache/spark/deploy/yarn/Client.scala | 29
 2 files changed, 39 insertions(+), 14 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/1d3aab96/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
----------------------------------------------------------------------
diff --git a/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index 6956cd6..9e5e2d5 100644
--- a/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ b/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -21,8 +21,7 @@ import java.net.{InetAddress, UnknownHostException, URI}
 import java.nio.ByteBuffer
 
 import scala.collection.JavaConversions._
-import scala.collection.mutable.HashMap
-import scala.collection.mutable.Map
+import scala.collection.mutable.{HashMap, ListBuffer, Map}
 
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileContext, FileStatus, FileSystem, Path, FileUtil}
@@ -264,6 +263,7 @@ class Client(args: ClientArguments, conf: Configuration, sparkConf: SparkConf)
     }
 
     // handle any add jars
+    val cachedSecondaryJarLinks = ListBuffer.empty[String]
     if ((args.addJars != null) && (!args.addJars.isEmpty())){
       args.addJars.split(',').foreach { case file: String =>
         val localURI = new URI(file.trim())
@@ -271,9 +271,11 @@ class Client(args: ClientArguments, conf: Configuration, sparkConf: SparkConf)
         val linkname = Option(localURI.getFragment()).getOrElse(localPath.getName())
         val destPath = copyRemoteFile(dst, localPath, replication)
         distCacheMgr.addResource(fs, conf, destPath, localResources, LocalResourceType.FILE,
-          linkname, statCache, true)
+          linkname, statCache)
+        cachedSecondaryJarLinks += linkname
       }
     }
+    sparkConf.set(Client.CONF_SPARK_YARN_SECONDARY_JARS, cachedSecondaryJarLinks.mkString(","))
 
     // handle any distributed cache files
     if ((args.files != null) && (!args.files.isEmpty())){
@@ -462,9 +464,10 @@ class Client(args: ClientArguments, conf: Configuration, sparkConf: SparkConf)
 }
 
 object Client {
-  val SPARK_JAR: String = "spark.jar"
-  val APP_JAR: String = "app.jar"
+  val SPARK_JAR: String = "__spark__.jar"
+  val APP_JAR: String = "__app__.jar"
   val LOG4J_PROP: String = "log4j.properties"
+  val CONF_SPARK_YARN_SECONDARY_JARS = "spark.yarn.secondary.jars"
 
   def main(argStrings: Array[String]) {
     // Set an env variable indicating we are running in YARN mode.
@@ -491,11 +494,19 @@ object Client {
       Apps.addToEnvironment(env, Environment.CLASSPATH.name, Environment.PWD.$() +
         Path.SEPARATOR + LOG4J_PROP)
     }
+
+    val cachedSecondaryJarLinks =
+      sparkConf.getOption(CONF_SPARK_YARN_SECONDARY_JARS).getOrElse("").split(",")
+        .filter(_.nonEmpty)
+
     // Normally the users app.jar is last in case conflicts with spark jars
     val userClasspathFirst = sparkConf.get("spark.yarn.user.classpath.first", "false").toBoolean
    if (userClasspathFirst) {
       Apps.addToEnvironment(env, Environment.CLASSPATH.name, Environment.PWD.$() +
         Path.SEPARATOR + APP_JAR)
+      cachedSecondaryJarLinks.foreach(jarLink =>
+        Apps.addToEnvironment(env, Environment.CLASSPATH.name, Environment.PWD.$() +
+          Path.SEPARATOR + jarLink))
     }
     Apps.addToEnvironment(env, Environment.CLASSPATH.name, Environment.PWD.$() +
       Path.SEPARATOR + SPARK_JAR)
@@ -504,6 +515,9 @@ object Client {
     if (!us
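The round trip this patch introduces is easy to see in isolation: the client joins the distributed-cache link names of the secondary jars into one comma-separated conf value, and the container side splits it back apart before building the executor classpath. A sketch with illustrative helper names — only the conf key comes from the diff:

```scala
import scala.collection.mutable.ListBuffer

// Illustrative helpers (not Spark API): serialize secondary-jar link names
// into a single conf value and recover them on the container side.
object SecondaryJarsRoundTrip {
  val confKey = "spark.yarn.secondary.jars"  // CONF_SPARK_YARN_SECONDARY_JARS in the diff

  def serialize(linkNames: Seq[String]): String = linkNames.mkString(",")

  def deserialize(confValue: Option[String]): Seq[String] =
    confValue.getOrElse("").split(",").filter(_.nonEmpty).toSeq

  def main(args: Array[String]): Unit = {
    val links = ListBuffer("mylib.jar", "util.jar")   // distributed-cache link names
    val value = serialize(links.toList)
    println(deserialize(Some(value)))  // List(mylib.jar, util.jar)
    println(deserialize(None))         // List() -- no jars, no bogus entries
  }
}
```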
git commit: Make sure that empty string is filtered out when we get the secondary jars from conf
Repository: spark
Updated Branches:
  refs/heads/master a9ec033c8 -> 6f2db8c2f

Make sure that empty string is filtered out when we get the secondary jars from conf

Author: DB Tsai

Closes #1027 from dbtsai/dbtsai-classloader and squashes the following commits:

9ac6be3 [DB Tsai] Fixed line too long
c9c7ad7 [DB Tsai] Make sure that empty string is filtered out when we get the secondary jars from conf.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f2db8c2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f2db8c2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f2db8c2

Branch: refs/heads/master
Commit: 6f2db8c2f51911f88a601ec5bf1509ea0e8173ed
Parents: a9ec033
Author: DB Tsai
Authored: Mon Jun 9 22:18:50 2014 -0700
Committer: Xiangrui Meng
Committed: Mon Jun 9 22:18:50 2014 -0700

----------------------------------------------------------------------
 .../main/scala/org/apache/spark/deploy/yarn/ClientBase.scala | 6
 1 file changed, 4 insertions(+), 2 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/6f2db8c2/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
----------------------------------------------------------------------
diff --git a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
index 4b5e0ef..801e8b3 100644
--- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
+++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
@@ -221,7 +221,7 @@ trait ClientBase extends Logging {
       }
     }
 
-    var cachedSecondaryJarLinks = ListBuffer.empty[String]
+    val cachedSecondaryJarLinks = ListBuffer.empty[String]
     val fileLists = List( (args.addJars, LocalResourceType.FILE, true),
       (args.files, LocalResourceType.FILE, false),
       (args.archives, LocalResourceType.ARCHIVE, false) )
@@ -502,12 +502,14 @@ object ClientBase extends Logging {
     def addClasspathEntry(path: String) = YarnSparkHadoopUtil.addToEnvironment(env,
       Environment.CLASSPATH.name, path, File.pathSeparator)
     /** Add entry to the classpath. Interpreted as a path relative to the working directory. */
-    def addPwdClasspathEntry(entry: String) = addClasspathEntry(Environment.PWD.$() + Path.SEPARATOR + entry)
+    def addPwdClasspathEntry(entry: String) =
+      addClasspathEntry(Environment.PWD.$() + Path.SEPARATOR + entry)
 
     extraClassPath.foreach(addClasspathEntry)
 
     val cachedSecondaryJarLinks =
       sparkConf.getOption(CONF_SPARK_YARN_SECONDARY_JARS).getOrElse("").split(",")
+        .filter(_.nonEmpty)
     // Normally the users app.jar is last in case conflicts with spark jars
     if (sparkConf.get("spark.yarn.user.classpath.first", "false").toBoolean) {
       addPwdClasspathEntry(APP_JAR)
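The bug being fixed is a String.split quirk inherited from Java: splitting an empty string on "," yields a one-element array containing the empty string, so without the filter an empty classpath entry would be added whenever no secondary jars were set. A short demonstration:

```scala
// Demonstrates why .filter(_.nonEmpty) is needed after splitting the conf value.
object SplitGotcha extends App {
  val noJars = ""                                       // conf value when no secondary jars exist
  println(noJars.split(",").length)                     // 1 -- Array("") sneaks in
  println(noJars.split(",").filter(_.nonEmpty).length)  // 0 -- the fix
}
```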
git commit: [SPARK-1704][SQL] Fully support EXPLAIN commands as SchemaRDD.
Repository: spark
Updated Branches:
  refs/heads/branch-1.0 65fa7bcac -> 5a79ba13e

[SPARK-1704][SQL] Fully support EXPLAIN commands as SchemaRDD.

This PR attempts to resolve [SPARK-1704](https://issues.apache.org/jira/browse/SPARK-1704) by introducing a physical plan for EXPLAIN commands, which just prints out the debug string (containing the various Spark SQL plans) of the corresponding QueryExecution for the actual query.

Author: Zongheng Yang

Closes #1003 from concretevitamin/explain-cmd and squashes the following commits:

5b7911f [Zongheng Yang] Add a regression test.
1bfa379 [Zongheng Yang] Modify output().
719ada9 [Zongheng Yang] Override otherCopyArgs for ExplainCommandPhysical.
4318fd7 [Zongheng Yang] Make all output one Row.
439c6ab [Zongheng Yang] Minor cleanups.
408f574 [Zongheng Yang] SPARK-1704: Add CommandStrategy and ExplainCommandPhysical.

(cherry picked from commit a9ec033c8cf489898cc47e2043bd9e86b7df1ff8)
Signed-off-by: Michael Armbrust

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5a79ba13
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5a79ba13
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5a79ba13

Branch: refs/heads/branch-1.0
Commit: 5a79ba13ea75838fe53d99ca5aa289d81a58cdb3
Parents: 65fa7bc
Author: Zongheng Yang
Authored: Mon Jun 9 16:47:44 2014 -0700
Committer: Michael Armbrust
Committed: Mon Jun 9 16:47:56 2014 -0700

----------------------------------------------------------------------
 .../catalyst/plans/logical/LogicalPlan.scala    |  8 +++--
 .../scala/org/apache/spark/sql/SQLContext.scala |  6
 .../spark/sql/execution/SparkStrategies.scala   | 11 +++
 .../apache/spark/sql/execution/commands.scala   | 32
 .../org/apache/spark/sql/hive/HiveContext.scala |  3 +-
 .../sql/hive/execution/HiveQuerySuite.scala     | 12
 6 files changed, 68 insertions(+), 4 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/5a79ba13/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
index 2b8fbdc..4f641cd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.QueryPlan
-import org.apache.spark.sql.catalyst.types.StructType
+import org.apache.spark.sql.catalyst.types.{StringType, StructType}
 import org.apache.spark.sql.catalyst.trees
 
 abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
@@ -102,7 +102,7 @@ abstract class LeafNode extends LogicalPlan with trees.LeafNode[LogicalPlan] {
  */
 abstract class Command extends LeafNode {
   self: Product =>
-  def output = Seq.empty
+  def output: Seq[Attribute] = Seq.empty
 }
 
 /**
@@ -115,7 +115,9 @@ case class NativeCommand(cmd: String) extends Command
  * Returned by a parser when the users only wants to see what query plan would be executed, without
  * actually performing the execution.
  */
-case class ExplainCommand(plan: LogicalPlan) extends Command
+case class ExplainCommand(plan: LogicalPlan) extends Command {
+  override def output = Seq(AttributeReference("plan", StringType, nullable = false)())
+}
 
 /**
  * A logical plan node with single child.

http://git-wip-us.apache.org/repos/asf/spark/blob/5a79ba13/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 0109dd6..9fccbd6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -191,6 +191,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
     val sparkContext = self.sparkContext
 
     val strategies: Seq[Strategy] =
+      CommandStrategy(self) ::
       TakeOrdered ::
       PartialAggregation ::
       LeftSemiJoin ::
@@ -252,6 +253,11 @@ class SQLContext(@transient val sparkContext: SparkContext)
       Batch("Prepare Expressions", Once, new BindReferences[SparkPlan]) :: Nil
   }
 
+  // TODO: or should we make QueryExecution protected[sql]?
+  protected[sql] def mkQueryExecution(plan: LogicalPlan) = new QueryExecution {
+    val logical = plan
+  }
+
git commit: [SPARK-1704][SQL] Fully support EXPLAIN commands as SchemaRDD.
Repository: spark
Updated Branches:
  refs/heads/master c6e041d17 -> a9ec033c8

[SPARK-1704][SQL] Fully support EXPLAIN commands as SchemaRDD.

This PR attempts to resolve [SPARK-1704](https://issues.apache.org/jira/browse/SPARK-1704) by introducing a physical plan for EXPLAIN commands, which just prints out the debug string (containing the various Spark SQL plans) of the corresponding QueryExecution for the actual query.

Author: Zongheng Yang

Closes #1003 from concretevitamin/explain-cmd and squashes the following commits:

5b7911f [Zongheng Yang] Add a regression test.
1bfa379 [Zongheng Yang] Modify output().
719ada9 [Zongheng Yang] Override otherCopyArgs for ExplainCommandPhysical.
4318fd7 [Zongheng Yang] Make all output one Row.
439c6ab [Zongheng Yang] Minor cleanups.
408f574 [Zongheng Yang] SPARK-1704: Add CommandStrategy and ExplainCommandPhysical.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a9ec033c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a9ec033c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a9ec033c

Branch: refs/heads/master
Commit: a9ec033c8cf489898cc47e2043bd9e86b7df1ff8
Parents: c6e041d
Author: Zongheng Yang
Authored: Mon Jun 9 16:47:44 2014 -0700
Committer: Michael Armbrust
Committed: Mon Jun 9 16:47:44 2014 -0700

----------------------------------------------------------------------
 .../catalyst/plans/logical/LogicalPlan.scala    |  8 +++--
 .../scala/org/apache/spark/sql/SQLContext.scala |  6
 .../spark/sql/execution/SparkStrategies.scala   | 11 +++
 .../apache/spark/sql/execution/commands.scala   | 32
 .../org/apache/spark/sql/hive/HiveContext.scala |  3 +-
 .../sql/hive/execution/HiveQuerySuite.scala     | 12
 6 files changed, 68 insertions(+), 4 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/a9ec033c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
index 2b8fbdc..4f641cd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.QueryPlan
-import org.apache.spark.sql.catalyst.types.StructType
+import org.apache.spark.sql.catalyst.types.{StringType, StructType}
 import org.apache.spark.sql.catalyst.trees
 
 abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
@@ -102,7 +102,7 @@ abstract class LeafNode extends LogicalPlan with trees.LeafNode[LogicalPlan] {
  */
 abstract class Command extends LeafNode {
   self: Product =>
-  def output = Seq.empty
+  def output: Seq[Attribute] = Seq.empty
 }
 
 /**
@@ -115,7 +115,9 @@ case class NativeCommand(cmd: String) extends Command
 * Returned by a parser when the users only wants to see what query plan would be executed, without
 * actually performing the execution.
 */
-case class ExplainCommand(plan: LogicalPlan) extends Command
+case class ExplainCommand(plan: LogicalPlan) extends Command {
+  override def output = Seq(AttributeReference("plan", StringType, nullable = false)())
+}
 
 /**
 * A logical plan node with single child.

http://git-wip-us.apache.org/repos/asf/spark/blob/a9ec033c/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 5626f0d..fde4c48 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -191,6 +191,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
     val sparkContext = self.sparkContext
 
     val strategies: Seq[Strategy] =
+      CommandStrategy(self) ::
       TakeOrdered ::
       PartialAggregation ::
       LeftSemiJoin ::
@@ -256,6 +257,11 @@ class SQLContext(@transient val sparkContext: SparkContext)
       Batch("Prepare Expressions", Once, new BindReferences[SparkPlan]) :: Nil
   }
 
+  // TODO: or should we make QueryExecution protected[sql]?
+  protected[sql] def mkQueryExecution(plan: LogicalPlan) = new QueryExecution {
+    val logical = plan
+  }
+
   /**
    * The primary workflow for executing relational queries using Spark. Designed to allow easy
    * ac
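The pattern both copies of this commit implement — a command that "runs" by returning its own plan description as rows — can be sketched in a few lines of plain Scala. The class names below are illustrative stand-ins, not the actual Catalyst classes:

```scala
// Stand-in sketch of ExplainCommand/ExplainCommandPhysical: the physical node
// never executes the child query; it emits the child's textual plan, one
// string per output row, under a single "plan" column.
sealed trait Plan { def describe: String }
case class Scan(table: String) extends Plan { def describe = s"Scan($table)" }

case class ExplainPhysical(child: Plan) {
  def execute(): Seq[String] = child.describe.split("\n").toSeq  // rows of the result
}

object ExplainDemo extends App {
  ExplainPhysical(Scan("src")).execute().foreach(println)  // prints: Scan(src)
}
```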
git commit: [SQL] Simple framework for debugging query execution
Repository: spark
Updated Branches:
  refs/heads/branch-1.0 73cd1f822 -> 65fa7bcac

[SQL] Simple framework for debugging query execution

Only records number of tuples and unique dataTypes output right now...

Example:
```scala
scala> import org.apache.spark.sql.execution.debug._
scala> hql("SELECT value FROM src WHERE key > 10").debug(sparkContext)

Results returned: 489
== Project [value#1:0] ==
Tuples output: 489
 value StringType: {java.lang.String}
== Filter (key#0:1 > 10) ==
Tuples output: 489
 value StringType: {java.lang.String}
 key IntegerType: {java.lang.Integer}
== HiveTableScan [value#1,key#0], (MetastoreRelation default, src, None), None ==
Tuples output: 500
 value StringType: {java.lang.String}
 key IntegerType: {java.lang.Integer}
```

Author: Michael Armbrust

Closes #1005 from marmbrus/debug and squashes the following commits:

dcc3ca6 [Michael Armbrust] Add comments.
c9dded2 [Michael Armbrust] Simple framework for debugging query execution

(cherry picked from commit c6e041d171e3d9882ab15e2bd7a7217dc19647f6)
Signed-off-by: Reynold Xin

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/65fa7bca
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/65fa7bca
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/65fa7bca

Branch: refs/heads/branch-1.0
Commit: 65fa7bcac81fc2a7a6c578775f72929cb201c20a
Parents: 73cd1f8
Author: Michael Armbrust
Authored: Mon Jun 9 14:24:19 2014 -0700
Committer: Reynold Xin
Committed: Mon Jun 9 14:24:52 2014 -0700

----------------------------------------------------------------------
 .../scala/org/apache/spark/sql/SQLContext.scala |   5 -
 .../org/apache/spark/sql/execution/debug.scala  |  45 ---
 .../spark/sql/execution/debug/package.scala     | 119 +++
 3 files changed, 119 insertions(+), 50 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/65fa7bca/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 5ebdffa..0109dd6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -281,11 +281,6 @@ class SQLContext(@transient val sparkContext: SparkContext)
         |== Physical Plan ==
         |${stringOrError(executedPlan)}
       """.stripMargin.trim
-
-    /**
-     * Runs the query after interposing operators that print the result of each intermediate step.
-     */
-    def debugExec() = DebugQuery(executedPlan).execute().collect()
   }
 
 /**

http://git-wip-us.apache.org/repos/asf/spark/blob/65fa7bca/sql/core/src/main/scala/org/apache/spark/sql/execution/debug.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug.scala
deleted file mode 100644
index a0d2910..0000000
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug.scala
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.execution
-
-private[sql] object DebugQuery {
-  def apply(plan: SparkPlan): SparkPlan = {
-    val visited = new collection.mutable.HashSet[Long]()
-    plan transform {
-      case s: SparkPlan if !visited.contains(s.id) =>
-        visited += s.id
-        DebugNode(s)
-    }
-  }
-}
-
-private[sql] case class DebugNode(child: SparkPlan) extends UnaryNode {
-  def references = Set.empty
-  def output = child.output
-  def execute() = {
-    val childRdd = child.execute()
-    println(
-      s"""
-        |=========
-        |${child.simpleString}
-        |=========
-      """.stripMargin)
-    childRdd.foreach(println(_))
-    childRdd
-  }
-}

http://git-wip-us.apache.org/repos/asf/spark/blob/65fa7bca/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/p
git commit: [SQL] Simple framework for debugging query execution
Repository: spark
Updated Branches:
  refs/heads/master e27344768 -> c6e041d17

[SQL] Simple framework for debugging query execution

Only records number of tuples and unique dataTypes output right now...

Example:
```scala
scala> import org.apache.spark.sql.execution.debug._
scala> hql("SELECT value FROM src WHERE key > 10").debug(sparkContext)

Results returned: 489
== Project [value#1:0] ==
Tuples output: 489
 value StringType: {java.lang.String}
== Filter (key#0:1 > 10) ==
Tuples output: 489
 value StringType: {java.lang.String}
 key IntegerType: {java.lang.Integer}
== HiveTableScan [value#1,key#0], (MetastoreRelation default, src, None), None ==
Tuples output: 500
 value StringType: {java.lang.String}
 key IntegerType: {java.lang.Integer}
```

Author: Michael Armbrust

Closes #1005 from marmbrus/debug and squashes the following commits:

dcc3ca6 [Michael Armbrust] Add comments.
c9dded2 [Michael Armbrust] Simple framework for debugging query execution

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c6e041d1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c6e041d1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c6e041d1

Branch: refs/heads/master
Commit: c6e041d171e3d9882ab15e2bd7a7217dc19647f6
Parents: e273447
Author: Michael Armbrust
Authored: Mon Jun 9 14:24:19 2014 -0700
Committer: Reynold Xin
Committed: Mon Jun 9 14:24:19 2014 -0700

----------------------------------------------------------------------
 .../scala/org/apache/spark/sql/SQLContext.scala |   5 -
 .../org/apache/spark/sql/execution/debug.scala  |  45 ---
 .../spark/sql/execution/debug/package.scala     | 119 +++
 3 files changed, 119 insertions(+), 50 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/c6e041d1/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index e371c82..5626f0d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -285,11 +285,6 @@ class SQLContext(@transient val sparkContext: SparkContext)
         |== Physical Plan ==
         |${stringOrError(executedPlan)}
       """.stripMargin.trim
-
-    /**
-     * Runs the query after interposing operators that print the result of each intermediate step.
-     */
-    def debugExec() = DebugQuery(executedPlan).execute().collect()
   }
 
 /**

http://git-wip-us.apache.org/repos/asf/spark/blob/c6e041d1/sql/core/src/main/scala/org/apache/spark/sql/execution/debug.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug.scala
deleted file mode 100644
index a0d2910..0000000
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug.scala
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.execution
-
-private[sql] object DebugQuery {
-  def apply(plan: SparkPlan): SparkPlan = {
-    val visited = new collection.mutable.HashSet[Long]()
-    plan transform {
-      case s: SparkPlan if !visited.contains(s.id) =>
-        visited += s.id
-        DebugNode(s)
-    }
-  }
-}
-
-private[sql] case class DebugNode(child: SparkPlan) extends UnaryNode {
-  def references = Set.empty
-  def output = child.output
-  def execute() = {
-    val childRdd = child.execute()
-    println(
-      s"""
-        |=========
-        |${child.simpleString}
-        |=========
-      """.stripMargin)
-    childRdd.foreach(println(_))
-    childRdd
-  }
-}

http://git-wip-us.apache.org/repos/asf/spark/blob/c6e041d1/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
----------------------------------------------------------------------
diff --git a/sql/core
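The mechanism behind debug() — wrap each operator in a pass-through node that counts tuples as they stream by, without altering the data — is independent of Spark and can be sketched with a plain iterator. The names below are illustrative:

```scala
import java.util.concurrent.atomic.AtomicLong

// Illustrative pass-through wrapper: counts the tuples an operator produces
// while forwarding them unchanged -- the interposition idea DebugNode used.
final class CountingIterator[A](underlying: Iterator[A], counter: AtomicLong)
    extends Iterator[A] {
  def hasNext: Boolean = underlying.hasNext
  def next(): A = { counter.incrementAndGet(); underlying.next() }
}

object DebugDemo extends App {
  val tuplesOutput = new AtomicLong(0)
  val wrapped = new CountingIterator(Iterator("a", "b", "c"), tuplesOutput)
  wrapped.foreach(_ => ())                        // drain, as a collect() would
  println(s"Tuples output: ${tuplesOutput.get}")  // Tuples output: 3
}
```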
git commit: [SPARK-1522]: YARN ClientBase throws an NPE if there is no YARN application classpath
Repository: spark
Updated Branches:
  refs/heads/master 6cf335d79 -> e27344768

[SPARK-1522]: YARN ClientBase throws an NPE if there is no YARN application classpath

The current implementation of ClientBase.getDefaultYarnApplicationClasspath inspects the MRJobConfig class for the field DEFAULT_YARN_APPLICATION_CLASSPATH when it should really be looking into YarnConfiguration. If the application configuration has no yarn.application.classpath defined, an NPE is thrown.

Additional changes include:
* Test suite for ClientBase added

[ticket: SPARK-1522]: https://issues.apache.org/jira/browse/SPARK-1522

Author: bernardo.gomezpala...@gmail.com

Testing: SPARK_HADOOP_VERSION=2.3.0 SPARK_YARN=true ./sbt/sbt test

Author: Bernardo Gomez Palacio

Closes #433 from berngp/feature/SPARK-1522 and squashes the following commits:

2c2e118 [Bernardo Gomez Palacio] [SPARK-1522]: YARN ClientBase throws an NPE if there is no YARN application specific CP

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e2734476
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e2734476
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e2734476

Branch: refs/heads/master
Commit: e273447684779a18bd61d733bfe7958b78657ffd
Parents: 6cf335d
Author: Bernardo Gomez Palacio
Authored: Mon Jun 9 16:14:54 2014 -0500
Committer: Thomas Graves
Committed: Mon Jun 9 16:14:54 2014 -0500

----------------------------------------------------------------------
 .../apache/spark/deploy/yarn/ClientBase.scala |  89 +--
 .../spark/deploy/yarn/ClientBaseSuite.scala   | 112 +++
 2 files changed, 167 insertions(+), 34 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/e2734476/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
----------------------------------------------------------------------
diff --git a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
index aeb3f00..4b5e0ef 100644
--- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
+++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
@@ -23,6 +23,7 @@ import java.nio.ByteBuffer
 
 import scala.collection.JavaConversions._
 import scala.collection.mutable.{HashMap, ListBuffer, Map}
+import scala.util.{Try, Success, Failure}
 
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
@@ -378,7 +379,7 @@ trait ClientBase extends Logging {
   }
 }
 
-object ClientBase {
+object ClientBase extends Logging {
   val SPARK_JAR: String = "__spark__.jar"
   val APP_JAR: String = "__app__.jar"
   val LOG4J_PROP: String = "log4j.properties"
@@ -388,37 +389,47 @@ object ClientBase {
 
   def getSparkJar = sys.env.get("SPARK_JAR").getOrElse(SparkContext.jarOfClass(this.getClass).head)
 
-  // Based on code from org.apache.hadoop.mapreduce.v2.util.MRApps
-  def populateHadoopClasspath(conf: Configuration, env: HashMap[String, String]) {
-    val classpathEntries = Option(conf.getStrings(
-      YarnConfiguration.YARN_APPLICATION_CLASSPATH)).getOrElse(
-        getDefaultYarnApplicationClasspath())
-    if (classpathEntries != null) {
-      for (c <- classpathEntries) {
-        YarnSparkHadoopUtil.addToEnvironment(env, Environment.CLASSPATH.name, c.trim,
-          File.pathSeparator)
-      }
+  def populateHadoopClasspath(conf: Configuration, env: HashMap[String, String]) = {
+    val classPathElementsToAdd = getYarnAppClasspath(conf) ++ getMRAppClasspath(conf)
+    for (c <- classPathElementsToAdd.flatten) {
+      YarnSparkHadoopUtil.addToEnvironment(
+        env,
+        Environment.CLASSPATH.name,
+        c.trim,
+        File.pathSeparator)
     }
+    classPathElementsToAdd
+  }
 
-    val mrClasspathEntries = Option(conf.getStrings(
-      "mapreduce.application.classpath")).getOrElse(
-        getDefaultMRApplicationClasspath())
-    if (mrClasspathEntries != null) {
-      for (c <- mrClasspathEntries) {
-        YarnSparkHadoopUtil.addToEnvironment(env, Environment.CLASSPATH.name, c.trim,
-          File.pathSeparator)
-      }
-    }
+  private def getYarnAppClasspath(conf: Configuration): Option[Seq[String]] =
+    Option(conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH)) match {
+      case Some(s) => Some(s.toSeq)
+      case None => getDefaultYarnApplicationClasspath
   }
 
-  def getDefaultYarnApplicationClasspath(): Array[String] = {
-    try {
-      val field = classOf[MRJobConfig].getField("DEFAULT_YARN_APPLICATION_CLASSPATH")
-      field.get(null).asInstanceOf[Array[String]]
-    } catch {
-      case err: NoSuchFieldError => null
-      case err: NoSuchFieldException => null
+  private def getMRAppClasspath(conf: Configuration): Option[Seq[String]] =
+    Option(conf
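The defensive pattern the patch moves to — a reflective lookup wrapped in Try, yielding Option instead of null — can be shown in isolation. The class and field below are stand-ins so the sketch runs anywhere; this is not the actual YarnConfiguration lookup:

```scala
import scala.util.{Failure, Success, Try}

// Stand-in sketch: read a static Array[String] field reflectively, converting
// a missing field into None instead of the old null (which caused the NPE).
object SafeDefaultClasspath extends App {
  def defaultClasspath(clazz: Class[_], fieldName: String): Option[Seq[String]] =
    Try(clazz.getField(fieldName).get(null).asInstanceOf[Array[String]].toSeq) match {
      case Success(entries) => Some(entries)
      case Failure(err) =>
        println(s"Unable to obtain $fieldName: ${err.getClass.getSimpleName}")
        None
    }

  // No such field: the old code returned null here; now callers get None and
  // can fall back cleanly instead of iterating over null and throwing an NPE.
  println(defaultClasspath(classOf[String], "NO_SUCH_FIELD"))  // None
}
```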
git commit: Added a TaskSetManager unit test.
Repository: spark
Updated Branches:
  refs/heads/master 0cf600280 -> 6cf335d79

Added a TaskSetManager unit test.

This test ensures that when there are no alive executors that satisfy a particular locality level, the TaskSetManager doesn't ever use that as the maximum allowed locality level (this optimization ensures that a job doesn't wait extra time in an attempt to satisfy a scheduling locality level that is impossible).

@mateiz and @lirui-intel this unit test illustrates an issue with #892 (it fails with that patch).

Author: Kay Ousterhout

Closes #1024 from kayousterhout/scheduler_unit_test and squashes the following commits:

de6a08f [Kay Ousterhout] Added a TaskSetManager unit test.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6cf335d7
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6cf335d7
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6cf335d7

Branch: refs/heads/master
Commit: 6cf335d79a2f69ecd9a139dd0a03acff60585be4
Parents: 0cf6002
Author: Kay Ousterhout
Authored: Mon Jun 9 13:13:53 2014 -0700
Committer: Kay Ousterhout
Committed: Mon Jun 9 13:13:53 2014 -0700

----------------------------------------------------------------------
 .../spark/scheduler/TaskSetManagerSuite.scala | 16
 1 file changed, 16 insertions(+)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/6cf335d7/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
----------------------------------------------------------------------
diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
index c92b6dc..6f1fd25 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
@@ -141,6 +141,22 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging {
     assert(sched.finishedManagers.contains(manager))
   }
 
+  test("skip unsatisfiable locality levels") {
+    sc = new SparkContext("local", "test")
+    val sched = new FakeTaskScheduler(sc, ("execA", "host1"), ("execC", "host2"))
+    val taskSet = FakeTask.createTaskSet(1, Seq(TaskLocation("host1", "execB")))
+    val clock = new FakeClock
+    val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock)
+
+    // An executor that is not NODE_LOCAL should be rejected.
+    assert(manager.resourceOffer("execC", "host2", ANY) === None)
+
+    // Because there are no alive PROCESS_LOCAL executors, the base locality level should be
+    // NODE_LOCAL. So, we should schedule the task on this offered NODE_LOCAL executor before
+    // any of the locality wait timers expire.
+    assert(manager.resourceOffer("execA", "host1", ANY).get.index === 0)
+  }
+
   test("basic delay scheduling") {
     sc = new SparkContext("local", "test")
     val sched = new FakeTaskScheduler(sc, ("exec1", "host1"), ("exec2", "host2"))
git commit: [SPARK-1495][SQL] Add support for left semi join
Repository: spark
Updated Branches:
  refs/heads/branch-1.0 a5848d325 -> 73cd1f822

[SPARK-1495][SQL] Add support for left semi join

Just submitting another solution for #395.

Author: Daoyuan
Author: Michael Armbrust
Author: Daoyuan Wang

Closes #837 from adrian-wang/left-semi-join-support and squashes the following commits:

d39cd12 [Daoyuan Wang] Merge pull request #1 from marmbrus/pr/837
6713c09 [Michael Armbrust] Better debugging for failed query tests.
035b73e [Michael Armbrust] Add test for left semi that can't be done with a hash join.
5ec6fa4 [Michael Armbrust] Add left semi to SQL Parser.
4c726e5 [Daoyuan] improvement according to Michael
8d4a121 [Daoyuan] add golden files for leftsemijoin
83a3c8a [Daoyuan] scala style fix
14cff80 [Daoyuan] add support for left semi join

(cherry picked from commit 0cf600280167a94faec75736223256e8f2e48085)
Signed-off-by: Michael Armbrust

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/73cd1f82
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/73cd1f82
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/73cd1f82

Branch: refs/heads/branch-1.0
Commit: 73cd1f8223a4799fd104fe48ba011315236cf4a8
Parents: a5848d3
Author: Daoyuan
Authored: Mon Jun 9 11:31:36 2014 -0700
Committer: Michael Armbrust
Committed: Mon Jun 9 11:32:04 2014 -0700

----------------------------------------------------------------------
 .../apache/spark/sql/catalyst/SqlParser.scala   |   2 +
 .../spark/sql/catalyst/planning/patterns.scala  |   5 +
 .../spark/sql/catalyst/plans/joinTypes.scala    |   1 +
 .../catalyst/plans/logical/basicOperators.scala |   9 +-
 .../scala/org/apache/spark/sql/SQLContext.scala |   1 +
 .../spark/sql/execution/SparkStrategies.scala   |  16 +++
 .../org/apache/spark/sql/execution/joins.scala  | 131 +++
 .../scala/org/apache/spark/sql/QueryTest.scala  |   2 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala    |   7 +
 .../org/apache/spark/sql/hive/HiveContext.scala |   1 +
 .../org/apache/spark/sql/hive/HiveQl.scala      |   1 +
 ...tsemijoin-0-80b6466213face7fbcb0de044611e1f5 |   0
 ...tsemijoin-1-d1f6a3dea28a5f0fee08026bf33d9129 |   0
 ...semijoin-10-89737a8857b5b61cc909e0c797f86aea |   4 +
 ...semijoin-11-80b6466213face7fbcb0de044611e1f5 |   0
 ...semijoin-12-d1f6a3dea28a5f0fee08026bf33d9129 |   0
 ...tsemijoin-2-43d53504df013e6b35f81811138a167a |   1 +
 ...tsemijoin-3-b07d292423312aafa5e5762a579decd2 |   0
 ...tsemijoin-4-3ac2226efe7cb5d999c1c5e4ac2114be |   0
 ...tsemijoin-5-9c307c0559d735960ce77efa95b2b17b |   0
 ...tsemijoin-6-82921fc96eef547ec0f71027ee88298c |   0
 ...tsemijoin-7-b30aa3b4a45db6b64bb46b4d9bd32ff0 |   0
 ...eftsemijoin-8-73cad58a10a1483ccb15e94a857013 |   4 +
 ...tsemijoin-9-c5efa6b8771a51610d655be461670e1e |   2 +
 ...mijoin_mr-0-7087fb6281a34d00f1812d2ff4ba8b75 |   0
 ...mijoin_mr-1-aa3f07f028027ffd13ab5535dc821593 |   0
 ...ijoin_mr-10-9914f44ecb6ae7587b62e5349ff60d04 |   1 +
 ...ijoin_mr-11-2027ecb1495d5550c5d56abf6b95b0a7 |   2 +
 ...mijoin_mr-2-3f65953ae60375156367c54533978782 |   0
 ...mijoin_mr-3-645cf8b871c9b27418d6fa1d1bda9a52 |   0
 ...mijoin_mr-4-333895fe6abca27c8edb5c91bfe10d2f |   2 +
 ...mijoin_mr-5-896d0948c1df849df9764a6d8ad8fff9 |  20 +++
 ...mijoin_mr-6-b1e2ade89ae898650f0be4f796d8947b |   1 +
 ...mijoin_mr-7-8e9c2969b999557363e40f9ebb3f6d7c |   1 +
 ...mijoin_mr-8-c61b972d4409babe41d8963e841af45b |   1 +
 ...mijoin_mr-9-2027ecb1495d5550c5d56abf6b95b0a7 |   2 +
 .../hive/execution/HiveCompatibilitySuite.scala |   2 +
 37 files changed, 216 insertions(+), 3 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/73cd1f82/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index a404e74..cc65012 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -131,6 +131,7 @@ class SqlParser extends StandardTokenParsers with PackratParsers {
   protected val OUTER = Keyword("OUTER")
   protected val RIGHT = Keyword("RIGHT")
   protected val SELECT = Keyword("SELECT")
+  protected val SEMI = Keyword("SEMI")
   protected val STRING = Keyword("STRING")
   protected val SUM = Keyword("SUM")
   protected val TRUE = Keyword("TRUE")
@@ -241,6 +242,7 @@ class SqlParser extends StandardTokenParsers with PackratParsers {
   protected lazy val joinType: Parser[JoinType] =
     INNER ^^^ Inner |
+    LEFT ~ SEMI ^^^ LeftSemi |
     LEFT ~ opt(OUTER) ^^^ LeftOuter |
     RIGHT ~ opt(OUTER) ^^^ RightOuter |
     FULL ~ opt(OUTER) ^^^ FullOuter

http://git-wip-us.apache.org/repos/asf/spark/blob/73cd1f8
git commit: [SPARK-1495][SQL] Add support for left semi join
Repository: spark
Updated Branches:
  refs/heads/master 35630c86f -> 0cf600280

[SPARK-1495][SQL] Add support for left semi join

Just submitting another solution for #395.

Author: Daoyuan
Author: Michael Armbrust
Author: Daoyuan Wang

Closes #837 from adrian-wang/left-semi-join-support and squashes the following commits:

d39cd12 [Daoyuan Wang] Merge pull request #1 from marmbrus/pr/837
6713c09 [Michael Armbrust] Better debugging for failed query tests.
035b73e [Michael Armbrust] Add test for left semi that can't be done with a hash join.
5ec6fa4 [Michael Armbrust] Add left semi to SQL Parser.
4c726e5 [Daoyuan] improvement according to Michael
8d4a121 [Daoyuan] add golden files for leftsemijoin
83a3c8a [Daoyuan] scala style fix
14cff80 [Daoyuan] add support for left semi join

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0cf60028
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0cf60028
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0cf60028

Branch: refs/heads/master
Commit: 0cf600280167a94faec75736223256e8f2e48085
Parents: 35630c8
Author: Daoyuan
Authored: Mon Jun 9 11:31:36 2014 -0700
Committer: Michael Armbrust
Committed: Mon Jun 9 11:31:36 2014 -0700

----------------------------------------------------------------------
 .../apache/spark/sql/catalyst/SqlParser.scala   |   2 +
 .../spark/sql/catalyst/planning/patterns.scala  |   5 +
 .../spark/sql/catalyst/plans/joinTypes.scala    |   1 +
 .../catalyst/plans/logical/basicOperators.scala |   9 +-
 .../scala/org/apache/spark/sql/SQLContext.scala |   1 +
 .../spark/sql/execution/SparkStrategies.scala   |  16 +++
 .../org/apache/spark/sql/execution/joins.scala  | 131 +++
 .../scala/org/apache/spark/sql/QueryTest.scala  |   2 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala    |   7 +
 .../org/apache/spark/sql/hive/HiveContext.scala |   1 +
 .../org/apache/spark/sql/hive/HiveQl.scala      |   1 +
 ...tsemijoin-0-80b6466213face7fbcb0de044611e1f5 |   0
 ...tsemijoin-1-d1f6a3dea28a5f0fee08026bf33d9129 |   0
 ...semijoin-10-89737a8857b5b61cc909e0c797f86aea |   4 +
 ...semijoin-11-80b6466213face7fbcb0de044611e1f5 |   0
 ...semijoin-12-d1f6a3dea28a5f0fee08026bf33d9129 |   0
 ...tsemijoin-2-43d53504df013e6b35f81811138a167a |   1 +
 ...tsemijoin-3-b07d292423312aafa5e5762a579decd2 |   0
 ...tsemijoin-4-3ac2226efe7cb5d999c1c5e4ac2114be |   0
 ...tsemijoin-5-9c307c0559d735960ce77efa95b2b17b |   0
 ...tsemijoin-6-82921fc96eef547ec0f71027ee88298c |   0
 ...tsemijoin-7-b30aa3b4a45db6b64bb46b4d9bd32ff0 |   0
 ...eftsemijoin-8-73cad58a10a1483ccb15e94a857013 |   4 +
 ...tsemijoin-9-c5efa6b8771a51610d655be461670e1e |   2 +
 ...mijoin_mr-0-7087fb6281a34d00f1812d2ff4ba8b75 |   0
 ...mijoin_mr-1-aa3f07f028027ffd13ab5535dc821593 |   0
 ...ijoin_mr-10-9914f44ecb6ae7587b62e5349ff60d04 |   1 +
 ...ijoin_mr-11-2027ecb1495d5550c5d56abf6b95b0a7 |   2 +
 ...mijoin_mr-2-3f65953ae60375156367c54533978782 |   0
 ...mijoin_mr-3-645cf8b871c9b27418d6fa1d1bda9a52 |   0
 ...mijoin_mr-4-333895fe6abca27c8edb5c91bfe10d2f |   2 +
 ...mijoin_mr-5-896d0948c1df849df9764a6d8ad8fff9 |  20 +++
 ...mijoin_mr-6-b1e2ade89ae898650f0be4f796d8947b |   1 +
 ...mijoin_mr-7-8e9c2969b999557363e40f9ebb3f6d7c |   1 +
 ...mijoin_mr-8-c61b972d4409babe41d8963e841af45b |   1 +
 ...mijoin_mr-9-2027ecb1495d5550c5d56abf6b95b0a7 |   2 +
 .../hive/execution/HiveCompatibilitySuite.scala |   2 +
 37 files changed, 216 insertions(+), 3 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/0cf60028/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index a404e74..cc65012 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -131,6 +131,7 @@ class SqlParser extends StandardTokenParsers with PackratParsers {
   protected val OUTER = Keyword("OUTER")
   protected val RIGHT = Keyword("RIGHT")
   protected val SELECT = Keyword("SELECT")
+  protected val SEMI = Keyword("SEMI")
   protected val STRING = Keyword("STRING")
   protected val SUM = Keyword("SUM")
   protected val TRUE = Keyword("TRUE")
@@ -241,6 +242,7 @@ class SqlParser extends StandardTokenParsers with PackratParsers {
   protected lazy val joinType: Parser[JoinType] =
     INNER ^^^ Inner |
+    LEFT ~ SEMI ^^^ LeftSemi |
     LEFT ~ opt(OUTER) ^^^ LeftOuter |
     RIGHT ~ opt(OUTER) ^^^ RightOuter |
     FULL ~ opt(OUTER) ^^^ FullOuter

http://git-wip-us.apache.org/repos/asf/spark/blob/0cf60028/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala
----------------------------------------------------------------------
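For intuition about what the new operator computes, the semantics of a left semi join can be sketched with plain Scala collections — a build-side key set, mirroring the hash-join variant the PR adds. The data here is made up:

```scala
// Plain-collections sketch of left semi join semantics: keep each left row
// that has at least one key match on the right, emitting it exactly once --
// unlike an inner join, which would duplicate it per matching right row.
object LeftSemiJoinSemantics extends App {
  val left  = Seq(1 -> "a", 2 -> "b", 3 -> "c")
  val right = Seq(2 -> "x", 2 -> "y", 4 -> "z")

  val rightKeys = right.map(_._1).toSet          // "build" side, as in the hash join
  val semiJoined = left.filter { case (k, _) => rightKeys(k) }

  println(semiJoined)  // List((2,b)) -- one row, despite two matches on the right
}
```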
git commit: SPARK-1944 Document --verbose in spark-shell -h
Repository: spark
Updated Branches:
  refs/heads/master 6113ac155 -> 35630c86f

SPARK-1944 Document --verbose in spark-shell -h

https://issues.apache.org/jira/browse/SPARK-1944

Author: Andrew Ash

Closes #1020 from ash211/SPARK-1944 and squashes the following commits:

a831c4d [Andrew Ash] SPARK-1944 Document --verbose in spark-shell -h

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/35630c86
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/35630c86
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/35630c86

Branch: refs/heads/master
Commit: 35630c86ff0e27862c9d902887eb0a24d25867ae
Parents: 6113ac1
Author: Andrew Ash
Authored: Mon Jun 9 10:21:21 2014 -0700
Committer: Reynold Xin
Committed: Mon Jun 9 10:21:21 2014 -0700

----------------------------------------------------------------------
 .../main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala | 3 +++
 1 file changed, 3 insertions(+)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/35630c86/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index 153eee3..f1032ea 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -360,6 +360,9 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
         |
         |  --executor-memory MEM       Memory per executor (e.g. 1000M, 2G) (Default: 1G).
         |
+        |  --help, -h                  Show this help message and exit
+        |  --verbose, -v               Print additional debug output
+        |
         | Spark standalone with cluster deploy mode only:
         |  --driver-cores NUM          Cores for driver (Default: 1).
         |  --supervise                 If given, restarts the driver on failure.
git commit: SPARK-1944 Document --verbose in spark-shell -h
Repository: spark
Updated Branches:
  refs/heads/branch-1.0 502a8f795 -> a5848d325

SPARK-1944 Document --verbose in spark-shell -h

https://issues.apache.org/jira/browse/SPARK-1944

Author: Andrew Ash

Closes #1020 from ash211/SPARK-1944 and squashes the following commits:

a831c4d [Andrew Ash] SPARK-1944 Document --verbose in spark-shell -h

(cherry picked from commit 35630c86ff0e27862c9d902887eb0a24d25867ae)
Signed-off-by: Reynold Xin

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a5848d32
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a5848d32
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a5848d32

Branch: refs/heads/branch-1.0
Commit: a5848d325ae0909072800cbb3ea9ad73a3708965
Parents: 502a8f7
Author: Andrew Ash
Authored: Mon Jun 9 10:21:21 2014 -0700
Committer: Reynold Xin
Committed: Mon Jun 9 10:21:45 2014 -0700

----------------------------------------------------------------------
 .../main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala | 3 +++
 1 file changed, 3 insertions(+)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/a5848d32/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index 153eee3..f1032ea 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -360,6 +360,9 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
         |
         |  --executor-memory MEM       Memory per executor (e.g. 1000M, 2G) (Default: 1G).
         |
+        |  --help, -h                  Show this help message and exit
+        |  --verbose, -v               Print additional debug output
+        |
         | Spark standalone with cluster deploy mode only:
         |  --driver-cores NUM          Cores for driver (Default: 1).
         |  --supervise                 If given, restarts the driver on failure.
git commit: [SPARK-1308] Add getNumPartitions to pyspark RDD
Repository: spark
Updated Branches:
  refs/heads/master 32ee9f066 -> 6113ac155

[SPARK-1308] Add getNumPartitions to pyspark RDD

Add getNumPartitions to pyspark RDD to provide an intuitive way to get the number of partitions in an RDD, like we can do in Scala today.

Author: Syed Hashmi

Closes #995 from syedhashmi/master and squashes the following commits:

de0ed5e [Syed Hashmi] [SPARK-1308] Add getNumPartitions to pyspark RDD

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6113ac15
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6113ac15
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6113ac15

Branch: refs/heads/master
Commit: 6113ac1559d62c828dfbf08ef0f7f172c24cf7f5
Parents: 32ee9f0
Author: Syed Hashmi
Authored: Mon Jun 9 00:08:40 2014 -0700
Committer: Reynold Xin
Committed: Mon Jun 9 00:08:40 2014 -0700

----------------------------------------------------------------------
 python/pyspark/rdd.py | 45
 1 file changed, 27 insertions(+), 18 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/6113ac15/python/pyspark/rdd.py
----------------------------------------------------------------------
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index ca0a955..9c69c79 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -250,7 +250,7 @@ class RDD(object):
     def map(self, f, preservesPartitioning=False):
         """
         Return a new RDD by applying a function to each element of this RDD.
-
+
         >>> rdd = sc.parallelize(["b", "a", "c"])
         >>> sorted(rdd.map(lambda x: (x, 1)).collect())
         [('a', 1), ('b', 1), ('c', 1)]
@@ -312,6 +312,15 @@ class RDD(object):
                       "use mapPartitionsWithIndex instead", DeprecationWarning, stacklevel=2)
         return self.mapPartitionsWithIndex(f, preservesPartitioning)
 
+    def getNumPartitions(self):
+        """
+        Returns the number of partitions in RDD
+        >>> rdd = sc.parallelize([1, 2, 3, 4], 2)
+        >>> rdd.getNumPartitions()
+        2
+        """
+        return self._jrdd.splits().size()
+
     def filter(self, f):
         """
         Return a new RDD containing only the elements that satisfy a predicate.
@@ -413,9 +422,9 @@ class RDD(object):
 
     def intersection(self, other):
         """
-        Return the intersection of this RDD and another one. The output will not 
+        Return the intersection of this RDD and another one. The output will not
         contain any duplicate elements, even if the input RDDs did.
-
+
         Note that this method performs a shuffle internally.
 
         >>> rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5])
@@ -571,14 +580,14 @@ class RDD(object):
         """
         Applies a function to each partition of this RDD.
 
-        >>> def f(iterator): 
-        ...     for x in iterator: 
-        ...         print x 
+        >>> def f(iterator):
+        ...     for x in iterator:
+        ...         print x
         ...     yield None
         >>> sc.parallelize([1, 2, 3, 4, 5]).foreachPartition(f)
         """
         self.mapPartitions(f).collect()  # Force evaluation
-
+
     def collect(self):
         """
         Return a list that contains all of the elements in this RDD.
@@ -673,7 +682,7 @@ class RDD(object):
                 yield acc
 
         return self.mapPartitions(func).fold(zeroValue, combOp)
-
+
     def max(self):
         """
@@ -692,7 +701,7 @@ class RDD(object):
         1.0
         """
         return self.reduce(min)
-
+
     def sum(self):
         """
         Add up the elements in this RDD.
@@ -786,7 +795,7 @@ class RDD(object):
                     m1[k] += v
             return m1
         return self.mapPartitions(countPartition).reduce(mergeMaps)
-
+
     def top(self, num):
         """
         Get the top N elements from a RDD.
@@ -814,7 +823,7 @@ class RDD(object):
     def takeOrdered(self, num, key=None):
         """
         Get the N elements from a RDD ordered in ascending order or as specified
-        by the optional key function. 
+        by the optional key function.
 
         >>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7]).takeOrdered(6)
         [1, 2, 3, 4, 5, 6]
@@ -834,7 +843,7 @@ class RDD(object):
             if key_ != None:
                 x = [i[1] for i in x]
             return x
-
+
         def merge(a, b):
             return next(topNKeyedElems(a + b))
         result = self.mapPartitions(lambda i: topNKeyedElems(i, key)).reduce(merge)
@@ -1169,12 +1178,12 @@ class RDD(object):
                     combiners[k] = mergeCombiners(combiners[k], v)
             return combiners.iteritems()
         return shuffled.mapPartitions(_mergeCombiners)
-
+
     def foldByKey(self, zeroValue, func, numPartitions=None):
         """
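The Scala-side idiom this PR mirrors is rdd.partitions.size. A minimal local sketch, assuming Spark is on the classpath; the app name and master setting are illustrative:

```scala
import org.apache.spark.{SparkConf, SparkContext}

// Local sketch: the Scala equivalent of pyspark's new rdd.getNumPartitions().
object NumPartitionsDemo extends App {
  val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("demo"))
  val rdd = sc.parallelize(1 to 4, numSlices = 2)
  println(rdd.partitions.size)  // 2 -- what getNumPartitions() returns in pyspark
  sc.stop()
}
```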