spark git commit: [Minor] Fix doc typo for describing primitiveTerm effectiveness condition
Repository: spark Updated Branches: refs/heads/branch-1.3 58e7198f0 - f92876a54 [Minor] Fix doc typo for describing primitiveTerm effectiveness condition It should be `true` instead of `false`? Author: Liang-Chi Hsieh vii...@gmail.com Closes #4762 from viirya/doc_fix and squashes the following commits: 2e37482 [Liang-Chi Hsieh] Fix doc. (cherry picked from commit 3f9def81170c24f24f4a6b7ca7905de4f75e11e0) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f92876a5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f92876a5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f92876a5 Branch: refs/heads/branch-1.3 Commit: f92876a5430d2b87e2df66a4eddc833d568e3e1a Parents: 58e7198 Author: Liang-Chi Hsieh vii...@gmail.com Authored: Mon Mar 2 13:11:17 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 13:11:38 2015 -0800 -- .../spark/sql/catalyst/expressions/codegen/CodeGenerator.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f92876a5/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 1f80d84..c347780 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -121,7 +121,7 @@ abstract class CodeGenerator[InType : AnyRef, OutType : AnyRef] extends Loggin * @param nullTerm A term that holds a boolean value representing whether the expression evaluated * to null. * @param primitiveTerm A term for a possible primitive value of the result of the evaluation. Not - * valid if `nullTerm` is set to `false`. + * valid if `nullTerm` is set to `true`. * @param objectTerm A possibly boxed version of the result of evaluating this expression. */ protected case class EvaluatedExpression( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r1663444 - in /spark/site: examples.html index.html
Author: srowen Date: Mon Mar 2 22:06:52 2015 New Revision: 1663444 URL: http://svn.apache.org/r1663444 Log: SPARK-4992 Improve var naming in Python example; change other var names to match Modified: spark/site/examples.html spark/site/index.html Modified: spark/site/examples.html URL: http://svn.apache.org/viewvc/spark/site/examples.html?rev=1663444r1=1663443r2=1663444view=diff == --- spark/site/examples.html (original) +++ spark/site/examples.html Mon Mar 2 22:06:52 2015 @@ -187,8 +187,8 @@ previous ones, and emactions/em, whi div class=tab-content div class=tab-pane tab-pane-python active div class=code code-tab -file = spark.textFile(span class=stringhdfs://.../span)br / -errors = file.span class=sparkopfilter/span(span class=closurelambda line: ERROR in line/span)br / +text_file = spark.textFile(span class=stringhdfs://.../span)br / +errors = text_file.span class=sparkopfilter/span(span class=closurelambda line: ERROR in line/span)br / span class=comment# Count all the errors/spanbr / errors.span class=sparkopcount/span()br / span class=comment# Count errors mentioning MySQL/spanbr / @@ -199,8 +199,8 @@ previous ones, and emactions/em, whi /div div class=tab-pane tab-pane-scala div class=code code-tab -span class=keywordval/span file = spark.textFile(span class=stringhdfs://.../span)br / -span class=keywordval/span errors = file.span class=sparkopfilter/span(span class=closureline =gt; line.contains(ERROR)/span)br / +span class=keywordval/span textFile = spark.textFile(span class=stringhdfs://.../span)br / +span class=keywordval/span errors = textFile.span class=sparkopfilter/span(span class=closureline =gt; line.contains(ERROR)/span)br / span class=comment// Count all the errors/spanbr / errors.span class=sparkopcount/span()br / span class=comment// Count errors mentioning MySQL/spanbr / @@ -211,8 +211,8 @@ previous ones, and emactions/em, whi /div div class=tab-pane tab-pane-java div class=code code-tab -JavaRDDlt;Stringgt; file = spark.textFile(span class=stringhdfs://.../span);br / -JavaRDDlt;Stringgt; errors = file.span class=sparkopfilter/span(span class=closurenew Functionlt;String, Booleangt;() {br / +JavaRDDlt;Stringgt; textFile = spark.textFile(span class=stringhdfs://.../span);br / +JavaRDDlt;Stringgt; errors = textFile.span class=sparkopfilter/span(span class=closurenew Functionlt;String, Booleangt;() {br / nbsp;nbsp;public Boolean call(String s) { return s.contains(ERROR); }br / }/span);br / span class=comment// Count all the errors/spanbr / @@ -272,8 +272,8 @@ previous ones, and emactions/em, whi div class=tab-content div class=tab-pane tab-pane-python active div class=code code-tab -file = spark.textFile(span class=stringhdfs://.../span)br / -counts = file.span class=sparkopflatMap/span(span class=closurelambda line: line.split( )/span) \br / +text_file = spark.textFile(span class=stringhdfs://.../span)br / +counts = text_file.span class=sparkopflatMap/span(span class=closurelambda line: line.split( )/span) \br / nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;.span class=sparkopmap/span(span class=closurelambda word: (word, 1)/span) \br / nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;.span class=sparkopreduceByKey/span(span class=closurelambda a, b: a + b/span)br / counts.span class=sparkopsaveAsTextFile/span(span class=stringhdfs://.../span) @@ -281,8 +281,8 @@ previous ones, and emactions/em, whi /div div class=tab-pane tab-pane-scala div class=code code-tab -span class=keywordval/span file = spark.textFile(span 
class=stringhdfs://.../span)br / -span class=keywordval/span counts = file.span class=sparkopflatMap/span(span class=closureline =gt; line.split( )/span)br / +span class=keywordval/span textFile = spark.textFile(span class=stringhdfs://.../span)br / +span class=keywordval/span counts = textFile.span class=sparkopflatMap/span(span class=closureline =gt; line.split( )/span)br / nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;.span class=sparkopmap/span(span class=closureword =gt; (word, 1)/span)br / nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;.span class=sparkopreduceByKey/span(span class=closure_ + _/span)br / counts.span class=sparkopsaveAsTextFile/span(span class=stringhdfs://.../span) @@ -290,8 +290,8 @@ previous ones, and emactions/em, whi /div div class=tab-pane tab-pane-java div class=code code-tab -JavaRDDlt;Stringgt; file = spark.textFile(span class=stringhdfs://.../span);br / -JavaRDDlt;Stringgt; words = file.span class=sparkopflatMap/span(span class=closurenew FlatMapFunctionlt;String, Stringgt;() {br / +
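For readability, here is the renamed Scala word-count example that the HTML diff above encodes, written out as plain code. The SparkContext construction and the local master setting are added only to make the snippet self-contained; the elided "hdfs://..." paths are kept exactly as in the site example.

```scala
import org.apache.spark.{SparkConf, SparkContext}

// Setup added for self-containment; the site snippet assumes an existing context.
val sc = new SparkContext(new SparkConf().setAppName("WordCount").setMaster("local[*]"))

// The renamed example from the diff: the variable `file` is now `textFile`.
val textFile = sc.textFile("hdfs://...")
val counts = textFile.flatMap(line => line.split(" "))
                     .map(word => (word, 1))
                     .reduceByKey(_ + _)
counts.saveAsTextFile("hdfs://...")
```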
spark git commit: [Minor] Fix doc typo for describing primitiveTerm effectiveness condition
Repository: spark Updated Branches: refs/heads/master 0b472f60c - 3f9def811 [Minor] Fix doc typo for describing primitiveTerm effectiveness condition It should be `true` instead of `false`? Author: Liang-Chi Hsieh vii...@gmail.com Closes #4762 from viirya/doc_fix and squashes the following commits: 2e37482 [Liang-Chi Hsieh] Fix doc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3f9def81 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3f9def81 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3f9def81 Branch: refs/heads/master Commit: 3f9def81170c24f24f4a6b7ca7905de4f75e11e0 Parents: 0b472f6 Author: Liang-Chi Hsieh vii...@gmail.com Authored: Mon Mar 2 13:11:17 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 13:11:17 2015 -0800 -- .../spark/sql/catalyst/expressions/codegen/CodeGenerator.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3f9def81/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 1f80d84..c347780 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -121,7 +121,7 @@ abstract class CodeGenerator[InType : AnyRef, OutType : AnyRef] extends Loggin * @param nullTerm A term that holds a boolean value representing whether the expression evaluated * to null. * @param primitiveTerm A term for a possible primitive value of the result of the evaluation. Not - * valid if `nullTerm` is set to `false`. + * valid if `nullTerm` is set to `true`. * @param objectTerm A possibly boxed version of the result of evaluating this expression. */ protected case class EvaluatedExpression( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
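The corrected comment is easier to see with a tiny, self-contained illustration of the convention it describes: the primitive result slot is only meaningful when the null flag is not set. The names below mirror the scaladoc (nullTerm, primitiveTerm); this is an illustrative sketch, not the generated code itself.

```scala
// Illustrative only: mirrors the contract stated in the fixed scaladoc.
def readResult(nullTerm: Boolean, primitiveTerm: Int): Option[Int] =
  if (nullTerm) None            // primitiveTerm must not be read when nullTerm is true
  else Some(primitiveTerm)      // valid precisely because nullTerm is false
```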
spark git commit: [SPARK-6050] [yarn] Relax matching of vcore count in received containers.
Repository: spark Updated Branches: refs/heads/branch-1.3 a83b9bbb2 - 650d1e7fb [SPARK-6050] [yarn] Relax matching of vcore count in received containers. Some YARN configurations return a vcore count for allocated containers that does not match the requested resource. That means Spark would always ignore those containers. So relax the the matching of the vcore count to allow the Spark jobs to run. Author: Marcelo Vanzin van...@cloudera.com Closes #4818 from vanzin/SPARK-6050 and squashes the following commits: 991c803 [Marcelo Vanzin] Remove config option, standardize on legacy behavior (no vcore matching). 8c9c346 [Marcelo Vanzin] Restrict lax matching to vcores only. 3359692 [Marcelo Vanzin] [SPARK-6050] [yarn] Add config option to do lax resource matching. (cherry picked from commit 6b348d90f475440c285a4b636134ffa9351580b9) Signed-off-by: Thomas Graves tgra...@apache.org Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/650d1e7f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/650d1e7f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/650d1e7f Branch: refs/heads/branch-1.3 Commit: 650d1e7fb13545d0d102de9bb6e11ab4f9ef6359 Parents: a83b9bb Author: Marcelo Vanzin van...@cloudera.com Authored: Mon Mar 2 16:41:43 2015 -0600 Committer: Thomas Graves tgra...@apache.org Committed: Mon Mar 2 16:42:02 2015 -0600 -- .../org/apache/spark/deploy/yarn/YarnAllocator.scala | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/650d1e7f/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 12c62a6..55bfbcd 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -290,8 +290,14 @@ private[yarn] class YarnAllocator( location: String, containersToUse: ArrayBuffer[Container], remaining: ArrayBuffer[Container]): Unit = { +// SPARK-6050: certain Yarn configurations return a virtual core count that doesn't match the +// request; for example, capacity scheduler + DefaultResourceCalculator. So match on requested +// memory, but use the asked vcore count for matching, effectively disabling matching on vcore +// count. +val matchingResource = Resource.newInstance(allocatedContainer.getResource.getMemory, + resource.getVirtualCores) val matchingRequests = amClient.getMatchingRequests(allocatedContainer.getPriority, location, - allocatedContainer.getResource) + matchingResource) // Match the allocation to a request if (!matchingRequests.isEmpty) { @@ -318,7 +324,7 @@ private[yarn] class YarnAllocator( assert(container.getResource.getMemory = resource.getMemory) logInfo(Launching container %s for on host %s.format(containerId, executorHostname)) - executorIdToContainer(executorId) = container + executorIdToContainer(executorId) = container val containerSet = allocatedHostToContainersMap.getOrElseUpdate(executorHostname, new HashSet[ContainerId]) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-5522] Accelerate the History Server start
Repository: spark Updated Branches: refs/heads/master 6b348d90f - 26c1c56de [SPARK-5522] Accelerate the Histroty Server start When starting the history server, all the log files will be fetched and parsed in order to get the applications' meta data e.g. App Name, Start Time, Duration, etc. In our production cluster, there exist 2600 log files (160G) in HDFS and it costs 3 hours to restart the history server, which is a little bit too long for us. It would be better, if the history server can show logs with missing information during start-up and fill the missing information after fetching and parsing a log file. Author: guliangliang guliangli...@qiyi.com Closes #4525 from marsishandsome/Spark5522 and squashes the following commits: a865c11 [guliangliang] fix bug2 4340c2b [guliangliang] fix bug af92a5a [guliangliang] [SPARK-5522] Accelerate the Histroty Server start Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/26c1c56d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/26c1c56d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/26c1c56d Branch: refs/heads/master Commit: 26c1c56dea5d4160913bb65bb743aeb63fee3240 Parents: 6b348d9 Author: guliangliang guliangli...@qiyi.com Authored: Mon Mar 2 15:33:23 2015 -0800 Committer: Andrew Or and...@databricks.com Committed: Mon Mar 2 15:33:23 2015 -0800 -- .../deploy/history/FsHistoryProvider.scala | 115 --- 1 file changed, 74 insertions(+), 41 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/26c1c56d/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 3e3d6ff..c5fab1d 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -18,22 +18,23 @@ package org.apache.spark.deploy.history import java.io.{IOException, BufferedInputStream, FileNotFoundException, InputStream} -import java.util.concurrent.{Executors, TimeUnit} +import java.util.concurrent.{ExecutorService, Executors, TimeUnit} import scala.collection.mutable import scala.concurrent.duration.Duration import com.google.common.util.concurrent.ThreadFactoryBuilder -import org.apache.hadoop.fs.{FileStatus, Path} +import com.google.common.util.concurrent.MoreExecutors import org.apache.hadoop.fs.permission.AccessControlException - -import org.apache.spark.{Logging, SecurityManager, SparkConf} +import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.io.CompressionCodec import org.apache.spark.scheduler._ import org.apache.spark.ui.SparkUI import org.apache.spark.util.Utils +import org.apache.spark.{Logging, SecurityManager, SparkConf} + /** * A class that provides application history from event logs stored in the file system. @@ -98,6 +99,17 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis } } + /** + * An Executor to fetch and parse log files. 
+ */ + private val replayExecutor: ExecutorService = { +if (!conf.contains(spark.testing)) { + Executors.newSingleThreadExecutor(Utils.namedThreadFactory(log-replay-executor)) +} else { + MoreExecutors.sameThreadExecutor() +} + } + initialize() private def initialize(): Unit = { @@ -171,10 +183,10 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis */ private[history] def checkForLogs(): Unit = { try { - var newLastModifiedTime = lastModifiedTime val statusList = Option(fs.listStatus(new Path(logDir))).map(_.toSeq) .getOrElse(Seq[FileStatus]()) - val logInfos = statusList + var newLastModifiedTime = lastModifiedTime + val logInfos: Seq[FileStatus] = statusList .filter { entry = try { getModificationTime(entry).map { time = @@ -189,48 +201,69 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis false } } -.flatMap { entry = - try { -Some(replay(entry, new ReplayListenerBus())) - } catch { -case e: Exception = - logError(sFailed to load application log data from $entry., e) - None - } -} -.sortWith(compareAppInfo) +.flatMap { entry = Some(entry) } +.sortWith { case (entry1, entry2) = + val mod1 = getModificationTime(entry1).getOrElse(-1L) + val
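The core of the change is visible in the diff: log replay moves onto a dedicated single-threaded executor so the application listing can be served before every log is fully parsed. Below is a minimal sketch of that pattern using only JDK executors; the names scheduleReplay and logPath are hypothetical, and this is not the FsHistoryProvider code itself.

```scala
import java.util.concurrent.{ExecutorService, Executors}

// One background thread parses event logs; callers return immediately.
val replayExecutor: ExecutorService = Executors.newSingleThreadExecutor()

// Hypothetical helper, for illustration only.
def scheduleReplay(logPath: String): Unit = {
  replayExecutor.submit(new Runnable {
    override def run(): Unit = {
      // Placeholder body: parse the event log at logPath and update the
      // application listing once the data is available.
      println(s"replaying $logPath")
    }
  })
}
```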
svn commit: r1663449 - /spark/site/index.html
Author: srowen Date: Mon Mar 2 22:18:42 2015 New Revision: 1663449 URL: http://svn.apache.org/r1663449 Log: SPARK-4992 follow-up to inhibit undesired text wrapping in Python example Modified: spark/site/index.html Modified: spark/site/index.html URL: http://svn.apache.org/viewvc/spark/site/index.html?rev=1663449&r1=1663448&r2=1663449&view=diff ============================================================================== --- spark/site/index.html (original) +++ spark/site/index.html Mon Mar 2 22:18:42 2015 @@ -214,7 +214,7 @@ <div class="code"> text_file = spark.textFile(<span class="string">"hdfs://..."</span>)<br /> &nbsp;<br /> -text_file.<span class="sparkop">flatMap</span>(<span class="closure">lambda line: line.split()</span>)<br /> +text_file.<span class="sparkop">flatMap</span>(<span class="closure">lambda&nbsp;line:&nbsp;line.split()</span>)<br /> &nbsp;&nbsp;&nbsp;&nbsp;.<span class="sparkop">map</span>(<span class="closure">lambda word: (word, 1)</span>)<br /> &nbsp;&nbsp;&nbsp;&nbsp;.<span class="sparkop">reduceByKey</span>(<span class="closure">lambda a, b: a+b</span>) </div> - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-6050] [yarn] Relax matching of vcore count in received containers.
Repository: spark Updated Branches: refs/heads/master 582e5a24c - 6b348d90f [SPARK-6050] [yarn] Relax matching of vcore count in received containers. Some YARN configurations return a vcore count for allocated containers that does not match the requested resource. That means Spark would always ignore those containers. So relax the the matching of the vcore count to allow the Spark jobs to run. Author: Marcelo Vanzin van...@cloudera.com Closes #4818 from vanzin/SPARK-6050 and squashes the following commits: 991c803 [Marcelo Vanzin] Remove config option, standardize on legacy behavior (no vcore matching). 8c9c346 [Marcelo Vanzin] Restrict lax matching to vcores only. 3359692 [Marcelo Vanzin] [SPARK-6050] [yarn] Add config option to do lax resource matching. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6b348d90 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6b348d90 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6b348d90 Branch: refs/heads/master Commit: 6b348d90f475440c285a4b636134ffa9351580b9 Parents: 582e5a2 Author: Marcelo Vanzin van...@cloudera.com Authored: Mon Mar 2 16:41:43 2015 -0600 Committer: Thomas Graves tgra...@apache.org Committed: Mon Mar 2 16:41:43 2015 -0600 -- .../org/apache/spark/deploy/yarn/YarnAllocator.scala | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6b348d90/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 12c62a6..55bfbcd 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -290,8 +290,14 @@ private[yarn] class YarnAllocator( location: String, containersToUse: ArrayBuffer[Container], remaining: ArrayBuffer[Container]): Unit = { +// SPARK-6050: certain Yarn configurations return a virtual core count that doesn't match the +// request; for example, capacity scheduler + DefaultResourceCalculator. So match on requested +// memory, but use the asked vcore count for matching, effectively disabling matching on vcore +// count. +val matchingResource = Resource.newInstance(allocatedContainer.getResource.getMemory, + resource.getVirtualCores) val matchingRequests = amClient.getMatchingRequests(allocatedContainer.getPriority, location, - allocatedContainer.getResource) + matchingResource) // Match the allocation to a request if (!matchingRequests.isEmpty) { @@ -318,7 +324,7 @@ private[yarn] class YarnAllocator( assert(container.getResource.getMemory = resource.getMemory) logInfo(Launching container %s for on host %s.format(containerId, executorHostname)) - executorIdToContainer(executorId) = container + executorIdToContainer(executorId) = container val containerSet = allocatedHostToContainersMap.getOrElseUpdate(executorHostname, new HashSet[ContainerId]) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
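To make the fix concrete, the sketch below builds the matching resource the way the diff describes: allocated memory combined with the requested vcore count, so vcores no longer participate in request matching. The numeric values are made-up examples; Resource.newInstance is the Hadoop YARN factory used in the patch.

```scala
import org.apache.hadoop.yarn.api.records.Resource

// Assumed example values: Spark asked for 2048 MB / 4 vcores, but the
// scheduler (e.g. capacity scheduler with DefaultResourceCalculator) reports
// the allocated container with 1 vcore.
val requested = Resource.newInstance(2048, 4)
val allocated = Resource.newInstance(2048, 1)

// Resource used to look up the original request: allocated memory plus the
// requested vcores, which effectively disables vcore matching.
val matchingResource =
  Resource.newInstance(allocated.getMemory, requested.getVirtualCores)
```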
spark git commit: SPARK-5390 [DOCS] Encourage users to post on Stack Overflow in Community Docs
Repository: spark Updated Branches: refs/heads/branch-1.3 54ac24365 - 58e7198f0 SPARK-5390 [DOCS] Encourage users to post on Stack Overflow in Community Docs Point Community to main Spark Community page; mention SO tag apache-spark. Separately, the Apache site can be updated to mention, under Mailing Lists: StackOverflow also has an apache-spark tag for Spark QA. or similar. Author: Sean Owen so...@cloudera.com Closes #4843 from srowen/SPARK-5390 and squashes the following commits: 3508ac6 [Sean Owen] Point Community to main Spark Community page; mention SO tag apache-spark (cherry picked from commit 0b472f60cdf4984ab5e28e6dbf12615e8997a448) Signed-off-by: Sean Owen so...@cloudera.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/58e7198f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/58e7198f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/58e7198f Branch: refs/heads/branch-1.3 Commit: 58e7198f008753ef3e3b81f3ef30080300645d50 Parents: 54ac243 Author: Sean Owen so...@cloudera.com Authored: Mon Mar 2 21:10:08 2015 + Committer: Sean Owen so...@cloudera.com Committed: Mon Mar 2 21:10:19 2015 + -- docs/index.md | 10 ++ 1 file changed, 2 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/58e7198f/docs/index.md -- diff --git a/docs/index.md b/docs/index.md index e006be6..0986398 100644 --- a/docs/index.md +++ b/docs/index.md @@ -115,6 +115,8 @@ options for deployment: * [Spark Homepage](http://spark.apache.org) * [Spark Wiki](https://cwiki.apache.org/confluence/display/SPARK) +* [Spark Community](http://spark.apache.org/community.html) resources, including local meetups +* [StackOverflow tag `apache-spark`](http://stackoverflow.com/questions/tagged/apache-spark) * [Mailing Lists](http://spark.apache.org/mailing-lists.html): ask questions about Spark here * [AMP Camps](http://ampcamp.berkeley.edu/): a series of training camps at UC Berkeley that featured talks and exercises about Spark, Spark Streaming, Mesos, and more. [Videos](http://ampcamp.berkeley.edu/3/), @@ -123,11 +125,3 @@ options for deployment: * [Code Examples](http://spark.apache.org/examples.html): more are also available in the `examples` subfolder of Spark ([Scala]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/scala/org/apache/spark/examples), [Java]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/java/org/apache/spark/examples), [Python]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/python)) - -# Community - -To get help using Spark or keep up with Spark development, sign up for the [user mailing list](http://spark.apache.org/mailing-lists.html). - -If you're in the San Francisco Bay Area, there's a regular [Spark meetup](http://www.meetup.com/spark-users/) every few weeks. Come by to meet the developers and other users. - -Finally, if you'd like to contribute code to Spark, read [how to contribute](contributing-to-spark.html). - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: SPARK-5390 [DOCS] Encourage users to post on Stack Overflow in Community Docs
Repository: spark Updated Branches: refs/heads/master d9a8bae77 - 0b472f60c SPARK-5390 [DOCS] Encourage users to post on Stack Overflow in Community Docs Point Community to main Spark Community page; mention SO tag apache-spark. Separately, the Apache site can be updated to mention, under Mailing Lists: StackOverflow also has an apache-spark tag for Spark QA. or similar. Author: Sean Owen so...@cloudera.com Closes #4843 from srowen/SPARK-5390 and squashes the following commits: 3508ac6 [Sean Owen] Point Community to main Spark Community page; mention SO tag apache-spark Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0b472f60 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0b472f60 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0b472f60 Branch: refs/heads/master Commit: 0b472f60cdf4984ab5e28e6dbf12615e8997a448 Parents: d9a8bae Author: Sean Owen so...@cloudera.com Authored: Mon Mar 2 21:10:08 2015 + Committer: Sean Owen so...@cloudera.com Committed: Mon Mar 2 21:10:08 2015 + -- docs/index.md | 10 ++ 1 file changed, 2 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0b472f60/docs/index.md -- diff --git a/docs/index.md b/docs/index.md index e006be6..0986398 100644 --- a/docs/index.md +++ b/docs/index.md @@ -115,6 +115,8 @@ options for deployment: * [Spark Homepage](http://spark.apache.org) * [Spark Wiki](https://cwiki.apache.org/confluence/display/SPARK) +* [Spark Community](http://spark.apache.org/community.html) resources, including local meetups +* [StackOverflow tag `apache-spark`](http://stackoverflow.com/questions/tagged/apache-spark) * [Mailing Lists](http://spark.apache.org/mailing-lists.html): ask questions about Spark here * [AMP Camps](http://ampcamp.berkeley.edu/): a series of training camps at UC Berkeley that featured talks and exercises about Spark, Spark Streaming, Mesos, and more. [Videos](http://ampcamp.berkeley.edu/3/), @@ -123,11 +125,3 @@ options for deployment: * [Code Examples](http://spark.apache.org/examples.html): more are also available in the `examples` subfolder of Spark ([Scala]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/scala/org/apache/spark/examples), [Java]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/java/org/apache/spark/examples), [Python]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/python)) - -# Community - -To get help using Spark or keep up with Spark development, sign up for the [user mailing list](http://spark.apache.org/mailing-lists.html). - -If you're in the San Francisco Bay Area, there's a regular [Spark meetup](http://www.meetup.com/spark-users/) every few weeks. Come by to meet the developers and other users. - -Finally, if you'd like to contribute code to Spark, read [how to contribute](contributing-to-spark.html). - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [DOCS] Refactored Dataframe join comment to use correct parameter ordering
Repository: spark Updated Branches: refs/heads/master af2effdd7 -> d9a8bae77 [DOCS] Refactored Dataframe join comment to use correct parameter ordering The API signature for join requires the JoinType to be the third parameter. The code examples provided for join show JoinType being provided as the 2nd parameter, resulting in errors (i.e. df1.join(df2, "outer", $"df1Key" === $"df2Key") ). The correct sample code is df1.join(df2, $"df1Key" === $"df2Key", "outer") Author: Paul Power paul.po...@peerside.com Closes #4847 from peerside/master and squashes the following commits: ebc1efa [Paul Power] Merge pull request #1 from peerside/peerside-patch-1 e353340 [Paul Power] Updated comments use correct sample code for Dataframe joins Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d9a8bae7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d9a8bae7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d9a8bae7 Branch: refs/heads/master Commit: d9a8bae77826a0cc77df29d85883e914d0f0b4f3 Parents: af2effd Author: Paul Power paul.po...@peerside.com Authored: Mon Mar 2 13:08:47 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 13:09:35 2015 -0800 -- sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d9a8bae7/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 060ab5e..f3aac08 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -337,11 +337,11 @@ class DataFrame protected[sql]( * {{{ * // Scala: * import org.apache.spark.sql.functions._ - * df1.join(df2, "outer", $"df1Key" === $"df2Key") + * df1.join(df2, $"df1Key" === $"df2Key", "outer") * * // Java: * import static org.apache.spark.sql.functions.*; - * df1.join(df2, "outer", col("df1Key") === col("df2Key")); + * df1.join(df2, col("df1Key").equalTo(col("df2Key")), "outer"); * }}} * * @param right Right side of the join. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
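For completeness, here is a runnable sketch of the corrected call order on Spark 1.3-era APIs: the join condition is the second argument and the join type string is the third. The data, column names, and local-mode setup are illustrative and not part of the commit.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

// Illustrative setup; any existing SparkContext/SQLContext works the same way.
val sc = new SparkContext(new SparkConf().setAppName("join-order").setMaster("local[*]"))
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._

// Tiny invented datasets, just to have something to join.
val df1 = sc.parallelize(Seq((1, "a"), (2, "b"))).toDF("df1Key", "v1")
val df2 = sc.parallelize(Seq((1, "x"), (3, "y"))).toDF("df2Key", "v2")

// Correct ordering: condition second, join type ("outer") third.
val joined = df1.join(df2, $"df1Key" === $"df2Key", "outer")
joined.show()
```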
spark git commit: [SPARK-6040][SQL] Fix the percent bug in tablesample
Repository: spark Updated Branches: refs/heads/branch-1.3 f92876a54 - a83b9bbb2 [SPARK-6040][SQL] Fix the percent bug in tablesample HiveQL expression like `select count(1) from src tablesample(1 percent);` means take 1% sample to select. But it means 100% in the current version of the Spark. Author: q00251598 qiyad...@huawei.com Closes #4789 from watermen/SPARK-6040 and squashes the following commits: 2453ebe [q00251598] check and adjust the fraction. (cherry picked from commit 582e5a24c55e8c876733537c9910001affc8b29b) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a83b9bbb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a83b9bbb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a83b9bbb Branch: refs/heads/branch-1.3 Commit: a83b9bbb242ff6dedf261b8838add5a391d5bd36 Parents: f92876a Author: q00251598 qiyad...@huawei.com Authored: Mon Mar 2 13:16:29 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 13:16:40 2015 -0800 -- .../main/scala/org/apache/spark/sql/hive/HiveQl.scala| 11 ++- .../apache/spark/sql/hive/execution/HiveQuerySuite.scala | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a83b9bbb/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 98263f6..ced99cd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -40,6 +40,7 @@ import org.apache.spark.sql.execution.ExplainCommand import org.apache.spark.sql.sources.DescribeCommand import org.apache.spark.sql.hive.execution.{HiveNativeCommand, DropTable, AnalyzeTable, HiveScriptIOSchema} import org.apache.spark.sql.types._ +import org.apache.spark.util.random.RandomSampler /* Implicit conversions */ import scala.collection.JavaConversions._ @@ -850,7 +851,15 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C case Token(TOK_TABLESPLITSAMPLE, Token(TOK_PERCENT, Nil) :: Token(fraction, Nil) :: Nil) = - Sample(fraction.toDouble, withReplacement = false, (math.random * 1000).toInt, relation) + // The range of fraction accepted by Sample is [0, 1]. Because Hive's block sampling + // function takes X PERCENT as the input and the range of X is [0, 100], we need to + // adjust the fraction. 
+ require( +fraction.toDouble = (0.0 - RandomSampler.roundingEpsilon) + fraction.toDouble = (100.0 + RandomSampler.roundingEpsilon), +sSampling fraction ($fraction) must be on interval [0, 100]) + Sample(fraction.toDouble / 100, withReplacement = false, (math.random * 1000).toInt, +relation) case Token(TOK_TABLEBUCKETSAMPLE, Token(numerator, Nil) :: Token(denominator, Nil) :: Nil) = http://git-wip-us.apache.org/repos/asf/spark/blob/a83b9bbb/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index bb0a67d..c0d21bc 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -467,6 +467,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { test(sampling) { sql(SELECT * FROM src TABLESAMPLE(0.1 PERCENT) s) +sql(SELECT * FROM src TABLESAMPLE(100 PERCENT) s) } test(DataFrame toString) { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
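The essence of the fix is a unit conversion plus a range check, as the new require() shows: Hive's TABLESAMPLE(x PERCENT) takes x in [0, 100], while the catalyst Sample operator expects a fraction in [0, 1]. Below is a minimal standalone sketch; the epsilon constant is illustrative, whereas the patch reuses RandomSampler.roundingEpsilon.

```scala
// Illustrative epsilon; Spark reuses RandomSampler.roundingEpsilon here.
val roundingEpsilon = 1e-6

def percentToFraction(percent: Double): Double = {
  require(
    percent >= 0.0 - roundingEpsilon && percent <= 100.0 + roundingEpsilon,
    s"Sampling fraction ($percent) must be on interval [0, 100]")
  percent / 100.0
}

// percentToFraction(1.0) == 0.01, so "TABLESAMPLE(1 PERCENT)" now samples
// roughly 1% of the table instead of all of it.
```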
spark git commit: [SPARK-6066] Make event log format easier to parse
Repository: spark Updated Branches: refs/heads/master 1a49496b4 - 6776cb33e [SPARK-6066] Make event log format easier to parse Some users have reported difficulty in parsing the new event log format. Since we embed the metadata in the beginning of the file, when we compress the event log we need to skip the metadata because we need that information to parse the log later. This means we'll end up with a partially compressed file if event logging compression is turned on. The old format looks like: ``` sparkVersion = 1.3.0 compressionCodec = org.apache.spark.io.LZFCompressionCodec === LOG_HEADER_END === // actual events, could be compressed bytes ``` The new format in this patch puts the compression codec in the log file name instead. It also removes the metadata header altogether along with the Spark version, which was not needed. The new file name looks something like: ``` app_without_compression app_123.lzf app_456.snappy ``` I tested this with and without compression, using different compression codecs and event logging directories. I verified that both the `Master` and the `HistoryServer` can render both compressed and uncompressed logs as before. Author: Andrew Or and...@databricks.com Closes #4821 from andrewor14/event-log-format and squashes the following commits: 8511141 [Andrew Or] Fix test 654883d [Andrew Or] Add back metadata with Spark version 7f537cd [Andrew Or] Address review feedback 7d6aa61 [Andrew Or] Make codec an extension 59abee9 [Andrew Or] Merge branch 'master' of github.com:apache/spark into event-log-format 27c9a6c [Andrew Or] Address review feedback 519e51a [Andrew Or] Address review feedback ef69276 [Andrew Or] Merge branch 'master' of github.com:apache/spark into event-log-format 88a091d [Andrew Or] Add tests for new format and file name f32d8d2 [Andrew Or] Fix tests 8db5a06 [Andrew Or] Embed metadata in the event log file name instead Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6776cb33 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6776cb33 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6776cb33 Branch: refs/heads/master Commit: 6776cb33ea691f7843b956b3e80979282967e826 Parents: 1a49496 Author: Andrew Or and...@databricks.com Authored: Mon Mar 2 16:34:32 2015 -0800 Committer: Patrick Wendell patr...@databricks.com Committed: Mon Mar 2 16:34:32 2015 -0800 -- .../scala/org/apache/spark/SparkContext.scala | 9 ++ .../spark/deploy/ApplicationDescription.scala | 10 +- .../deploy/history/FsHistoryProvider.scala | 22 +-- .../org/apache/spark/deploy/master/Master.scala | 8 +- .../org/apache/spark/io/CompressionCodec.scala | 21 ++- .../spark/scheduler/EventLoggingListener.scala | 162 +++ .../spark/scheduler/ReplayListenerBus.scala | 3 +- .../apache/spark/scheduler/SparkListener.scala | 5 + .../spark/scheduler/SparkListenerBus.scala | 1 + .../cluster/SparkDeploySchedulerBackend.scala | 2 +- .../org/apache/spark/util/JsonProtocol.scala| 14 ++ .../deploy/history/FsHistoryProviderSuite.scala | 69 +--- .../scheduler/EventLoggingListenerSuite.scala | 62 --- .../spark/scheduler/ReplayListenerSuite.scala | 13 +- 14 files changed, 212 insertions(+), 189 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6776cb33/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 3cd0c21..e231e83 100644 --- 
a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -51,6 +51,7 @@ import org.apache.spark.deploy.{LocalSparkCluster, SparkHadoopUtil} import org.apache.spark.executor.TriggerThreadDump import org.apache.spark.input.{StreamInputFormat, PortableDataStream, WholeTextFileInputFormat, FixedLengthBinaryInputFormat} +import org.apache.spark.io.CompressionCodec import org.apache.spark.partial.{ApproximateEvaluator, PartialResult} import org.apache.spark.rdd._ import org.apache.spark.scheduler._ @@ -233,6 +234,14 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli None } } + private[spark] val eventLogCodec: Option[String] = { +val compress = conf.getBoolean(spark.eventLog.compress, false) +if (compress isEventLogEnabled) { + Some(CompressionCodec.getCodecName(conf)).map(CompressionCodec.getShortName) +} else { + None +} + } // Generate the random name for a temp folder in Tachyon // Add a timestamp as the suffix here to make it more safe
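Since the codec now lives in the file name, a reader only needs the suffix to pick a decompressor. The helper below is an assumed illustration of that idea (codecFromLogName is not a Spark API), using the example names from the commit message.

```scala
// Known short codec names that may appear as a file-name suffix (assumed list).
val knownCodecs = Set("lzf", "snappy")

// Hypothetical helper: recover the codec from an event log file name.
def codecFromLogName(fileName: String): Option[String] = {
  val dot = fileName.lastIndexOf('.')
  if (dot >= 0 && knownCodecs.contains(fileName.substring(dot + 1))) {
    Some(fileName.substring(dot + 1))   // e.g. "app_123.lzf" -> Some("lzf")
  } else {
    None                                // e.g. "app_without_compression"
  }
}
```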
spark git commit: [SPARK-6066] Make event log format easier to parse
Repository: spark Updated Branches: refs/heads/branch-1.3 866f2814a - 8100b79c2 [SPARK-6066] Make event log format easier to parse Some users have reported difficulty in parsing the new event log format. Since we embed the metadata in the beginning of the file, when we compress the event log we need to skip the metadata because we need that information to parse the log later. This means we'll end up with a partially compressed file if event logging compression is turned on. The old format looks like: ``` sparkVersion = 1.3.0 compressionCodec = org.apache.spark.io.LZFCompressionCodec === LOG_HEADER_END === // actual events, could be compressed bytes ``` The new format in this patch puts the compression codec in the log file name instead. It also removes the metadata header altogether along with the Spark version, which was not needed. The new file name looks something like: ``` app_without_compression app_123.lzf app_456.snappy ``` I tested this with and without compression, using different compression codecs and event logging directories. I verified that both the `Master` and the `HistoryServer` can render both compressed and uncompressed logs as before. Author: Andrew Or and...@databricks.com Closes #4821 from andrewor14/event-log-format and squashes the following commits: 8511141 [Andrew Or] Fix test 654883d [Andrew Or] Add back metadata with Spark version 7f537cd [Andrew Or] Address review feedback 7d6aa61 [Andrew Or] Make codec an extension 59abee9 [Andrew Or] Merge branch 'master' of github.com:apache/spark into event-log-format 27c9a6c [Andrew Or] Address review feedback 519e51a [Andrew Or] Address review feedback ef69276 [Andrew Or] Merge branch 'master' of github.com:apache/spark into event-log-format 88a091d [Andrew Or] Add tests for new format and file name f32d8d2 [Andrew Or] Fix tests 8db5a06 [Andrew Or] Embed metadata in the event log file name instead (cherry picked from commit 6776cb33ea691f7843b956b3e80979282967e826) Signed-off-by: Patrick Wendell patr...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8100b79c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8100b79c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8100b79c Branch: refs/heads/branch-1.3 Commit: 8100b79c292349aaeefe7ff9545afb9e526c2bff Parents: 866f281 Author: Andrew Or and...@databricks.com Authored: Mon Mar 2 16:34:32 2015 -0800 Committer: Patrick Wendell patr...@databricks.com Committed: Mon Mar 2 16:34:39 2015 -0800 -- .../scala/org/apache/spark/SparkContext.scala | 9 ++ .../spark/deploy/ApplicationDescription.scala | 10 +- .../deploy/history/FsHistoryProvider.scala | 22 +-- .../org/apache/spark/deploy/master/Master.scala | 8 +- .../org/apache/spark/io/CompressionCodec.scala | 21 ++- .../spark/scheduler/EventLoggingListener.scala | 162 +++ .../spark/scheduler/ReplayListenerBus.scala | 3 +- .../apache/spark/scheduler/SparkListener.scala | 5 + .../spark/scheduler/SparkListenerBus.scala | 1 + .../cluster/SparkDeploySchedulerBackend.scala | 2 +- .../org/apache/spark/util/JsonProtocol.scala| 14 ++ .../deploy/history/FsHistoryProviderSuite.scala | 69 +--- .../scheduler/EventLoggingListenerSuite.scala | 62 --- .../spark/scheduler/ReplayListenerSuite.scala | 13 +- 14 files changed, 212 insertions(+), 189 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8100b79c/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala 
b/core/src/main/scala/org/apache/spark/SparkContext.scala index d59b466..05c3210 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -51,6 +51,7 @@ import org.apache.spark.deploy.{LocalSparkCluster, SparkHadoopUtil} import org.apache.spark.executor.TriggerThreadDump import org.apache.spark.input.{StreamInputFormat, PortableDataStream, WholeTextFileInputFormat, FixedLengthBinaryInputFormat} +import org.apache.spark.io.CompressionCodec import org.apache.spark.partial.{ApproximateEvaluator, PartialResult} import org.apache.spark.rdd._ import org.apache.spark.scheduler._ @@ -233,6 +234,14 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli None } } + private[spark] val eventLogCodec: Option[String] = { +val compress = conf.getBoolean(spark.eventLog.compress, false) +if (compress isEventLogEnabled) { + Some(CompressionCodec.getCodecName(conf)).map(CompressionCodec.getShortName) +} else { + None +} + } // Generate the random name for a temp folder
spark git commit: [SPARK-6121][SQL][MLLIB] simpleString for UDT
Repository: spark Updated Branches: refs/heads/master e3a88d110 - 2db6a853a [SPARK-6121][SQL][MLLIB] simpleString for UDT `df.dtypes` shows `null` for UDTs. This PR uses `udt` by default and `VectorUDT` overwrites it with `vector`. jkbradley davies Author: Xiangrui Meng m...@databricks.com Closes #4858 from mengxr/SPARK-6121 and squashes the following commits: 34f0a77 [Xiangrui Meng] simpleString for UDT Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2db6a853 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2db6a853 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2db6a853 Branch: refs/heads/master Commit: 2db6a853a53b4c25e35983bc489510abb8a73e1d Parents: e3a88d1 Author: Xiangrui Meng m...@databricks.com Authored: Mon Mar 2 17:14:34 2015 -0800 Committer: Xiangrui Meng m...@databricks.com Committed: Mon Mar 2 17:14:34 2015 -0800 -- python/pyspark/mllib/linalg.py | 3 +++ python/pyspark/sql/types.py| 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2db6a853/python/pyspark/mllib/linalg.py -- diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py index 597012b..f5aad28 100644 --- a/python/pyspark/mllib/linalg.py +++ b/python/pyspark/mllib/linalg.py @@ -152,6 +152,9 @@ class VectorUDT(UserDefinedType): else: raise ValueError(do not recognize type %r % tpe) +def simpleString(self): +return vector + class Vector(object): http://git-wip-us.apache.org/repos/asf/spark/blob/2db6a853/python/pyspark/sql/types.py -- diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 31a861e..0169028 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -468,7 +468,7 @@ class UserDefinedType(DataType): raise NotImplementedError(UDT must implement deserialize().) def simpleString(self): -return 'null' +return 'udt' def json(self): return json.dumps(self.jsonValue(), separators=(',', ':'), sort_keys=True) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-6121][SQL][MLLIB] simpleString for UDT
Repository: spark Updated Branches: refs/heads/branch-1.3 ea69cf28e - 1b8ab5752 [SPARK-6121][SQL][MLLIB] simpleString for UDT `df.dtypes` shows `null` for UDTs. This PR uses `udt` by default and `VectorUDT` overwrites it with `vector`. jkbradley davies Author: Xiangrui Meng m...@databricks.com Closes #4858 from mengxr/SPARK-6121 and squashes the following commits: 34f0a77 [Xiangrui Meng] simpleString for UDT (cherry picked from commit 2db6a853a53b4c25e35983bc489510abb8a73e1d) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1b8ab575 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1b8ab575 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1b8ab575 Branch: refs/heads/branch-1.3 Commit: 1b8ab5752fccbc08c3f76c50bc384b89231d0a78 Parents: ea69cf2 Author: Xiangrui Meng m...@databricks.com Authored: Mon Mar 2 17:14:34 2015 -0800 Committer: Xiangrui Meng m...@databricks.com Committed: Mon Mar 2 17:14:43 2015 -0800 -- python/pyspark/mllib/linalg.py | 3 +++ python/pyspark/sql/types.py| 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1b8ab575/python/pyspark/mllib/linalg.py -- diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py index 597012b..f5aad28 100644 --- a/python/pyspark/mllib/linalg.py +++ b/python/pyspark/mllib/linalg.py @@ -152,6 +152,9 @@ class VectorUDT(UserDefinedType): else: raise ValueError(do not recognize type %r % tpe) +def simpleString(self): +return vector + class Vector(object): http://git-wip-us.apache.org/repos/asf/spark/blob/1b8ab575/python/pyspark/sql/types.py -- diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 31a861e..0169028 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -468,7 +468,7 @@ class UserDefinedType(DataType): raise NotImplementedError(UDT must implement deserialize().) def simpleString(self): -return 'null' +return 'udt' def json(self): return json.dumps(self.jsonValue(), separators=(',', ':'), sort_keys=True) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-5537] Add user guide for multinomial logistic regression
Repository: spark Updated Branches: refs/heads/branch-1.3 1b8ab5752 - 11389f026 [SPARK-5537] Add user guide for multinomial logistic regression This is based on #4801 from dbtsai. The linear method guide is re-organized a little bit for this change. Closes #4801 Author: Xiangrui Meng m...@databricks.com Author: DB Tsai dbt...@alpinenow.com Closes #4861 from mengxr/SPARK-5537 and squashes the following commits: 47af0ac [Xiangrui Meng] update user guide for multinomial logistic regression cdc2e15 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into AlpineNow-mlor-doc 096d0ca [DB Tsai] first commit (cherry picked from commit 9d6c5aeebd3c7f8ff6defe3bccd8ff12ed918293) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/11389f02 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/11389f02 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/11389f02 Branch: refs/heads/branch-1.3 Commit: 11389f0262b1a755effc8cbd247aa199c0d8fd9d Parents: 1b8ab57 Author: Xiangrui Meng m...@databricks.com Authored: Mon Mar 2 18:10:50 2015 -0800 Committer: Xiangrui Meng m...@databricks.com Committed: Mon Mar 2 18:10:56 2015 -0800 -- docs/mllib-linear-methods.md | 278 +- 1 file changed, 217 insertions(+), 61 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/11389f02/docs/mllib-linear-methods.md -- diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md index ffbd7ef..03f90d7 100644 --- a/docs/mllib-linear-methods.md +++ b/docs/mllib-linear-methods.md @@ -17,7 +17,7 @@ displayTitle: a href=mllib-guide.htmlMLlib/a - Linear Methods \newcommand{\av}{\mathbf{\alpha}} \newcommand{\bv}{\mathbf{b}} \newcommand{\N}{\mathbb{N}} -\newcommand{\id}{\mathbf{I}} +\newcommand{\id}{\mathbf{I}} \newcommand{\ind}{\mathbf{1}} \newcommand{\0}{\mathbf{0}} \newcommand{\unit}{\mathbf{e}} @@ -114,18 +114,26 @@ especially when the number of training examples is small. Under the hood, linear methods use convex optimization methods to optimize the objective functions. MLlib uses two methods, SGD and L-BFGS, described in the [optimization section](mllib-optimization.html). Currently, most algorithm APIs support Stochastic Gradient Descent (SGD), and a few support L-BFGS. Refer to [this optimization section](mllib-optimization.html#Choosing-an-Optimization-Method) for guidelines on choosing between optimization methods. -## Binary classification - -[Binary classification](http://en.wikipedia.org/wiki/Binary_classification) -aims to divide items into two categories: positive and negative. MLlib -supports two linear methods for binary classification: linear Support Vector -Machines (SVMs) and logistic regression. For both methods, MLlib supports -L1 and L2 regularized variants. The training data set is represented by an RDD -of [LabeledPoint](mllib-data-types.html) in MLlib. Note that, in the -mathematical formulation in this guide, a training label $y$ is denoted as -either $+1$ (positive) or $-1$ (negative), which is convenient for the -formulation. *However*, the negative label is represented by $0$ in MLlib -instead of $-1$, to be consistent with multiclass labeling. +## Classification + +[Classification](http://en.wikipedia.org/wiki/Statistical_classification) aims to divide items into +categories. 
+The most common classification type is +[binary classificaion](http://en.wikipedia.org/wiki/Binary_classification), where there are two +categories, usually named positive and negative. +If there are more than two categories, it is called +[multiclass classification](http://en.wikipedia.org/wiki/Multiclass_classification). +MLlib supports two linear methods for classification: linear Support Vector Machines (SVMs) +and logistic regression. +Linear SVMs supports only binary classification, while logistic regression supports both binary and +multiclass classification problems. +For both methods, MLlib supports L1 and L2 regularized variants. +The training data set is represented by an RDD of [LabeledPoint](mllib-data-types.html) in MLlib, +where labels are class indices starting from zero: $0, 1, 2, \ldots$. +Note that, in the mathematical formulation in this guide, a binary label $y$ is denoted as either +$+1$ (positive) or $-1$ (negative), which is convenient for the formulation. +*However*, the negative label is represented by $0$ in MLlib instead of $-1$, to be consistent with +multiclass labeling. ### Linear Support Vector Machines (SVMs) @@ -144,41 +152,7 @@ denoted by $\x$, the model makes predictions based on the value of $\wv^T \x$. By the default, if $\wv^T \x \geq 0$ then the outcome is
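As a pointer to the API the new guide section covers, here is a hedged sketch of multiclass logistic regression with L-BFGS in MLlib. The tiny inline dataset is invented, labels are zero-based class indices as the guide states, and `sc` is assumed to be an existing SparkContext.

```scala
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

// `sc` is an existing SparkContext (assumed); the data below is a toy
// three-class training set with labels 0.0, 1.0, 2.0.
val training = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(0.0, 1.1)),
  LabeledPoint(1.0, Vectors.dense(2.0, 1.0)),
  LabeledPoint(2.0, Vectors.dense(3.0, 0.5))))

val model = new LogisticRegressionWithLBFGS()
  .setNumClasses(3)
  .run(training)

val prediction = model.predict(Vectors.dense(2.1, 1.0))
```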
spark git commit: [SPARK-6127][Streaming][Docs] Add Kafka to Python api docs
Repository: spark Updated Branches: refs/heads/master 9d6c5aeeb - 9eb22ece1 [SPARK-6127][Streaming][Docs] Add Kafka to Python api docs davies Author: Tathagata Das tathagata.das1...@gmail.com Closes #4860 from tdas/SPARK-6127 and squashes the following commits: 82de92a [Tathagata Das] Add Kafka to Python api docs Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9eb22ece Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9eb22ece Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9eb22ece Branch: refs/heads/master Commit: 9eb22ece115c69899d100cecb8a5e20b3a268649 Parents: 9d6c5ae Author: Tathagata Das tathagata.das1...@gmail.com Authored: Mon Mar 2 18:40:46 2015 -0800 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Mar 2 18:40:46 2015 -0800 -- python/docs/pyspark.streaming.rst | 7 +++ 1 file changed, 7 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9eb22ece/python/docs/pyspark.streaming.rst -- diff --git a/python/docs/pyspark.streaming.rst b/python/docs/pyspark.streaming.rst index f081856..7890d9d 100644 --- a/python/docs/pyspark.streaming.rst +++ b/python/docs/pyspark.streaming.rst @@ -8,3 +8,10 @@ Module contents :members: :undoc-members: :show-inheritance: + +pyspark.streaming.kafka module + +.. automodule:: pyspark.streaming.kafka +:members: +:undoc-members: +:show-inheritance: - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-6114][SQL] Avoid metastore conversions before plan is resolved
Repository: spark Updated Branches: refs/heads/master 26c1c56de - 8223ce6a8 [SPARK-6114][SQL] Avoid metastore conversions before plan is resolved Author: Michael Armbrust mich...@databricks.com Closes #4855 from marmbrus/explodeBug and squashes the following commits: a712249 [Michael Armbrust] [SPARK-6114][SQL] Avoid metastore conversions before plan is resolved Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8223ce6a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8223ce6a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8223ce6a Branch: refs/heads/master Commit: 8223ce6a81e4cc9fdf816892365fcdff4006c35e Parents: 26c1c56 Author: Michael Armbrust mich...@databricks.com Authored: Mon Mar 2 16:10:54 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 16:10:54 2015 -0800 -- .../org/apache/spark/sql/hive/HiveMetastoreCatalog.scala | 4 .../apache/spark/sql/hive/execution/SQLQuerySuite.scala | 10 ++ 2 files changed, 14 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8223ce6a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index d3ad364..74b4e76 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -444,6 +444,10 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with */ object ParquetConversions extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = { + if (!plan.resolved) { +return plan + } + // Collects all `MetastoreRelation`s which should be replaced val toBeReplaced = plan.collect { // Write path http://git-wip-us.apache.org/repos/asf/spark/blob/8223ce6a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index f2bc73b..22ea19b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -31,6 +31,9 @@ case class Nested1(f1: Nested2) case class Nested2(f2: Nested3) case class Nested3(f3: Int) +case class NestedArray2(b: Seq[Int]) +case class NestedArray1(a: NestedArray2) + /** * A collection of hive query tests where we generate the answers ourselves instead of depending on * Hive to generate them (in contrast to HiveQuerySuite). Often this is because the query is @@ -38,6 +41,13 @@ case class Nested3(f3: Int) */ class SQLQuerySuite extends QueryTest { + test(explode nested Field) { + Seq(NestedArray1(NestedArray2(Seq(1,2,3.toDF.registerTempTable(nestedArray) +checkAnswer( + sql(SELECT ints FROM nestedArray LATERAL VIEW explode(a.b) a AS ints), + Row(1) :: Row(2) :: Row(3) :: Nil) + } + test(SPARK-4512 Fix attribute reference resolution error when using SORT BY) { checkAnswer( sql(SELECT * FROM (SELECT key + key AS a FROM src SORT BY value) t ORDER BY t.a), - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
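The one-line fix follows a common analyzer-rule pattern: bail out before the plan is resolved so a relation-rewriting rule cannot fire on attributes that are not bound yet (the LATERAL VIEW explode test above is exactly such a case). Below is a generic, simplified sketch of that guard with stand-in types, not catalyst's own classes.

```scala
// Simplified stand-ins for a logical plan and a rewriting rule.
trait Plan { def resolved: Boolean }

def applyConversion(plan: Plan)(rewrite: Plan => Plan): Plan =
  if (!plan.resolved) plan   // same early return the patch adds to ParquetConversions
  else rewrite(plan)
```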
spark git commit: [SPARK-6048] SparkConf should not translate deprecated configs on set
Repository: spark Updated Branches: refs/heads/master 6776cb33e - 258d154c9 [SPARK-6048] SparkConf should not translate deprecated configs on set There are multiple issues with translating on set outlined in the JIRA. This PR reverts the translation logic added to `SparkConf`. In the future, after the 1.3.0 release we will figure out a way to reorganize the internal structure more elegantly. For now, let's preserve the existing semantics of `SparkConf` since it's a public interface. Unfortunately this means duplicating some code for now, but this is all internal and we can always clean it up later. Author: Andrew Or and...@databricks.com Closes #4799 from andrewor14/conf-set-translate and squashes the following commits: 11c525b [Andrew Or] Move warning to driver 10e77b5 [Andrew Or] Add documentation for deprecation precedence a369cb1 [Andrew Or] Merge branch 'master' of github.com:apache/spark into conf-set-translate c26a9e3 [Andrew Or] Revert all translate logic in SparkConf fef6c9c [Andrew Or] Restore deprecation logic for spark.executor.userClassPathFirst 94b4dfa [Andrew Or] Translate on get, not set Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/258d154c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/258d154c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/258d154c Branch: refs/heads/master Commit: 258d154c9f1afdd52dce19f03d81683ee34effac Parents: 6776cb3 Author: Andrew Or and...@databricks.com Authored: Mon Mar 2 16:36:42 2015 -0800 Committer: Patrick Wendell patr...@databricks.com Committed: Mon Mar 2 16:36:42 2015 -0800 -- core/src/main/scala/org/apache/spark/SparkConf.scala | 15 +++ .../scala/org/apache/spark/executor/Executor.scala | 13 + .../test/scala/org/apache/spark/SparkConfSuite.scala | 12 docs/configuration.md| 4 +++- .../scala/org/apache/spark/deploy/yarn/Client.scala | 3 ++- 5 files changed, 25 insertions(+), 22 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/258d154c/core/src/main/scala/org/apache/spark/SparkConf.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 61b34d5..2ca19f5 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -68,7 +68,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { if (value == null) { throw new NullPointerException(null value for + key) } -settings.put(translateConfKey(key, warn = true), value) +settings.put(key, value) this } @@ -140,7 +140,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { /** Set a parameter if it isn't already configured */ def setIfMissing(key: String, value: String): SparkConf = { -settings.putIfAbsent(translateConfKey(key, warn = true), value) +settings.putIfAbsent(key, value) this } @@ -176,7 +176,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { /** Get a parameter as an Option */ def getOption(key: String): Option[String] = { -Option(settings.get(translateConfKey(key))) +Option(settings.get(key)) } /** Get all parameters as a list of pairs */ @@ -229,7 +229,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { def getAppId: String = get(spark.app.id) /** Does the configuration contain a given parameter? 
*/ - def contains(key: String): Boolean = settings.containsKey(translateConfKey(key)) + def contains(key: String): Boolean = settings.containsKey(key) /** Copy this object */ override def clone: SparkConf = { @@ -343,6 +343,13 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { } } } + +// Warn against the use of deprecated configs +deprecatedConfigs.values.foreach { dc => + if (contains(dc.oldName)) { +dc.warn() + } +} } /** http://git-wip-us.apache.org/repos/asf/spark/blob/258d154c/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index b684fb7..bed0a08 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -92,6 +92,12 @@ private[spark] class Executor( private val executorActor = env.actorSystem.actorOf( Props(new ExecutorActor(executorId)), "ExecutorActor") + //
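The public contract this revert preserves is that SparkConf.set stores exactly the key it is given, and contains/get look up that same key verbatim; deprecated names are only warned about later on the driver instead of being rewritten at set time. A small sketch of that behaviour (treating spark.files.userClassPathFirst as the example old-style key is an assumption for illustration):

import org.apache.spark.SparkConf

// set() keeps the key verbatim instead of translating it, so existing lookups keep working.
val conf = new SparkConf(loadDefaults = false)
conf.set("spark.files.userClassPathFirst", "true")

assert(conf.contains("spark.files.userClassPathFirst"))       // stored under the name that was set
assert(conf.get("spark.files.userClassPathFirst") == "true")
// Any deprecation warning is logged later, when the settings are validated on the driver.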
spark git commit: [SPARK-6048] SparkConf should not translate deprecated configs on set
Repository: spark Updated Branches: refs/heads/branch-1.3 8100b79c2 - ea69cf28e [SPARK-6048] SparkConf should not translate deprecated configs on set There are multiple issues with translating on set outlined in the JIRA. This PR reverts the translation logic added to `SparkConf`. In the future, after the 1.3.0 release we will figure out a way to reorganize the internal structure more elegantly. For now, let's preserve the existing semantics of `SparkConf` since it's a public interface. Unfortunately this means duplicating some code for now, but this is all internal and we can always clean it up later. Author: Andrew Or and...@databricks.com Closes #4799 from andrewor14/conf-set-translate and squashes the following commits: 11c525b [Andrew Or] Move warning to driver 10e77b5 [Andrew Or] Add documentation for deprecation precedence a369cb1 [Andrew Or] Merge branch 'master' of github.com:apache/spark into conf-set-translate c26a9e3 [Andrew Or] Revert all translate logic in SparkConf fef6c9c [Andrew Or] Restore deprecation logic for spark.executor.userClassPathFirst 94b4dfa [Andrew Or] Translate on get, not set (cherry picked from commit 258d154c9f1afdd52dce19f03d81683ee34effac) Signed-off-by: Patrick Wendell patr...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ea69cf28 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ea69cf28 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ea69cf28 Branch: refs/heads/branch-1.3 Commit: ea69cf28e6874d205fca70872a637547407bc08b Parents: 8100b79 Author: Andrew Or and...@databricks.com Authored: Mon Mar 2 16:36:42 2015 -0800 Committer: Patrick Wendell patr...@databricks.com Committed: Mon Mar 2 16:36:50 2015 -0800 -- core/src/main/scala/org/apache/spark/SparkConf.scala | 15 +++ .../scala/org/apache/spark/executor/Executor.scala | 13 + .../test/scala/org/apache/spark/SparkConfSuite.scala | 12 docs/configuration.md| 4 +++- .../scala/org/apache/spark/deploy/yarn/Client.scala | 3 ++- 5 files changed, 25 insertions(+), 22 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ea69cf28/core/src/main/scala/org/apache/spark/SparkConf.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 0dbd261..f3de467 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -68,7 +68,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { if (value == null) { throw new NullPointerException(null value for + key) } -settings.put(translateConfKey(key, warn = true), value) +settings.put(key, value) this } @@ -140,7 +140,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { /** Set a parameter if it isn't already configured */ def setIfMissing(key: String, value: String): SparkConf = { -settings.putIfAbsent(translateConfKey(key, warn = true), value) +settings.putIfAbsent(key, value) this } @@ -176,7 +176,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { /** Get a parameter as an Option */ def getOption(key: String): Option[String] = { -Option(settings.get(translateConfKey(key))) +Option(settings.get(key)) } /** Get all parameters as a list of pairs */ @@ -229,7 +229,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { def getAppId: String = get(spark.app.id) /** Does the configuration contain a given parameter? 
*/ - def contains(key: String): Boolean = settings.containsKey(translateConfKey(key)) + def contains(key: String): Boolean = settings.containsKey(key) /** Copy this object */ override def clone: SparkConf = { @@ -343,6 +343,13 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { } } } + +// Warn against the use of deprecated configs +deprecatedConfigs.values.foreach { dc => + if (contains(dc.oldName)) { +dc.warn() + } +} } /** http://git-wip-us.apache.org/repos/asf/spark/blob/ea69cf28/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index b684fb7..bed0a08 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -92,6 +92,12 @@ private[spark] class
spark git commit: [SPARK-6127][Streaming][Docs] Add Kafka to Python api docs
Repository: spark Updated Branches: refs/heads/branch-1.3 11389f026 - ffd059109 [SPARK-6127][Streaming][Docs] Add Kafka to Python api docs davies Author: Tathagata Das tathagata.das1...@gmail.com Closes #4860 from tdas/SPARK-6127 and squashes the following commits: 82de92a [Tathagata Das] Add Kafka to Python api docs (cherry picked from commit 9eb22ece115c69899d100cecb8a5e20b3a268649) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ffd05910 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ffd05910 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ffd05910 Branch: refs/heads/branch-1.3 Commit: ffd0591094a1a3edafed5c5f3ff1ca1b6048bf46 Parents: 11389f0 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Mon Mar 2 18:40:46 2015 -0800 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Mar 2 18:40:57 2015 -0800 -- python/docs/pyspark.streaming.rst | 7 +++ 1 file changed, 7 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ffd05910/python/docs/pyspark.streaming.rst -- diff --git a/python/docs/pyspark.streaming.rst b/python/docs/pyspark.streaming.rst index f081856..7890d9d 100644 --- a/python/docs/pyspark.streaming.rst +++ b/python/docs/pyspark.streaming.rst @@ -8,3 +8,10 @@ Module contents :members: :undoc-members: :show-inheritance: + +pyspark.streaming.kafka module + +.. automodule:: pyspark.streaming.kafka +:members: +:undoc-members: +:show-inheritance: - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-6082] [SQL] Provides better error message for malformed rows when caching tables
Repository: spark Updated Branches: refs/heads/master 8223ce6a8 - 1a49496b4 [SPARK-6082] [SQL] Provides better error message for malformed rows when caching tables Constructs like Hive `TRANSFORM` may generate malformed rows (via badly authored external scripts for example). I'm a bit hesitant to have this feature, since it introduces per-tuple cost when caching tables. However, considering caching tables is usually a one-time cost, this is probably worth having. !-- Reviewable:start -- [img src=https://reviewable.io/review_button.png; height=40 alt=Review on Reviewable/](https://reviewable.io/reviews/apache/spark/4842) !-- Reviewable:end -- Author: Cheng Lian l...@databricks.com Closes #4842 from liancheng/spark-6082 and squashes the following commits: b05dbff [Cheng Lian] Provides better error message for malformed rows when caching tables Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1a49496b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1a49496b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1a49496b Branch: refs/heads/master Commit: 1a49496b4a9df40c74739fc0fb8a21c88a477075 Parents: 8223ce6 Author: Cheng Lian l...@databricks.com Authored: Mon Mar 2 16:18:00 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 16:18:00 2015 -0800 -- .../spark/sql/columnar/InMemoryColumnarTableScan.scala | 11 +++ 1 file changed, 11 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1a49496b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala index 11d5943..8944a32 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala @@ -119,6 +119,17 @@ private[sql] case class InMemoryRelation( var rowCount = 0 while (rowIterator.hasNext rowCount batchSize) { val row = rowIterator.next() + +// Added for SPARK-6082. This assertion can be useful for scenarios when something +// like Hive TRANSFORM is used. The external data generation script used in TRANSFORM +// may result malformed rows, causing ArrayIndexOutOfBoundsException, which is somewhat +// hard to decipher. +assert( + row.size == columnBuilders.size, + sRow column number mismatch, expected ${output.size} columns, but got ${row.size}. + |Row content: $row + .stripMargin) + var i = 0 while (i row.length) { columnBuilders(i).appendFrom(row, i) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
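The scenario behind the new assertion is easiest to see from the user side: a Hive TRANSFORM query whose external script emits the wrong number of columns, cached as an in-memory table. A hypothetical repro sketch, assuming a HiveContext named hiveCtx, a Hive table src, and a misbehaving script bad_script.py that prints fewer fields than the declared output schema:

// All names here are illustrative; the point is only where the new assertion fires.
val malformed = hiveCtx.sql(
  "SELECT TRANSFORM (key, value) USING 'bad_script.py' AS (k, v) FROM src")
malformed.registerTempTable("transformed")

hiveCtx.cacheTable("transformed")
// Building the columnar cache used to fail with a bare ArrayIndexOutOfBoundsException;
// with this patch the assertion reports the expected vs. actual column count and the row.
hiveCtx.table("transformed").count()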
spark git commit: [SPARK-6082] [SQL] Provides better error message for malformed rows when caching tables
Repository: spark Updated Branches: refs/heads/branch-1.3 3899c7c2c - 866f2814a [SPARK-6082] [SQL] Provides better error message for malformed rows when caching tables Constructs like Hive `TRANSFORM` may generate malformed rows (via badly authored external scripts for example). I'm a bit hesitant to have this feature, since it introduces per-tuple cost when caching tables. However, considering caching tables is usually a one-time cost, this is probably worth having. !-- Reviewable:start -- [img src=https://reviewable.io/review_button.png; height=40 alt=Review on Reviewable/](https://reviewable.io/reviews/apache/spark/4842) !-- Reviewable:end -- Author: Cheng Lian l...@databricks.com Closes #4842 from liancheng/spark-6082 and squashes the following commits: b05dbff [Cheng Lian] Provides better error message for malformed rows when caching tables (cherry picked from commit 1a49496b4a9df40c74739fc0fb8a21c88a477075) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/866f2814 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/866f2814 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/866f2814 Branch: refs/heads/branch-1.3 Commit: 866f2814a48a34820da9069378c2cbbb3589fb0f Parents: 3899c7c Author: Cheng Lian l...@databricks.com Authored: Mon Mar 2 16:18:00 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 16:18:10 2015 -0800 -- .../spark/sql/columnar/InMemoryColumnarTableScan.scala | 11 +++ 1 file changed, 11 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/866f2814/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala index 11d5943..8944a32 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala @@ -119,6 +119,17 @@ private[sql] case class InMemoryRelation( var rowCount = 0 while (rowIterator.hasNext rowCount batchSize) { val row = rowIterator.next() + +// Added for SPARK-6082. This assertion can be useful for scenarios when something +// like Hive TRANSFORM is used. The external data generation script used in TRANSFORM +// may result malformed rows, causing ArrayIndexOutOfBoundsException, which is somewhat +// hard to decipher. +assert( + row.size == columnBuilders.size, + sRow column number mismatch, expected ${output.size} columns, but got ${row.size}. + |Row content: $row + .stripMargin) + var i = 0 while (i row.length) { columnBuilders(i).appendFrom(row, i) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-5950][SQL]Insert array into a metastore table saved as parquet should work when using datasource api
Repository: spark Updated Branches: refs/heads/master 9eb22ece1 - 12599942e [SPARK-5950][SQL]Insert array into a metastore table saved as parquet should work when using datasource api This PR contains the following changes: 1. Add a new method, `DataType.equalsIgnoreCompatibleNullability`, which is the middle ground between DataType's equality check and `DataType.equalsIgnoreNullability`. For two data types `from` and `to`, it does `equalsIgnoreNullability` as well as if the nullability of `from` is compatible with that of `to`. For example, the nullability of `ArrayType(IntegerType, containsNull = false)` is compatible with that of `ArrayType(IntegerType, containsNull = true)` (for an array without null values, we can always say it may contain null values). However, the nullability of `ArrayType(IntegerType, containsNull = true)` is incompatible with that of `ArrayType(IntegerType, containsNull = false)` (for an array that may have null values, we cannot say it does not have null values). 2. For the `resolved` field of `InsertIntoTable`, use `equalsIgnoreCompatibleNullability` to replace the equality check of the data types. 3. For our data source write path, when appending data, we always use the schema of existing table to write the data. This is important for parquet, since nullability direct impacts the way to encode/decode values. If we do not do this, we may see corrupted values when reading values from a set of parquet files generated with different nullability settings. 4. When generating a new parquet table, we always set nullable/containsNull/valueContainsNull to true. So, we will not face situations that we cannot append data because containsNull/valueContainsNull in an Array/Map column of the existing table has already been set to `false`. This change makes the whole data pipeline more robust. 5. Update the equality check of JSON relation. Since JSON does not really cares nullability, `equalsIgnoreNullability` seems a better choice to compare schemata from to JSON tables. JIRA: https://issues.apache.org/jira/browse/SPARK-5950 Thanks viirya for the initial work in #4729. cc marmbrus liancheng Author: Yin Huai yh...@databricks.com Closes #4826 from yhuai/insertNullabilityCheck and squashes the following commits: 3b61a04 [Yin Huai] Revert change on equals. 80e487e [Yin Huai] asNullable in UDT. 587d88b [Yin Huai] Make methods private. 0cb7ea2 [Yin Huai] marmbrus's comments. 3cec464 [Yin Huai] Cheng's comments. 486ed08 [Yin Huai] Merge remote-tracking branch 'upstream/master' into insertNullabilityCheck d3747d1 [Yin Huai] Remove unnecessary change. 8360817 [Yin Huai] Merge remote-tracking branch 'upstream/master' into insertNullabilityCheck 8a3f237 [Yin Huai] Use equalsIgnoreNullability instead of equality check. 0eb5578 [Yin Huai] Fix tests. f6ed813 [Yin Huai] Update old parquet path. e4f397c [Yin Huai] Unit tests. b2c06f8 [Yin Huai] Ignore nullability in JSON relation's equality check. 8bd008b [Yin Huai] nullable, containsNull, and valueContainsNull will be always true for parquet data. bf50d73 [Yin Huai] When appending data, we use the schema of the existing table instead of the schema of the new data. 0a703e7 [Yin Huai] Test failed again since we cannot read correct content. 9a26611 [Yin Huai] Make InsertIntoTable happy. 8f19fe5 [Yin Huai] equalsIgnoreCompatibleNullability 4ec17fd [Yin Huai] Failed test. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/12599942 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/12599942 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/12599942 Branch: refs/heads/master Commit: 12599942e69e4d73040f3a8611661a0862514ffc Parents: 9eb22ec Author: Yin Huai yh...@databricks.com Authored: Mon Mar 2 19:31:55 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 19:31:55 2015 -0800 -- .../org/apache/spark/mllib/linalg/Vectors.scala | 2 + .../apache/spark/mllib/util/modelSaveLoad.scala | 2 +- .../catalyst/plans/logical/basicOperators.scala | 3 +- .../org/apache/spark/sql/types/dataTypes.scala | 97 +++- .../apache/spark/sql/types/DataTypeSuite.scala | 83 + .../apache/spark/sql/json/JSONRelation.scala| 4 +- .../spark/sql/parquet/ParquetRelation.scala | 9 +- .../sql/parquet/ParquetTableOperations.scala| 5 +- .../apache/spark/sql/parquet/newParquet.scala | 10 +- .../org/apache/spark/sql/sources/commands.scala | 7 +- .../org/apache/spark/sql/sources/rules.scala| 2 +- .../apache/spark/sql/test/ExamplePointUDT.scala | 2 + .../apache/spark/sql/UserDefinedTypeSuite.scala | 2 + .../spark/sql/hive/HiveMetastoreCatalog.scala | 5 +- .../spark/sql/hive/execution/commands.scala | 33 +++ .../sql/hive/MetastoreDataSourcesSuite.scala| 71 +-
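The compatibility rule from point 1 is small enough to restate directly. The real check, DataType.equalsIgnoreCompatibleNullability, is internal to Spark SQL, so the sketch below only re-derives the ArrayType case to make the direction of the check concrete:

import org.apache.spark.sql.types._

// "from" may be written where "to" is expected only if "from" never allows nulls
// in a position where "to" forbids them.
def arrayNullabilityCompatible(from: ArrayType, to: ArrayType): Boolean =
  from.elementType == to.elementType && (!from.containsNull || to.containsNull)

val noNulls   = ArrayType(IntegerType, containsNull = false)
val mayBeNull = ArrayType(IntegerType, containsNull = true)

assert(arrayNullabilityCompatible(noNulls, mayBeNull))   // ok: widening to "may contain null"
assert(!arrayNullabilityCompatible(mayBeNull, noNulls))  // not ok: cannot promise "no nulls"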
spark git commit: [SPARK-5950][SQL]Insert array into a metastore table saved as parquet should work when using datasource api
Repository: spark Updated Branches: refs/heads/branch-1.3 ffd059109 - 1b490e91f [SPARK-5950][SQL]Insert array into a metastore table saved as parquet should work when using datasource api This PR contains the following changes: 1. Add a new method, `DataType.equalsIgnoreCompatibleNullability`, which is the middle ground between DataType's equality check and `DataType.equalsIgnoreNullability`. For two data types `from` and `to`, it does `equalsIgnoreNullability` as well as if the nullability of `from` is compatible with that of `to`. For example, the nullability of `ArrayType(IntegerType, containsNull = false)` is compatible with that of `ArrayType(IntegerType, containsNull = true)` (for an array without null values, we can always say it may contain null values). However, the nullability of `ArrayType(IntegerType, containsNull = true)` is incompatible with that of `ArrayType(IntegerType, containsNull = false)` (for an array that may have null values, we cannot say it does not have null values). 2. For the `resolved` field of `InsertIntoTable`, use `equalsIgnoreCompatibleNullability` to replace the equality check of the data types. 3. For our data source write path, when appending data, we always use the schema of existing table to write the data. This is important for parquet, since nullability direct impacts the way to encode/decode values. If we do not do this, we may see corrupted values when reading values from a set of parquet files generated with different nullability settings. 4. When generating a new parquet table, we always set nullable/containsNull/valueContainsNull to true. So, we will not face situations that we cannot append data because containsNull/valueContainsNull in an Array/Map column of the existing table has already been set to `false`. This change makes the whole data pipeline more robust. 5. Update the equality check of JSON relation. Since JSON does not really cares nullability, `equalsIgnoreNullability` seems a better choice to compare schemata from to JSON tables. JIRA: https://issues.apache.org/jira/browse/SPARK-5950 Thanks viirya for the initial work in #4729. cc marmbrus liancheng Author: Yin Huai yh...@databricks.com Closes #4826 from yhuai/insertNullabilityCheck and squashes the following commits: 3b61a04 [Yin Huai] Revert change on equals. 80e487e [Yin Huai] asNullable in UDT. 587d88b [Yin Huai] Make methods private. 0cb7ea2 [Yin Huai] marmbrus's comments. 3cec464 [Yin Huai] Cheng's comments. 486ed08 [Yin Huai] Merge remote-tracking branch 'upstream/master' into insertNullabilityCheck d3747d1 [Yin Huai] Remove unnecessary change. 8360817 [Yin Huai] Merge remote-tracking branch 'upstream/master' into insertNullabilityCheck 8a3f237 [Yin Huai] Use equalsIgnoreNullability instead of equality check. 0eb5578 [Yin Huai] Fix tests. f6ed813 [Yin Huai] Update old parquet path. e4f397c [Yin Huai] Unit tests. b2c06f8 [Yin Huai] Ignore nullability in JSON relation's equality check. 8bd008b [Yin Huai] nullable, containsNull, and valueContainsNull will be always true for parquet data. bf50d73 [Yin Huai] When appending data, we use the schema of the existing table instead of the schema of the new data. 0a703e7 [Yin Huai] Test failed again since we cannot read correct content. 9a26611 [Yin Huai] Make InsertIntoTable happy. 8f19fe5 [Yin Huai] equalsIgnoreCompatibleNullability 4ec17fd [Yin Huai] Failed test. 
(cherry picked from commit 12599942e69e4d73040f3a8611661a0862514ffc) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1b490e91 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1b490e91 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1b490e91 Branch: refs/heads/branch-1.3 Commit: 1b490e91fd6b5d06d9caeb50e597639ccfc0bc3b Parents: ffd0591 Author: Yin Huai yh...@databricks.com Authored: Mon Mar 2 19:31:55 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 19:32:08 2015 -0800 -- .../org/apache/spark/mllib/linalg/Vectors.scala | 2 + .../apache/spark/mllib/util/modelSaveLoad.scala | 2 +- .../catalyst/plans/logical/basicOperators.scala | 3 +- .../org/apache/spark/sql/types/dataTypes.scala | 97 +++- .../apache/spark/sql/types/DataTypeSuite.scala | 83 + .../apache/spark/sql/json/JSONRelation.scala| 4 +- .../spark/sql/parquet/ParquetRelation.scala | 9 +- .../sql/parquet/ParquetTableOperations.scala| 5 +- .../apache/spark/sql/parquet/newParquet.scala | 10 +- .../org/apache/spark/sql/sources/commands.scala | 7 +- .../org/apache/spark/sql/sources/rules.scala| 2 +- .../apache/spark/sql/test/ExamplePointUDT.scala | 2 + .../apache/spark/sql/UserDefinedTypeSuite.scala | 2 + .../spark/sql/hive/HiveMetastoreCatalog.scala | 5 +-
[2/2] spark git commit: [SPARK-5310][SQL] Fixes to Docs and Datasources API
[SPARK-5310][SQL] Fixes to Docs and Datasources API - Various Fixes to docs - Make data source traits actually interfaces Based on #4862 but with fixed conflicts. Author: Reynold Xin r...@databricks.com Author: Michael Armbrust mich...@databricks.com Closes #4868 from marmbrus/pr/4862 and squashes the following commits: fe091ea [Michael Armbrust] Merge remote-tracking branch 'origin/master' into pr/4862 0208497 [Reynold Xin] Test fixes. 34e0a28 [Reynold Xin] [SPARK-5310][SQL] Various fixes to Spark SQL docs. (cherry picked from commit 54d19689ff8d786acde5b8ada6741854ffadadea) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4e6e0086 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4e6e0086 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4e6e0086 Branch: refs/heads/branch-1.3 Commit: 4e6e0086c27e3f37eda6391c063b481896a69476 Parents: 1b490e9 Author: Reynold Xin r...@databricks.com Authored: Mon Mar 2 22:14:08 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 22:14:20 2015 -0800 -- project/SparkBuild.scala| 29 +- .../scala/org/apache/spark/sql/DataFrame.scala | 36 +- .../scala/org/apache/spark/sql/RDDApi.scala | 4 +- .../apache/spark/sql/jdbc/JDBCRelation.scala| 3 +- .../apache/spark/sql/json/JSONRelation.scala| 5 +- .../apache/spark/sql/parquet/newParquet.scala | 3 +- .../apache/spark/sql/sources/interfaces.scala | 43 +- .../apache/spark/sql/sources/DDLTestSuite.scala | 2 +- .../spark/sql/sources/FilteredScanSuite.scala | 3 +- .../spark/sql/sources/PrunedScanSuite.scala | 3 +- .../spark/sql/sources/TableScanSuite.scala | 11 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 3 +- .../hive/execution/CreateTableAsSelect.scala| 3 +- .../execution/DescribeHiveTableCommand.scala| 4 +- .../sql/hive/execution/HiveNativeCommand.scala | 6 +- .../sql/hive/execution/HiveTableScan.scala | 4 +- .../hive/execution/InsertIntoHiveTable.scala| 6 +- .../hive/execution/ScriptTransformation.scala | 15 +- .../spark/sql/hive/execution/commands.scala | 27 +- .../spark/sql/hive/execution/package.scala | 25 - .../spark/sql/hive/HiveParquetSuite.scala | 92 +++ .../apache/spark/sql/hive/parquetSuites.scala | 767 +++ .../spark/sql/parquet/HiveParquetSuite.scala| 91 --- .../spark/sql/parquet/parquetSuites.scala | 766 -- 24 files changed, 965 insertions(+), 986 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4e6e0086/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index e4b1b96..4f17df5 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -357,6 +357,21 @@ object Unidoc { names.map(s = org.apache.spark. 
+ s).mkString(:) } + private def ignoreUndocumentedPackages(packages: Seq[Seq[File]]): Seq[Seq[File]] = { +packages + .map(_.filterNot(_.getName.contains($))) + .map(_.filterNot(_.getCanonicalPath.contains(akka))) + .map(_.filterNot(_.getCanonicalPath.contains(deploy))) + .map(_.filterNot(_.getCanonicalPath.contains(network))) + .map(_.filterNot(_.getCanonicalPath.contains(shuffle))) + .map(_.filterNot(_.getCanonicalPath.contains(executor))) + .map(_.filterNot(_.getCanonicalPath.contains(python))) + .map(_.filterNot(_.getCanonicalPath.contains(collection))) + .map(_.filterNot(_.getCanonicalPath.contains(sql/catalyst))) + .map(_.filterNot(_.getCanonicalPath.contains(sql/execution))) + .map(_.filterNot(_.getCanonicalPath.contains(sql/hive/test))) + } + lazy val settings = scalaJavaUnidocSettings ++ Seq ( publish := {}, @@ -368,22 +383,12 @@ object Unidoc { // Skip actual catalyst, but include the subproject. // Catalyst is not public API and contains quasiquotes which break scaladoc. unidocAllSources in (ScalaUnidoc, unidoc) := { - (unidocAllSources in (ScalaUnidoc, unidoc)).value -.map(_.filterNot(_.getCanonicalPath.contains(sql/catalyst))) + ignoreUndocumentedPackages((unidocAllSources in (ScalaUnidoc, unidoc)).value) }, // Skip class names containing $ and some internal packages in Javadocs unidocAllSources in (JavaUnidoc, unidoc) := { - (unidocAllSources in (JavaUnidoc, unidoc)).value -.map(_.filterNot(_.getName.contains($))) -.map(_.filterNot(_.getCanonicalPath.contains(akka))) -.map(_.filterNot(_.getCanonicalPath.contains(deploy))) -.map(_.filterNot(_.getCanonicalPath.contains(network))) -
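Besides the doc-build cleanup, the commit firms up the public org.apache.spark.sql.sources interfaces. A minimal external data source written against them looks roughly like the sketch below; the class names and the "to" option are made up for illustration, and the shape follows the style of the TableScanSuite tests touched by this patch:

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, TableScan}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

// Relation producing the integers 1..to as a single non-nullable column "i".
case class SimpleNumbersRelation(to: Int)(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan {

  override def schema: StructType =
    StructType(StructField("i", IntegerType, nullable = false) :: Nil)

  override def buildScan(): RDD[Row] =
    sqlContext.sparkContext.parallelize(1 to to).map(Row(_))
}

// Entry point the data source resolver instantiates for this (hypothetical) format.
class DefaultSource extends RelationProvider {
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation =
    SimpleNumbersRelation(parameters("to").toInt)(sqlContext)
}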
spark git commit: [SPARK-6097][MLLIB] Support tree model save/load in PySpark/MLlib
Repository: spark Updated Branches: refs/heads/master 54d19689f - 7e53a79c3 [SPARK-6097][MLLIB] Support tree model save/load in PySpark/MLlib Similar to `MatrixFactorizaionModel`, we only need wrappers to support save/load for tree models in Python. jkbradley Author: Xiangrui Meng m...@databricks.com Closes #4854 from mengxr/SPARK-6097 and squashes the following commits: 4586a4d [Xiangrui Meng] fix more typos 8ebcac2 [Xiangrui Meng] fix python style 91172d8 [Xiangrui Meng] fix typos 201b3b9 [Xiangrui Meng] update user guide b5158e2 [Xiangrui Meng] support tree model save/load in PySpark/MLlib Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7e53a79c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7e53a79c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7e53a79c Branch: refs/heads/master Commit: 7e53a79c30511dbd0e5d9878a4b8b0f5bc94e68b Parents: 54d1968 Author: Xiangrui Meng m...@databricks.com Authored: Mon Mar 2 22:27:01 2015 -0800 Committer: Xiangrui Meng m...@databricks.com Committed: Mon Mar 2 22:27:01 2015 -0800 -- docs/mllib-decision-tree.md| 16 - docs/mllib-ensembles.md| 32 +++-- python/pyspark/mllib/recommendation.py | 9 +++ python/pyspark/mllib/tests.py | 27 - python/pyspark/mllib/tree.py | 21 python/pyspark/mllib/util.py | 37 + 6 files changed, 109 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7e53a79c/docs/mllib-decision-tree.md -- diff --git a/docs/mllib-decision-tree.md b/docs/mllib-decision-tree.md index 8e478ab..c1d0f8a 100644 --- a/docs/mllib-decision-tree.md +++ b/docs/mllib-decision-tree.md @@ -293,11 +293,9 @@ DecisionTreeModel sameModel = DecisionTreeModel.load(sc.sc(), myModelPath); div data-lang=python -Note that the Python API does not yet support model save/load but will in the future. - {% highlight python %} from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.tree import DecisionTree +from pyspark.mllib.tree import DecisionTree, DecisionTreeModel from pyspark.mllib.util import MLUtils # Load and parse the data file into an RDD of LabeledPoint. @@ -317,6 +315,10 @@ testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(tes print('Test Error = ' + str(testErr)) print('Learned classification tree model:') print(model.toDebugString()) + +# Save and load model +model.save(sc, myModelPath) +sameModel = DecisionTreeModel.load(sc, myModelPath) {% endhighlight %} /div @@ -440,11 +442,9 @@ DecisionTreeModel sameModel = DecisionTreeModel.load(sc.sc(), myModelPath); div data-lang=python -Note that the Python API does not yet support model save/load but will in the future. - {% highlight python %} from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.tree import DecisionTree +from pyspark.mllib.tree import DecisionTree, DecisionTreeModel from pyspark.mllib.util import MLUtils # Load and parse the data file into an RDD of LabeledPoint. 
@@ -464,6 +464,10 @@ testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / flo print('Test Mean Squared Error = ' + str(testMSE)) print('Learned regression tree model:') print(model.toDebugString()) + +# Save and load model +model.save(sc, myModelPath) +sameModel = DecisionTreeModel.load(sc, myModelPath) {% endhighlight %} /div http://git-wip-us.apache.org/repos/asf/spark/blob/7e53a79c/docs/mllib-ensembles.md -- diff --git a/docs/mllib-ensembles.md b/docs/mllib-ensembles.md index ec1ef38..cbfb682 100644 --- a/docs/mllib-ensembles.md +++ b/docs/mllib-ensembles.md @@ -202,10 +202,8 @@ RandomForestModel sameModel = RandomForestModel.load(sc.sc(), myModelPath); div data-lang=python -Note that the Python API does not yet support model save/load but will in the future. - {% highlight python %} -from pyspark.mllib.tree import RandomForest +from pyspark.mllib.tree import RandomForest, RandomForestModel from pyspark.mllib.util import MLUtils # Load and parse the data file into an RDD of LabeledPoint. @@ -228,6 +226,10 @@ testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(tes print('Test Error = ' + str(testErr)) print('Learned classification forest model:') print(model.toDebugString()) + +# Save and load model +model.save(sc, myModelPath) +sameModel = RandomForestModel.load(sc, myModelPath) {% endhighlight %} /div @@ -354,10 +356,8 @@ RandomForestModel sameModel = RandomForestModel.load(sc.sc(), myModelPath); div data-lang=python
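The Python wrappers added here delegate to the existing JVM implementations rather than re-implementing persistence, so the calls they mirror are the Scala ones. For reference, assuming a SparkContext sc and an already trained model, the JVM side looks roughly like:

import org.apache.spark.mllib.tree.model.DecisionTreeModel

// Writes model metadata and data under the given path; the path is illustrative.
model.save(sc, "myModelPath")
val sameModel = DecisionTreeModel.load(sc, "myModelPath")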
[2/2] spark git commit: [SPARK-5310][SQL] Fixes to Docs and Datasources API
[SPARK-5310][SQL] Fixes to Docs and Datasources API - Various Fixes to docs - Make data source traits actually interfaces Based on #4862 but with fixed conflicts. Author: Reynold Xin r...@databricks.com Author: Michael Armbrust mich...@databricks.com Closes #4868 from marmbrus/pr/4862 and squashes the following commits: fe091ea [Michael Armbrust] Merge remote-tracking branch 'origin/master' into pr/4862 0208497 [Reynold Xin] Test fixes. 34e0a28 [Reynold Xin] [SPARK-5310][SQL] Various fixes to Spark SQL docs. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/54d19689 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/54d19689 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/54d19689 Branch: refs/heads/master Commit: 54d19689ff8d786acde5b8ada6741854ffadadea Parents: 1259994 Author: Reynold Xin r...@databricks.com Authored: Mon Mar 2 22:14:08 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 22:14:08 2015 -0800 -- project/SparkBuild.scala| 29 +- .../scala/org/apache/spark/sql/DataFrame.scala | 36 +- .../scala/org/apache/spark/sql/RDDApi.scala | 4 +- .../apache/spark/sql/jdbc/JDBCRelation.scala| 3 +- .../apache/spark/sql/json/JSONRelation.scala| 5 +- .../apache/spark/sql/parquet/newParquet.scala | 3 +- .../apache/spark/sql/sources/interfaces.scala | 43 +- .../apache/spark/sql/sources/DDLTestSuite.scala | 2 +- .../spark/sql/sources/FilteredScanSuite.scala | 3 +- .../spark/sql/sources/PrunedScanSuite.scala | 3 +- .../spark/sql/sources/TableScanSuite.scala | 11 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 3 +- .../hive/execution/CreateTableAsSelect.scala| 3 +- .../execution/DescribeHiveTableCommand.scala| 4 +- .../sql/hive/execution/HiveNativeCommand.scala | 6 +- .../sql/hive/execution/HiveTableScan.scala | 4 +- .../hive/execution/InsertIntoHiveTable.scala| 6 +- .../hive/execution/ScriptTransformation.scala | 15 +- .../spark/sql/hive/execution/commands.scala | 27 +- .../spark/sql/hive/execution/package.scala | 25 - .../spark/sql/hive/HiveParquetSuite.scala | 92 +++ .../apache/spark/sql/hive/parquetSuites.scala | 767 +++ .../spark/sql/parquet/HiveParquetSuite.scala| 91 --- .../spark/sql/parquet/parquetSuites.scala | 766 -- 24 files changed, 965 insertions(+), 986 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/54d19689/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index e4b1b96..4f17df5 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -357,6 +357,21 @@ object Unidoc { names.map(s = org.apache.spark. 
+ s).mkString(:) } + private def ignoreUndocumentedPackages(packages: Seq[Seq[File]]): Seq[Seq[File]] = { +packages + .map(_.filterNot(_.getName.contains($))) + .map(_.filterNot(_.getCanonicalPath.contains(akka))) + .map(_.filterNot(_.getCanonicalPath.contains(deploy))) + .map(_.filterNot(_.getCanonicalPath.contains(network))) + .map(_.filterNot(_.getCanonicalPath.contains(shuffle))) + .map(_.filterNot(_.getCanonicalPath.contains(executor))) + .map(_.filterNot(_.getCanonicalPath.contains(python))) + .map(_.filterNot(_.getCanonicalPath.contains(collection))) + .map(_.filterNot(_.getCanonicalPath.contains(sql/catalyst))) + .map(_.filterNot(_.getCanonicalPath.contains(sql/execution))) + .map(_.filterNot(_.getCanonicalPath.contains(sql/hive/test))) + } + lazy val settings = scalaJavaUnidocSettings ++ Seq ( publish := {}, @@ -368,22 +383,12 @@ object Unidoc { // Skip actual catalyst, but include the subproject. // Catalyst is not public API and contains quasiquotes which break scaladoc. unidocAllSources in (ScalaUnidoc, unidoc) := { - (unidocAllSources in (ScalaUnidoc, unidoc)).value -.map(_.filterNot(_.getCanonicalPath.contains(sql/catalyst))) + ignoreUndocumentedPackages((unidocAllSources in (ScalaUnidoc, unidoc)).value) }, // Skip class names containing $ and some internal packages in Javadocs unidocAllSources in (JavaUnidoc, unidoc) := { - (unidocAllSources in (JavaUnidoc, unidoc)).value -.map(_.filterNot(_.getName.contains($))) -.map(_.filterNot(_.getCanonicalPath.contains(akka))) -.map(_.filterNot(_.getCanonicalPath.contains(deploy))) -.map(_.filterNot(_.getCanonicalPath.contains(network))) -.map(_.filterNot(_.getCanonicalPath.contains(shuffle))) -.map(_.filterNot(_.getCanonicalPath.contains(executor))) -
[1/2] spark git commit: [SPARK-5310][SQL] Fixes to Docs and Datasources API
Repository: spark Updated Branches: refs/heads/master 12599942e - 54d19689f http://git-wip-us.apache.org/repos/asf/spark/blob/54d19689/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala deleted file mode 100644 index 89b943f..000 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala +++ /dev/null @@ -1,766 +0,0 @@ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the License); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - *http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.parquet - -import java.io.File - -import org.scalatest.BeforeAndAfterAll - -import org.apache.spark.sql.{SQLConf, QueryTest} -import org.apache.spark.sql.catalyst.expressions.Row -import org.apache.spark.sql.execution.{ExecutedCommand, PhysicalRDD} -import org.apache.spark.sql.hive.execution.{InsertIntoHiveTable, HiveTableScan} -import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.sql.hive.test.TestHive.implicits._ -import org.apache.spark.sql.sources.{InsertIntoDataSource, LogicalRelation} -import org.apache.spark.sql.SaveMode -import org.apache.spark.sql.types._ - -// The data where the partitioning key exists only in the directory structure. -case class ParquetData(intField: Int, stringField: String) -// The data that also includes the partitioning key -case class ParquetDataWithKey(p: Int, intField: Int, stringField: String) - -case class StructContainer(intStructField :Int, stringStructField: String) - -case class ParquetDataWithComplexTypes( -intField: Int, -stringField: String, -structField: StructContainer, -arrayField: Seq[Int]) - -case class ParquetDataWithKeyAndComplexTypes( -p: Int, -intField: Int, -stringField: String, -structField: StructContainer, -arrayField: Seq[Int]) - -/** - * A suite to test the automatic conversion of metastore tables with parquet data to use the - * built in parquet support. 
- */ -class ParquetMetastoreSuiteBase extends ParquetPartitioningTest { - override def beforeAll(): Unit = { -super.beforeAll() - -sql(s - create external table partitioned_parquet - ( -intField INT, -stringField STRING - ) - PARTITIONED BY (p int) - ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - STORED AS - INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - location '${partitionedTableDir.getCanonicalPath}' -) - -sql(s - create external table partitioned_parquet_with_key - ( -intField INT, -stringField STRING - ) - PARTITIONED BY (p int) - ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - STORED AS - INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - location '${partitionedTableDirWithKey.getCanonicalPath}' -) - -sql(s - create external table normal_parquet - ( -intField INT, -stringField STRING - ) - ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - STORED AS - INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - location '${new File(normalTableDir, normal).getCanonicalPath}' -) - -sql(s - CREATE EXTERNAL TABLE partitioned_parquet_with_complextypes - ( -intField INT, -stringField STRING, -structField STRUCTintStructField: INT, stringStructField: STRING, -arrayField ARRAYINT - ) - PARTITIONED BY (p int) - ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - STORED AS - INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - OUTPUTFORMAT
spark git commit: [SPARK-5537][MLlib][Docs] Add user guide for multinomial logistic regression
Repository: spark Updated Branches: refs/heads/master c2fe3a6ff - b19605619 [SPARK-5537][MLlib][Docs] Add user guide for multinomial logistic regression Adding more description on top of #4861. Author: DB Tsai dbt...@alpinenow.com Closes #4866 from dbtsai/doc and squashes the following commits: 37e9d07 [DB Tsai] doc Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b1960561 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b1960561 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b1960561 Branch: refs/heads/master Commit: b196056190c569505cc32669d1aec30ed9d70665 Parents: c2fe3a6 Author: DB Tsai dbt...@alpinenow.com Authored: Mon Mar 2 22:37:12 2015 -0800 Committer: Xiangrui Meng m...@databricks.com Committed: Mon Mar 2 22:37:12 2015 -0800 -- docs/mllib-linear-methods.md | 10 ++ 1 file changed, 10 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b1960561/docs/mllib-linear-methods.md -- diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md index 03f90d7..9270741 100644 --- a/docs/mllib-linear-methods.md +++ b/docs/mllib-linear-methods.md @@ -784,9 +784,19 @@ regularization parameter (`regParam`) along with various parameters associated w gradient descent (`stepSize`, `numIterations`, `miniBatchFraction`). For each of them, we support all three possible regularizations (none, L1 or L2). +For Logistic Regression, [L-BFGS](api/scala/index.html#org.apache.spark.mllib.optimization.LBFGS) +version is implemented under [LogisticRegressionWithLBFGS] +(api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS), and this +version supports both binary and multinomial Logistic Regression while SGD version only supports +binary Logistic Regression. However, L-BFGS version doesn't support L1 regularization but SGD one +supports L1 regularization. When L1 regularization is not required, L-BFGS version is strongly +recommended since it converges faster and more accurately compared to SGD by approximating the +inverse Hessian matrix using quasi-Newton method. + Algorithms are all implemented in Scala: * [SVMWithSGD](api/scala/index.html#org.apache.spark.mllib.classification.SVMWithSGD) +* [LogisticRegressionWithLBFGS](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS) * [LogisticRegressionWithSGD](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithSGD) * [LinearRegressionWithSGD](api/scala/index.html#org.apache.spark.mllib.regression.LinearRegressionWithSGD) * [RidgeRegressionWithSGD](api/scala/index.html#org.apache.spark.mllib.regression.RidgeRegressionWithSGD) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
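The guidance added above reduces to a simple choice: prefer LogisticRegressionWithLBFGS unless L1 regularization is required, and use its multiclass mode when there are more than two labels. A short sketch of the multinomial case, assuming a SparkContext sc and a LIBSVM-formatted file with three label classes at the path shown (both are assumptions for illustration):

import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.util.MLUtils

val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt")

val model = new LogisticRegressionWithLBFGS()
  .setNumClasses(3)   // more than 2 classes trains a multinomial model; the SGD version is binary only
  .run(data)

val predictionAndLabels = data.map(p => (model.predict(p.features), p.label))
val accuracy = predictionAndLabels.filter(x => x._1 == x._2).count().toDouble / data.count()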
spark git commit: [SPARK-6097][MLLIB] Support tree model save/load in PySpark/MLlib
Repository: spark Updated Branches: refs/heads/branch-1.3 4e6e0086c - 62c53be2a [SPARK-6097][MLLIB] Support tree model save/load in PySpark/MLlib Similar to `MatrixFactorizaionModel`, we only need wrappers to support save/load for tree models in Python. jkbradley Author: Xiangrui Meng m...@databricks.com Closes #4854 from mengxr/SPARK-6097 and squashes the following commits: 4586a4d [Xiangrui Meng] fix more typos 8ebcac2 [Xiangrui Meng] fix python style 91172d8 [Xiangrui Meng] fix typos 201b3b9 [Xiangrui Meng] update user guide b5158e2 [Xiangrui Meng] support tree model save/load in PySpark/MLlib (cherry picked from commit 7e53a79c30511dbd0e5d9878a4b8b0f5bc94e68b) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/62c53be2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/62c53be2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/62c53be2 Branch: refs/heads/branch-1.3 Commit: 62c53be2a9ef805545c314ffbbfafdcf2fced9f2 Parents: 4e6e008 Author: Xiangrui Meng m...@databricks.com Authored: Mon Mar 2 22:27:01 2015 -0800 Committer: Xiangrui Meng m...@databricks.com Committed: Mon Mar 2 22:27:23 2015 -0800 -- docs/mllib-decision-tree.md| 16 - docs/mllib-ensembles.md| 32 +++-- python/pyspark/mllib/recommendation.py | 9 +++ python/pyspark/mllib/tests.py | 27 - python/pyspark/mllib/tree.py | 21 python/pyspark/mllib/util.py | 37 + 6 files changed, 109 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/62c53be2/docs/mllib-decision-tree.md -- diff --git a/docs/mllib-decision-tree.md b/docs/mllib-decision-tree.md index 8e478ab..c1d0f8a 100644 --- a/docs/mllib-decision-tree.md +++ b/docs/mllib-decision-tree.md @@ -293,11 +293,9 @@ DecisionTreeModel sameModel = DecisionTreeModel.load(sc.sc(), myModelPath); div data-lang=python -Note that the Python API does not yet support model save/load but will in the future. - {% highlight python %} from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.tree import DecisionTree +from pyspark.mllib.tree import DecisionTree, DecisionTreeModel from pyspark.mllib.util import MLUtils # Load and parse the data file into an RDD of LabeledPoint. @@ -317,6 +315,10 @@ testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(tes print('Test Error = ' + str(testErr)) print('Learned classification tree model:') print(model.toDebugString()) + +# Save and load model +model.save(sc, myModelPath) +sameModel = DecisionTreeModel.load(sc, myModelPath) {% endhighlight %} /div @@ -440,11 +442,9 @@ DecisionTreeModel sameModel = DecisionTreeModel.load(sc.sc(), myModelPath); div data-lang=python -Note that the Python API does not yet support model save/load but will in the future. - {% highlight python %} from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.tree import DecisionTree +from pyspark.mllib.tree import DecisionTree, DecisionTreeModel from pyspark.mllib.util import MLUtils # Load and parse the data file into an RDD of LabeledPoint. 
@@ -464,6 +464,10 @@ testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / flo print('Test Mean Squared Error = ' + str(testMSE)) print('Learned regression tree model:') print(model.toDebugString()) + +# Save and load model +model.save(sc, myModelPath) +sameModel = DecisionTreeModel.load(sc, myModelPath) {% endhighlight %} /div http://git-wip-us.apache.org/repos/asf/spark/blob/62c53be2/docs/mllib-ensembles.md -- diff --git a/docs/mllib-ensembles.md b/docs/mllib-ensembles.md index d66b419..84f3b3b 100644 --- a/docs/mllib-ensembles.md +++ b/docs/mllib-ensembles.md @@ -202,10 +202,8 @@ RandomForestModel sameModel = RandomForestModel.load(sc.sc(), myModelPath); div data-lang=python -Note that the Python API does not yet support model save/load but will in the future. - {% highlight python %} -from pyspark.mllib.tree import RandomForest +from pyspark.mllib.tree import RandomForest, RandomForestModel from pyspark.mllib.util import MLUtils # Load and parse the data file into an RDD of LabeledPoint. @@ -228,6 +226,10 @@ testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(tes print('Test Error = ' + str(testErr)) print('Learned classification forest model:') print(model.toDebugString()) + +# Save and load model +model.save(sc, myModelPath) +sameModel = RandomForestModel.load(sc, myModelPath) {% endhighlight %} /div
spark git commit: [SPARK-6073][SQL] Need to refresh metastore cache after append data in CreateMetastoreDataSourceAsSelect
Repository: spark Updated Branches: refs/heads/branch-1.3 1fe677a36 - c59871cca [SPARK-6073][SQL] Need to refresh metastore cache after append data in CreateMetastoreDataSourceAsSelect JIRA: https://issues.apache.org/jira/browse/SPARK-6073 liancheng Author: Yin Huai yh...@databricks.com Closes #4824 from yhuai/refreshCache and squashes the following commits: b9542ef [Yin Huai] Refresh metadata cache in the Catalog in CreateMetastoreDataSourceAsSelect. (cherry picked from commit 39a54b40aff66816f8b8f5c6133eaaad6eaecae1) Signed-off-by: Cheng Lian l...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c59871cc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c59871cc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c59871cc Branch: refs/heads/branch-1.3 Commit: c59871cca5695402e13a89344bd12f5952704019 Parents: 1fe677a Author: Yin Huai yh...@databricks.com Authored: Mon Mar 2 22:42:18 2015 +0800 Committer: Cheng Lian l...@databricks.com Committed: Mon Mar 2 22:44:05 2015 +0800 -- .../spark/sql/hive/execution/commands.scala | 2 + .../sql/hive/MetastoreDataSourcesSuite.scala| 52 2 files changed, 54 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c59871cc/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index 9934a5d..ffaef8e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -248,6 +248,8 @@ case class CreateMetastoreDataSourceAsSelect( isExternal) } +// Refresh the cache of the table in the catalog. 
+hiveContext.refreshTable(tableName) Seq.empty[Row] } } http://git-wip-us.apache.org/repos/asf/spark/blob/c59871cc/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 00306f1..868c35f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -612,4 +612,56 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { val actualSchema = table(wide_schema).schema assert(schema === actualSchema) } + + test(insert into a table) { +def createDF(from: Int, to: Int): DataFrame = + createDataFrame((from to to).map(i = Tuple2(i, sstr$i))).toDF(c1, c2) + +createDF(0, 9).saveAsTable(insertParquet, parquet) +checkAnswer( + sql(SELECT p.c1, p.c2 FROM insertParquet p WHERE p.c1 5), + (6 to 9).map(i = Row(i, sstr$i))) + +intercept[AnalysisException] { + createDF(10, 19).saveAsTable(insertParquet, parquet) +} + +createDF(10, 19).saveAsTable(insertParquet, parquet, SaveMode.Append) +checkAnswer( + sql(SELECT p.c1, p.c2 FROM insertParquet p WHERE p.c1 5), + (6 to 19).map(i = Row(i, sstr$i))) + +createDF(20, 29).saveAsTable(insertParquet, parquet, SaveMode.Append) +checkAnswer( + sql(SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 5 AND p.c1 25), + (6 to 24).map(i = Row(i, sstr$i))) + +intercept[AnalysisException] { + createDF(30, 39).saveAsTable(insertParquet) +} + +createDF(30, 39).saveAsTable(insertParquet, SaveMode.Append) +checkAnswer( + sql(SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 5 AND p.c1 35), + (6 to 34).map(i = Row(i, sstr$i))) + +createDF(40, 49).insertInto(insertParquet) +checkAnswer( + sql(SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 5 AND p.c1 45), + (6 to 44).map(i = Row(i, sstr$i))) + +createDF(50, 59).saveAsTable(insertParquet, SaveMode.Overwrite) +checkAnswer( + sql(SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 51 AND p.c1 55), + (52 to 54).map(i = Row(i, sstr$i))) +createDF(60, 69).saveAsTable(insertParquet, SaveMode.Ignore) +checkAnswer( + sql(SELECT p.c1, c2 FROM insertParquet p), + (50 to 59).map(i = Row(i, sstr$i))) + +createDF(70, 79).insertInto(insertParquet, overwrite = true) +checkAnswer( + sql(SELECT p.c1, c2 FROM insertParquet p), + (70 to 79).map(i = Row(i, sstr$i))) + } } - To
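The case the new refreshTable call covers is an append through saveAsTable: the catalog may already hold cached metadata (including the file listing) for the target table, so without a refresh the next read can go through stale state. A sketch of the user-facing flow, assuming a HiveContext named hiveCtx and two DataFrames df1 and df2 with the same schema (all names illustrative):

import org.apache.spark.sql.SaveMode

df1.saveAsTable("insertParquet", "parquet", SaveMode.ErrorIfExists)
df2.saveAsTable("insertParquet", "parquet", SaveMode.Append)

// Before this fix the read below could hit the stale cached relation for the table;
// CreateMetastoreDataSourceAsSelect now calls refreshTable("insertParquet") itself
// once the append finishes, so the query sees both batches of rows.
hiveCtx.table("insertParquet").count()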
spark git commit: [SPARK-6073][SQL] Need to refresh metastore cache after append data in CreateMetastoreDataSourceAsSelect
Repository: spark Updated Branches: refs/heads/master 49c7a8f6f - 39a54b40a [SPARK-6073][SQL] Need to refresh metastore cache after append data in CreateMetastoreDataSourceAsSelect JIRA: https://issues.apache.org/jira/browse/SPARK-6073 liancheng Author: Yin Huai yh...@databricks.com Closes #4824 from yhuai/refreshCache and squashes the following commits: b9542ef [Yin Huai] Refresh metadata cache in the Catalog in CreateMetastoreDataSourceAsSelect. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/39a54b40 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/39a54b40 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/39a54b40 Branch: refs/heads/master Commit: 39a54b40aff66816f8b8f5c6133eaaad6eaecae1 Parents: 49c7a8f Author: Yin Huai yh...@databricks.com Authored: Mon Mar 2 22:42:18 2015 +0800 Committer: Cheng Lian l...@databricks.com Committed: Mon Mar 2 22:42:18 2015 +0800 -- .../spark/sql/hive/execution/commands.scala | 2 + .../sql/hive/MetastoreDataSourcesSuite.scala| 52 2 files changed, 54 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/39a54b40/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index 9934a5d..ffaef8e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -248,6 +248,8 @@ case class CreateMetastoreDataSourceAsSelect( isExternal) } +// Refresh the cache of the table in the catalog. +hiveContext.refreshTable(tableName) Seq.empty[Row] } } http://git-wip-us.apache.org/repos/asf/spark/blob/39a54b40/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 00306f1..868c35f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -612,4 +612,56 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { val actualSchema = table(wide_schema).schema assert(schema === actualSchema) } + + test(insert into a table) { +def createDF(from: Int, to: Int): DataFrame = + createDataFrame((from to to).map(i = Tuple2(i, sstr$i))).toDF(c1, c2) + +createDF(0, 9).saveAsTable(insertParquet, parquet) +checkAnswer( + sql(SELECT p.c1, p.c2 FROM insertParquet p WHERE p.c1 5), + (6 to 9).map(i = Row(i, sstr$i))) + +intercept[AnalysisException] { + createDF(10, 19).saveAsTable(insertParquet, parquet) +} + +createDF(10, 19).saveAsTable(insertParquet, parquet, SaveMode.Append) +checkAnswer( + sql(SELECT p.c1, p.c2 FROM insertParquet p WHERE p.c1 5), + (6 to 19).map(i = Row(i, sstr$i))) + +createDF(20, 29).saveAsTable(insertParquet, parquet, SaveMode.Append) +checkAnswer( + sql(SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 5 AND p.c1 25), + (6 to 24).map(i = Row(i, sstr$i))) + +intercept[AnalysisException] { + createDF(30, 39).saveAsTable(insertParquet) +} + +createDF(30, 39).saveAsTable(insertParquet, SaveMode.Append) +checkAnswer( + sql(SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 5 AND p.c1 35), + (6 to 34).map(i = Row(i, sstr$i))) + 
+    createDF(40, 49).insertInto("insertParquet")
+    checkAnswer(
+      sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 45"),
+      (6 to 44).map(i => Row(i, s"str$i")))
+
+    createDF(50, 59).saveAsTable("insertParquet", SaveMode.Overwrite)
+    checkAnswer(
+      sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 51 AND p.c1 < 55"),
+      (52 to 54).map(i => Row(i, s"str$i")))
+    createDF(60, 69).saveAsTable("insertParquet", SaveMode.Ignore)
+    checkAnswer(
+      sql("SELECT p.c1, c2 FROM insertParquet p"),
+      (50 to 59).map(i => Row(i, s"str$i")))
+
+    createDF(70, 79).insertInto("insertParquet", overwrite = true)
+    checkAnswer(
+      sql("SELECT p.c1, c2 FROM insertParquet p"),
+      (70 to 79).map(i => Row(i, s"str$i")))
+  }
 }

- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
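For context, a minimal sketch of the behaviour this commit guards (assuming Spark 1.3-era APIs, an existing SparkContext `sc`, and invented table/column names — this is not the patch itself): after appending to a metastore-backed data source table, the catalog's cached relation can still describe the pre-append file set, which is why the command now refreshes it.

```
// Hedged sketch, not the commit's code: append, then refresh the catalog cache.
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.hive.HiveContext

val hiveContext = new HiveContext(sc)
import hiveContext.implicits._

val df = sc.parallelize(1 to 10).map(i => (i, s"str$i")).toDF("c1", "c2")
df.saveAsTable("events", "parquet", SaveMode.Append)

// CreateMetastoreDataSourceAsSelect now performs the equivalent of this call itself,
// so follow-up queries see the newly appended files instead of stale cached metadata.
hiveContext.refreshTable("events")
hiveContext.sql("SELECT COUNT(*) FROM events").show()
```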
spark git commit: [Streaming][Minor]Fix some error docs in streaming examples
Repository: spark Updated Branches: refs/heads/branch-1.3 6a2fc85e0 - 1fe677a36 [Streaming][Minor]Fix some error docs in streaming examples Small changes, please help to review, thanks a lot. Author: Saisai Shao saisai.s...@intel.com Closes #4837 from jerryshao/doc-fix and squashes the following commits: 545291a [Saisai Shao] Fix some error docs in streaming examples (cherry picked from commit d8fb40edea7c8c811814f1ff288d59178928964b) Signed-off-by: Sean Owen so...@cloudera.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1fe677a3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1fe677a3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1fe677a3 Branch: refs/heads/branch-1.3 Commit: 1fe677a36d96dfa50b7fb5520b102452a6e97091 Parents: 6a2fc85 Author: Saisai Shao saisai.s...@intel.com Authored: Mon Mar 2 08:49:19 2015 + Committer: Sean Owen so...@cloudera.com Committed: Mon Mar 2 08:49:28 2015 + -- .../apache/spark/examples/streaming/DirectKafkaWordCount.scala| 3 ++- examples/src/main/python/streaming/kafka_wordcount.py | 2 +- python/pyspark/streaming/kafka.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1fe677a3/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala -- diff --git a/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala b/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala index deb08fd..1c8a20b 100644 --- a/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala +++ b/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala @@ -30,7 +30,8 @@ import org.apache.spark.SparkConf * topics is a list of one or more kafka topics to consume from * * Example: - *$ bin/run-example streaming.KafkaWordCount broker1-host:port,broker2-host:port topic1,topic2 + *$ bin/run-example streaming.DirectKafkaWordCount broker1-host:port,broker2-host:port \ + *topic1,topic2 */ object DirectKafkaWordCount { def main(args: Array[String]) { http://git-wip-us.apache.org/repos/asf/spark/blob/1fe677a3/examples/src/main/python/streaming/kafka_wordcount.py -- diff --git a/examples/src/main/python/streaming/kafka_wordcount.py b/examples/src/main/python/streaming/kafka_wordcount.py index ed398a8..f82f161 100644 --- a/examples/src/main/python/streaming/kafka_wordcount.py +++ b/examples/src/main/python/streaming/kafka_wordcount.py @@ -23,7 +23,7 @@ http://kafka.apache.org/documentation.html#quickstart and then run the example -`$ bin/spark-submit --driver-class-path external/kafka-assembly/target/scala-*/\ +`$ bin/spark-submit --jars external/kafka-assembly/target/scala-*/\ spark-streaming-kafka-assembly-*.jar examples/src/main/python/streaming/kafka_wordcount.py \ localhost:2181 test` http://git-wip-us.apache.org/repos/asf/spark/blob/1fe677a3/python/pyspark/streaming/kafka.py -- diff --git a/python/pyspark/streaming/kafka.py b/python/pyspark/streaming/kafka.py index 0002dc1..f083ed1 100644 --- a/python/pyspark/streaming/kafka.py +++ b/python/pyspark/streaming/kafka.py @@ -82,7 +82,7 @@ 2. Download the JAR of the artifact from Maven Central http://search.maven.org/, Group Id = org.apache.spark, Artifact Id = spark-streaming-kafka-assembly, Version = %s. 
- Then, innclude the jar in the spark-submit command as + Then, include the jar in the spark-submit command as $ bin/spark-submit --jars spark-streaming-kafka-assembly.jar ... - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [Streaming][Minor]Fix some error docs in streaming examples
Repository: spark Updated Branches: refs/heads/master 3f00bb3ef - d8fb40ede [Streaming][Minor]Fix some error docs in streaming examples Small changes, please help to review, thanks a lot. Author: Saisai Shao saisai.s...@intel.com Closes #4837 from jerryshao/doc-fix and squashes the following commits: 545291a [Saisai Shao] Fix some error docs in streaming examples Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d8fb40ed Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d8fb40ed Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d8fb40ed Branch: refs/heads/master Commit: d8fb40edea7c8c811814f1ff288d59178928964b Parents: 3f00bb3 Author: Saisai Shao saisai.s...@intel.com Authored: Mon Mar 2 08:49:19 2015 + Committer: Sean Owen so...@cloudera.com Committed: Mon Mar 2 08:49:19 2015 + -- .../apache/spark/examples/streaming/DirectKafkaWordCount.scala| 3 ++- examples/src/main/python/streaming/kafka_wordcount.py | 2 +- python/pyspark/streaming/kafka.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d8fb40ed/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala -- diff --git a/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala b/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala index deb08fd..1c8a20b 100644 --- a/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala +++ b/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala @@ -30,7 +30,8 @@ import org.apache.spark.SparkConf * topics is a list of one or more kafka topics to consume from * * Example: - *$ bin/run-example streaming.KafkaWordCount broker1-host:port,broker2-host:port topic1,topic2 + *$ bin/run-example streaming.DirectKafkaWordCount broker1-host:port,broker2-host:port \ + *topic1,topic2 */ object DirectKafkaWordCount { def main(args: Array[String]) { http://git-wip-us.apache.org/repos/asf/spark/blob/d8fb40ed/examples/src/main/python/streaming/kafka_wordcount.py -- diff --git a/examples/src/main/python/streaming/kafka_wordcount.py b/examples/src/main/python/streaming/kafka_wordcount.py index ed398a8..f82f161 100644 --- a/examples/src/main/python/streaming/kafka_wordcount.py +++ b/examples/src/main/python/streaming/kafka_wordcount.py @@ -23,7 +23,7 @@ http://kafka.apache.org/documentation.html#quickstart and then run the example -`$ bin/spark-submit --driver-class-path external/kafka-assembly/target/scala-*/\ +`$ bin/spark-submit --jars external/kafka-assembly/target/scala-*/\ spark-streaming-kafka-assembly-*.jar examples/src/main/python/streaming/kafka_wordcount.py \ localhost:2181 test` http://git-wip-us.apache.org/repos/asf/spark/blob/d8fb40ed/python/pyspark/streaming/kafka.py -- diff --git a/python/pyspark/streaming/kafka.py b/python/pyspark/streaming/kafka.py index 0002dc1..f083ed1 100644 --- a/python/pyspark/streaming/kafka.py +++ b/python/pyspark/streaming/kafka.py @@ -82,7 +82,7 @@ 2. Download the JAR of the artifact from Maven Central http://search.maven.org/, Group Id = org.apache.spark, Artifact Id = spark-streaming-kafka-assembly, Version = %s. - Then, innclude the jar in the spark-submit command as + Then, include the jar in the spark-submit command as $ bin/spark-submit --jars spark-streaming-kafka-assembly.jar ... 
- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
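Both messages above only correct usage strings; as a hedged sketch, this is roughly the direct-stream program those strings invoke (broker and topic arguments are placeholders, and the code mirrors the example's intent rather than quoting it).

```
// Sketch of a DirectKafkaWordCount-style job against the Spark 1.3 streaming-kafka API.
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

object DirectKafkaWordCountSketch {
  def main(args: Array[String]): Unit = {
    // e.g. args = Array("broker1-host:port,broker2-host:port", "topic1,topic2")
    val Array(brokers, topics) = args
    val ssc = new StreamingContext(new SparkConf().setAppName("DirectKafkaWordCount"), Seconds(2))

    val kafkaParams = Map("metadata.broker.list" -> brokers)
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topics.split(",").toSet)

    messages.map(_._2)                 // message values
      .flatMap(_.split(" "))
      .map(word => (word, 1L))
      .reduceByKey(_ + _)
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
```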
spark git commit: SPARK-3357 [CORE] Internal log messages should be set at DEBUG level instead of INFO
Repository: spark Updated Branches: refs/heads/master d8fb40ede - 948c2390a SPARK-3357 [CORE] Internal log messages should be set at DEBUG level instead of INFO Demote some 'noisy' log messages to debug level. I added a few more, to include everything that gets logged in stanzas like this: ``` 15/03/01 00:03:54 INFO BlockManager: Removing broadcast 0 15/03/01 00:03:54 INFO BlockManager: Removing block broadcast_0_piece0 15/03/01 00:03:54 INFO MemoryStore: Block broadcast_0_piece0 of size 839 dropped from memory (free 277976091) 15/03/01 00:03:54 INFO BlockManagerInfo: Removed broadcast_0_piece0 on localhost:49524 in memory (size: 839.0 B, free: 265.1 MB) 15/03/01 00:03:54 INFO BlockManagerMaster: Updated info of block broadcast_0_piece0 15/03/01 00:03:54 INFO BlockManager: Removing block broadcast_0 15/03/01 00:03:54 INFO MemoryStore: Block broadcast_0 of size 1088 dropped from memory (free 277977179) 15/03/01 00:03:54 INFO ContextCleaner: Cleaned broadcast 0 ``` as well as regular messages like ``` 15/03/01 00:02:33 INFO MemoryStore: ensureFreeSpace(2640) called with curMem=47322, maxMem=278019440 ``` WDYT? good or should some be left alone? CC mengxr who suggested some of this. Author: Sean Owen so...@cloudera.com Closes #4838 from srowen/SPARK-3357 and squashes the following commits: dce75c1 [Sean Owen] Back out some debug level changes d9b784d [Sean Owen] Demote some 'noisy' log messages to debug level Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/948c2390 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/948c2390 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/948c2390 Branch: refs/heads/master Commit: 948c2390ab004ad5cf3876d87c05d3e43a9aa9e0 Parents: d8fb40e Author: Sean Owen so...@cloudera.com Authored: Mon Mar 2 08:51:03 2015 + Committer: Sean Owen so...@cloudera.com Committed: Mon Mar 2 08:51:03 2015 + -- core/src/main/scala/org/apache/spark/ContextCleaner.scala| 4 ++-- core/src/main/scala/org/apache/spark/storage/BlockManager.scala | 4 ++-- .../main/scala/org/apache/spark/storage/BlockManagerMaster.scala | 2 +- core/src/main/scala/org/apache/spark/storage/MemoryStore.scala | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/948c2390/core/src/main/scala/org/apache/spark/ContextCleaner.scala -- diff --git a/core/src/main/scala/org/apache/spark/ContextCleaner.scala b/core/src/main/scala/org/apache/spark/ContextCleaner.scala index 434f1e4..4a9d007 100644 --- a/core/src/main/scala/org/apache/spark/ContextCleaner.scala +++ b/core/src/main/scala/org/apache/spark/ContextCleaner.scala @@ -188,10 +188,10 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { /** Perform broadcast cleanup. 
 */
  def doCleanupBroadcast(broadcastId: Long, blocking: Boolean) {
    try {
-      logDebug("Cleaning broadcast " + broadcastId)
+      logDebug(s"Cleaning broadcast $broadcastId")
      broadcastManager.unbroadcast(broadcastId, true, blocking)
      listeners.foreach(_.broadcastCleaned(broadcastId))
-      logInfo("Cleaned broadcast " + broadcastId)
+      logDebug(s"Cleaned broadcast $broadcastId")
    } catch {
      case e: Exception => logError("Error cleaning broadcast " + broadcastId, e)
    }

http://git-wip-us.apache.org/repos/asf/spark/blob/948c2390/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
--
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
index 86dbd89..c8b7763 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
@@ -1074,7 +1074,7 @@ private[spark] class BlockManager(
   * Remove all blocks belonging to the given broadcast.
   */
  def removeBroadcast(broadcastId: Long, tellMaster: Boolean): Int = {
-    logInfo(s"Removing broadcast $broadcastId")
+    logDebug(s"Removing broadcast $broadcastId")
    val blocksToRemove = blockInfo.keys.collect {
      case bid @ BroadcastBlockId(`broadcastId`, _) => bid
    }
@@ -1086,7 +1086,7 @@
   * Remove a block from both memory and disk.
   */
  def removeBlock(blockId: BlockId, tellMaster: Boolean = true): Unit = {
-    logInfo(s"Removing block $blockId")
+    logDebug(s"Removing block $blockId")
    val info = blockInfo.get(blockId).orNull
    if (info != null) {
      info.synchronized {

http://git-wip-us.apache.org/repos/asf/spark/blob/948c2390/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala
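A small illustrative sketch of the convention this change enforces — routine per-block bookkeeping at logDebug, failures at logError. The class below is invented; only the Logging calls mirror the diff. Users who still want the chatty messages can re-enable them per class, e.g. log4j.logger.org.apache.spark.storage.BlockManager=DEBUG in log4j.properties.

```
// Illustrative only: noisy internal bookkeeping goes to logDebug, not logInfo.
import org.apache.spark.Logging

class BroadcastCleanupSketch extends Logging {
  def cleanupBroadcast(broadcastId: Long): Unit = {
    try {
      logDebug(s"Cleaning broadcast $broadcastId")
      // ... drop the broadcast's blocks from memory and disk ...
      logDebug(s"Cleaned broadcast $broadcastId")   // demoted from logInfo by this commit
    } catch {
      case e: Exception => logError(s"Error cleaning broadcast $broadcastId", e)
    }
  }
}
```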
spark git commit: [SPARK-6103][Graphx]remove unused class to import in EdgeRDDImpl
Repository: spark Updated Branches: refs/heads/master 948c2390a - 49c7a8f6f [SPARK-6103][Graphx]remove unused class to import in EdgeRDDImpl Class TaskContext is unused in EdgeRDDImpl, so we need to remove it from import list. Author: Lianhui Wang lianhuiwan...@gmail.com Closes #4846 from lianhuiwang/SPARK-6103 and squashes the following commits: 31aed64 [Lianhui Wang] remove unused class to import in EdgeRDDImpl Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/49c7a8f6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/49c7a8f6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/49c7a8f6 Branch: refs/heads/master Commit: 49c7a8f6f33d64d7e6c35391f83121440844a41d Parents: 948c239 Author: Lianhui Wang lianhuiwan...@gmail.com Authored: Mon Mar 2 09:06:56 2015 + Committer: Sean Owen so...@cloudera.com Committed: Mon Mar 2 09:06:56 2015 + -- .../src/main/scala/org/apache/spark/graphx/impl/EdgeRDDImpl.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/49c7a8f6/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeRDDImpl.scala -- diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeRDDImpl.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeRDDImpl.scala index 56cb416..43a3aea 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeRDDImpl.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeRDDImpl.scala @@ -19,7 +19,7 @@ package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} -import org.apache.spark.{OneToOneDependency, HashPartitioner, TaskContext} +import org.apache.spark.{OneToOneDependency, HashPartitioner} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-6052][SQL]In JSON schema inference, we should always set containsNull of an ArrayType to true
Repository: spark Updated Branches: refs/heads/master 39a54b40a - 3efd8bb6c [SPARK-6052][SQL]In JSON schema inference, we should always set containsNull of an ArrayType to true Always set `containsNull = true` when infer the schema of JSON datasets. If we set `containsNull` based on records we scanned, we may miss arrays with null values when we do sampling. Also, because future data can have arrays with null values, if we convert JSON data to parquet, always setting `containsNull = true` is a more robust way to go. JIRA: https://issues.apache.org/jira/browse/SPARK-6052 Author: Yin Huai yh...@databricks.com Closes #4806 from yhuai/jsonArrayContainsNull and squashes the following commits: 05eab9d [Yin Huai] Change containsNull to true. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3efd8bb6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3efd8bb6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3efd8bb6 Branch: refs/heads/master Commit: 3efd8bb6cf139ce094ff631c7a9c1eb93fdcd566 Parents: 39a54b4 Author: Yin Huai yh...@databricks.com Authored: Mon Mar 2 23:18:07 2015 +0800 Committer: Cheng Lian l...@databricks.com Committed: Mon Mar 2 23:18:07 2015 +0800 -- .../org/apache/spark/sql/json/JsonRDD.scala | 9 +++-- .../org/apache/spark/sql/json/JsonSuite.scala | 38 ++-- 2 files changed, 23 insertions(+), 24 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3efd8bb6/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala index d83bdc2..e54a2a3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala @@ -199,13 +199,12 @@ private[sql] object JsonRDD extends Logging { * type conflicts. */ private def typeOfArray(l: Seq[Any]): ArrayType = { -val containsNull = l.exists(v = v == null) val elements = l.flatMap(v = Option(v)) if (elements.isEmpty) { // If this JSON array is empty, we use NullType as a placeholder. // If this array is not empty in other JSON objects, we can resolve // the type after we have passed through all JSON objects. - ArrayType(NullType, containsNull) + ArrayType(NullType, containsNull = true) } else { val elementType = elements.map { e = e match { @@ -217,7 +216,7 @@ private[sql] object JsonRDD extends Logging { } }.reduce((type1: DataType, type2: DataType) = compatibleType(type1, type2)) - ArrayType(elementType, containsNull) + ArrayType(elementType, containsNull = true) } } @@ -245,7 +244,7 @@ private[sql] object JsonRDD extends Logging { // The value associated with the key is an array. // Handle inner structs of an array. def buildKeyPathForInnerStructs(v: Any, t: DataType): Seq[(String, DataType)] = t match { - case ArrayType(e: StructType, containsNull) = { + case ArrayType(e: StructType, _) = { // The elements of this arrays are structs. 
            v.asInstanceOf[Seq[Map[String, Any]]].flatMap(Option(_)).flatMap {
              element => allKeysWithValueTypes(element)
@@ -253,7 +252,7 @@ private[sql] object JsonRDD extends Logging {
              case (k, t) => (s"$key.$k", t)
            }
          }
-      case ArrayType(t1, containsNull) =>
+      case ArrayType(t1, _) =>
        v.asInstanceOf[Seq[Any]].flatMap(Option(_)).flatMap { element =>
          buildKeyPathForInnerStructs(element, t1)
        }

http://git-wip-us.apache.org/repos/asf/spark/blob/3efd8bb6/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
index 005f20b..9d94d34 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
@@ -248,26 +248,26 @@ class JsonSuite extends QueryTest {
    val jsonDF = jsonRDD(complexFieldAndType1)

    val expectedSchema = StructType(
-      StructField("arrayOfArray1", ArrayType(ArrayType(StringType, false), false), true) ::
-      StructField("arrayOfArray2", ArrayType(ArrayType(DoubleType, false), false), true) ::
-      StructField("arrayOfBigInteger", ArrayType(DecimalType.Unlimited, false), true) ::
-      StructField("arrayOfBoolean", ArrayType(BooleanType, false), true) ::
-
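To make the sampling concern concrete, a hedged sketch (Spark 1.3-era jsonRDD API, an assumed existing SparkContext `sc`, invented records): if inference only scanned the first record, a containsNull derived from observed data would be false, and later null elements — or a JSON-to-Parquet conversion — could break.

```
// Hedged illustration of why containsNull is now always true for inferred arrays.
import org.apache.spark.sql.SQLContext

val sqlContext = new SQLContext(sc)
val records = sc.parallelize(Seq(
  """{"tags": ["a", "b"]}""",   // a sample made up only of rows like this one...
  """{"tags": ["c", null]}"""   // ...would never see this null element
))

val df = sqlContext.jsonRDD(records)
df.printSchema()
// tags is inferred as ArrayType(StringType, containsNull = true) after this change,
// regardless of which records the inference pass happened to scan.
```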
spark git commit: [SPARK-6111] Fixed usage string in documentation.
Repository: spark Updated Branches: refs/heads/branch-1.3 a3fef2c02 - b2b7f011e [SPARK-6111] Fixed usage string in documentation. Usage info in documentation does not match actual usage info. Doc string usage says ```Usage: network_wordcount.py zk topic``` whereas the actual usage is ```Usage: kafka_wordcount.py zk topic``` Author: Kenneth Myers myer...@us.ibm.com Closes #4852 from kennethmyers/kafka_wordcount_documentation_fix and squashes the following commits: 3855325 [Kenneth Myers] Fixed usage string in documentation. (cherry picked from commit 95ac68bf127b5370c13d6bc15adbda78228829cc) Signed-off-by: Sean Owen so...@cloudera.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b2b7f011 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b2b7f011 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b2b7f011 Branch: refs/heads/branch-1.3 Commit: b2b7f011e3e17ac6263b5cac926a48fe89d4638a Parents: a3fef2c Author: Kenneth Myers myer...@us.ibm.com Authored: Mon Mar 2 17:25:24 2015 + Committer: Sean Owen so...@cloudera.com Committed: Mon Mar 2 17:25:37 2015 + -- examples/src/main/python/streaming/kafka_wordcount.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b2b7f011/examples/src/main/python/streaming/kafka_wordcount.py -- diff --git a/examples/src/main/python/streaming/kafka_wordcount.py b/examples/src/main/python/streaming/kafka_wordcount.py index f82f161..51e1ff8 100644 --- a/examples/src/main/python/streaming/kafka_wordcount.py +++ b/examples/src/main/python/streaming/kafka_wordcount.py @@ -17,7 +17,7 @@ Counts words in UTF8 encoded, '\n' delimited text received from the network every second. - Usage: network_wordcount.py zk topic + Usage: kafka_wordcount.py zk topic To run this on your local machine, you need to setup Kafka and create a producer first, see http://kafka.apache.org/documentation.html#quickstart - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-6080] [PySpark] correct LogisticRegressionWithLBFGS regType parameter for pyspark
Repository: spark Updated Branches: refs/heads/branch-1.3 f47610890 - 4ffaf8568 [SPARK-6080] [PySpark] correct LogisticRegressionWithLBFGS regType parameter for pyspark Currently LogisticRegressionWithLBFGS in python/pyspark/mllib/classification.py will invoke callMLlibFunc with a wrong regType parameter. It was assigned to str(regType) which translate None(Python) to None(Java/Scala). The right way should be translate None(Python) to null(Java/Scala) just as what we did at LogisticRegressionWithSGD. Author: Yanbo Liang yblia...@gmail.com Closes #4831 from yanboliang/pyspark_classification and squashes the following commits: 12db65a [Yanbo Liang] correct LogisticRegressionWithLBFGS regType parameter for pyspark (cherry picked from commit af2effdd7b54316af0c02e781911acfb148b962b) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4ffaf856 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4ffaf856 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4ffaf856 Branch: refs/heads/branch-1.3 Commit: 4ffaf856882fb1f4a5bfc24e5a05c74ba950e282 Parents: f476108 Author: Yanbo Liang yblia...@gmail.com Authored: Mon Mar 2 10:17:24 2015 -0800 Committer: Xiangrui Meng m...@databricks.com Committed: Mon Mar 2 10:17:32 2015 -0800 -- python/pyspark/mllib/classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4ffaf856/python/pyspark/mllib/classification.py -- diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index 00e2e76..e476517 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -207,7 +207,7 @@ class LogisticRegressionWithLBFGS(object): def train(rdd, i): return callMLlibFunc(trainLogisticRegressionModelWithLBFGS, rdd, int(iterations), i, - float(regParam), str(regType), bool(intercept), int(corrections), + float(regParam), regType, bool(intercept), int(corrections), float(tolerance)) return _regression_train_wrapper(train, LogisticRegressionModel, data, initialWeights) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: aggregateMessages example in graphX doc
Repository: spark Updated Branches: refs/heads/master 9ce12aaf2 - e7d8ae444 aggregateMessages example in graphX doc The examples illustrating the difference between the legacy mapReduceTriplets usage and the aggregateMessages usage had type issues in the reduce function for both operators. Being just an example, it now reduces the message String by concatenation, although that is non-optimal for performance. Author: DEBORAH SIEGEL deborahsiegel@DEBORAHs-MacBook-Pro.local Closes #4853 from d3borah/master and squashes the following commits: db54173 [DEBORAH SIEGEL] fixed aggregateMessages example in graphX doc Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e7d8ae44 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e7d8ae44 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e7d8ae44 Branch: refs/heads/master Commit: e7d8ae444fead27fe85a879f2f7a4cfdd8c47b16 Parents: 9ce12aa Author: DEBORAH SIEGEL deborahsiegel@DEBORAHs-MacBook-Pro.local Authored: Mon Mar 2 10:15:32 2015 -0800 Committer: Reynold Xin r...@databricks.com Committed: Mon Mar 2 10:15:32 2015 -0800 -- docs/graphx-programming-guide.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e7d8ae44/docs/graphx-programming-guide.md -- diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index 28bdf81..c601d79 100644
--- a/docs/graphx-programming-guide.md
+++ b/docs/graphx-programming-guide.md
@@ -663,7 +663,7 @@ val graph: Graph[Int, Float] = ...
 def msgFun(triplet: Triplet[Int, Float]): Iterator[(Int, String)] = {
   Iterator((triplet.dstId, "Hi"))
 }
-def reduceFun(a: Int, b: Int): Int = a + b
+def reduceFun(a: String, b: String): String = a + " " + b
 val result = graph.mapReduceTriplets[String](msgFun, reduceFun)
 {% endhighlight %}
@@ -674,7 +674,7 @@ val graph: Graph[Int, Float] = ...
 def msgFun(triplet: EdgeContext[Int, Float, String]) {
   triplet.sendToDst("Hi")
 }
-def reduceFun(a: Int, b: Int): Int = a + b
+def reduceFun(a: String, b: String): String = a + " " + b
 val result = graph.aggregateMessages[String](msgFun, reduceFun)
 {% endhighlight %}

- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
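A small end-to-end sketch of the corrected aggregateMessages snippet, with a tiny invented graph so the String message types line up (assumes a GraphX-enabled SparkContext `sc`; the doc itself starts from an existing Graph[Int, Float]).

```
// Sketch only: build a toy graph and aggregate greeting strings at the destinations.
import org.apache.spark.graphx._

val vertices = sc.parallelize(Seq((1L, 10), (2L, 20), (3L, 30)))
val edges = sc.parallelize(Seq(Edge(1L, 2L, 1.0f), Edge(3L, 2L, 2.0f)))
val graph: Graph[Int, Float] = Graph(vertices, edges)

def msgFun(triplet: EdgeContext[Int, Float, String]): Unit = {
  triplet.sendToDst("Hi")
}
def reduceFun(a: String, b: String): String = a + " " + b

val result = graph.aggregateMessages[String](msgFun, reduceFun)
result.collect().foreach(println)   // vertex 2 receives the merged message "Hi Hi"
```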
spark git commit: [SPARK-5741][SQL] Support the path contains comma in HiveContext
Repository: spark Updated Branches: refs/heads/branch-1.3 b2b7f011e - f47610890 [SPARK-5741][SQL] Support the path contains comma in HiveContext When run ```select * from nzhang_part where hr = 'file,';```, it throws exception ```java.lang.IllegalArgumentException: Can not create a Path from an empty string``` . Because the path of hdfs contains comma, and FileInputFormat.setInputPaths will split path by comma. ### SQL ``` set hive.merge.mapfiles=true; set hive.merge.mapredfiles=true; set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat; set hive.exec.dynamic.partition=true; set hive.exec.dynamic.partition.mode=nonstrict; create table nzhang_part like srcpart; insert overwrite table nzhang_part partition (ds='2010-08-15', hr) select key, value, hr from srcpart where ds='2008-04-08'; insert overwrite table nzhang_part partition (ds='2010-08-15', hr=11) select key, value from srcpart where ds='2008-04-08'; insert overwrite table nzhang_part partition (ds='2010-08-15', hr) select * from ( select key, value, hr from srcpart where ds='2008-04-08' union all select '1' as key, '1' as value, 'file,' as hr from src limit 1) s; select * from nzhang_part where hr = 'file,'; ``` ### Error Log ``` 15/02/10 14:33:16 ERROR SparkSQLDriver: Failed in [select * from nzhang_part where hr = 'file,'] java.lang.IllegalArgumentException: Can not create a Path from an empty string at org.apache.hadoop.fs.Path.checkPathArg(Path.java:127) at org.apache.hadoop.fs.Path.init(Path.java:135) at org.apache.hadoop.util.StringUtils.stringToPath(StringUtils.java:241) at org.apache.hadoop.mapred.FileInputFormat.setInputPaths(FileInputFormat.java:400) at org.apache.spark.sql.hive.HadoopTableReader$.initializeLocalJobConfFunc(TableReader.scala:251) at org.apache.spark.sql.hive.HadoopTableReader$$anonfun$11.apply(TableReader.scala:229) at org.apache.spark.sql.hive.HadoopTableReader$$anonfun$11.apply(TableReader.scala:229) at org.apache.spark.rdd.HadoopRDD$$anonfun$getJobConf$6.apply(HadoopRDD.scala:172) at org.apache.spark.rdd.HadoopRDD$$anonfun$getJobConf$6.apply(HadoopRDD.scala:172) at scala.Option.map(Option.scala:145) at org.apache.spark.rdd.HadoopRDD.getJobConf(HadoopRDD.scala:172) at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:196) Author: q00251598 qiyad...@huawei.com Closes #4532 from watermen/SPARK-5741 and squashes the following commits: 9758ab1 [q00251598] fix bug 1db1a1c [q00251598] use setInputPaths(Job job, Path... 
inputPaths) b788a72 [q00251598] change FileInputFormat.setInputPaths to jobConf.set and add test suite (cherry picked from commit 9ce12aaf283a2793e719bdc956dd858922636e8d) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f4761089 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f4761089 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f4761089 Branch: refs/heads/branch-1.3 Commit: f476108901c42ea61873f02dc2fee15896550d30 Parents: b2b7f01 Author: q00251598 qiyad...@huawei.com Authored: Mon Mar 2 10:13:11 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 10:13:30 2015 -0800 -- .../hive/execution/HiveCompatibilitySuite.scala |1 + .../org/apache/spark/sql/hive/TableReader.scala |2 +- .../merge4-0-b12e5c70d6d29757471b900b6160fa8a |1 + .../merge4-1-593999fae618b6b38322bc9ae4e0c027 |1 + .../merge4-10-692a197bd688b48f762e72978f54aa32 |0 .../merge4-11-f407e661307b23a5d52a08a3e7af19b | 1500 ++ .../merge4-12-62541540a18d68a3cb8497a741061d11 |0 .../merge4-13-ed1103f06609365b40e78d13c654cc71 |0 .../merge4-14-ba5dbcd0527b8ddab284bc322255bfc7 |3 + .../merge4-15-68f50dc2ad6ff803a372bdd88dd8e19a |1 + .../merge4-2-43d53504df013e6b35f81811138a167a |1 + .../merge4-3-a4fb8359a2179ec70777aad6366071b7 |1 + .../merge4-4-16367c381d4b189b3640c92511244bfe |1 + .../merge4-5-3d24d877366c42030f6d9a596665720d |0 .../merge4-6-b3a76420183795720ab3a384046e5af|0 .../merge4-7-631a45828eae3f5f562d992efe4cd56d |0 .../merge4-8-f407e661307b23a5d52a08a3e7af19b| 1000 .../merge4-9-ad3dc168c8b6f048717e39ab16b0a319 |0 18 files changed, 2511 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f4761089/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala -- diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index c6ead45..6126ce7 100644 ---
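The message is truncated before the patch body, so as a hedged aside on the underlying Hadoop behaviour the patch works around (the path below is invented, and this is not the fix in TableReader.scala): the String overload of FileInputFormat.setInputPaths treats its argument as a comma-separated list, which is exactly what the stack trace above shows.

```
// Hedged illustration of the root cause, not the patch itself.
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred.{FileInputFormat, JobConf}

val jobConf = new JobConf()
val partitionPath = "/warehouse/nzhang_part/ds=2010-08-15/hr=file,"

// The String overload splits on commas, so the trailing "," leaves an empty component
// and Path's constructor fails with "Can not create a Path from an empty string":
// FileInputFormat.setInputPaths(jobConf, partitionPath)

// Passing Path objects keeps the partition location intact (commas are escaped when
// the property is written), which is the approach the commit message points at:
FileInputFormat.setInputPaths(jobConf, new Path(partitionPath))
```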