spark git commit: [Minor] Fix doc typo for describing primitiveTerm effectiveness condition
Repository: spark Updated Branches: refs/heads/branch-1.3 58e7198f0 - f92876a54 [Minor] Fix doc typo for describing primitiveTerm effectiveness condition It should be `true` instead of `false`? Author: Liang-Chi Hsieh vii...@gmail.com Closes #4762 from viirya/doc_fix and squashes the following commits: 2e37482 [Liang-Chi Hsieh] Fix doc. (cherry picked from commit 3f9def81170c24f24f4a6b7ca7905de4f75e11e0) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f92876a5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f92876a5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f92876a5 Branch: refs/heads/branch-1.3 Commit: f92876a5430d2b87e2df66a4eddc833d568e3e1a Parents: 58e7198 Author: Liang-Chi Hsieh vii...@gmail.com Authored: Mon Mar 2 13:11:17 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 13:11:38 2015 -0800 -- .../spark/sql/catalyst/expressions/codegen/CodeGenerator.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f92876a5/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 1f80d84..c347780 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -121,7 +121,7 @@ abstract class CodeGenerator[InType : AnyRef, OutType : AnyRef] extends Loggin * @param nullTerm A term that holds a boolean value representing whether the expression evaluated * to null. * @param primitiveTerm A term for a possible primitive value of the result of the evaluation. Not - * valid if `nullTerm` is set to `false`. + * valid if `nullTerm` is set to `true`. * @param objectTerm A possibly boxed version of the result of evaluating this expression. */ protected case class EvaluatedExpression( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r1663444 - in /spark/site: examples.html index.html
Author: srowen Date: Mon Mar 2 22:06:52 2015 New Revision: 1663444 URL: http://svn.apache.org/r1663444 Log: SPARK-4992 Improve var naming in Python example; change other var names to match Modified: spark/site/examples.html spark/site/index.html Modified: spark/site/examples.html URL: http://svn.apache.org/viewvc/spark/site/examples.html?rev=1663444r1=1663443r2=1663444view=diff == --- spark/site/examples.html (original) +++ spark/site/examples.html Mon Mar 2 22:06:52 2015 @@ -187,8 +187,8 @@ previous ones, and emactions/em, whi div class=tab-content div class=tab-pane tab-pane-python active div class=code code-tab -file = spark.textFile(span class=stringhdfs://.../span)br / -errors = file.span class=sparkopfilter/span(span class=closurelambda line: ERROR in line/span)br / +text_file = spark.textFile(span class=stringhdfs://.../span)br / +errors = text_file.span class=sparkopfilter/span(span class=closurelambda line: ERROR in line/span)br / span class=comment# Count all the errors/spanbr / errors.span class=sparkopcount/span()br / span class=comment# Count errors mentioning MySQL/spanbr / @@ -199,8 +199,8 @@ previous ones, and emactions/em, whi /div div class=tab-pane tab-pane-scala div class=code code-tab -span class=keywordval/span file = spark.textFile(span class=stringhdfs://.../span)br / -span class=keywordval/span errors = file.span class=sparkopfilter/span(span class=closureline =gt; line.contains(ERROR)/span)br / +span class=keywordval/span textFile = spark.textFile(span class=stringhdfs://.../span)br / +span class=keywordval/span errors = textFile.span class=sparkopfilter/span(span class=closureline =gt; line.contains(ERROR)/span)br / span class=comment// Count all the errors/spanbr / errors.span class=sparkopcount/span()br / span class=comment// Count errors mentioning MySQL/spanbr / @@ -211,8 +211,8 @@ previous ones, and emactions/em, whi /div div class=tab-pane tab-pane-java div class=code code-tab -JavaRDDlt;Stringgt; file = spark.textFile(span class=stringhdfs://.../span);br / -JavaRDDlt;Stringgt; errors = file.span class=sparkopfilter/span(span class=closurenew Functionlt;String, Booleangt;() {br / +JavaRDDlt;Stringgt; textFile = spark.textFile(span class=stringhdfs://.../span);br / +JavaRDDlt;Stringgt; errors = textFile.span class=sparkopfilter/span(span class=closurenew Functionlt;String, Booleangt;() {br / nbsp;nbsp;public Boolean call(String s) { return s.contains(ERROR); }br / }/span);br / span class=comment// Count all the errors/spanbr / @@ -272,8 +272,8 @@ previous ones, and emactions/em, whi div class=tab-content div class=tab-pane tab-pane-python active div class=code code-tab -file = spark.textFile(span class=stringhdfs://.../span)br / -counts = file.span class=sparkopflatMap/span(span class=closurelambda line: line.split( )/span) \br / +text_file = spark.textFile(span class=stringhdfs://.../span)br / +counts = text_file.span class=sparkopflatMap/span(span class=closurelambda line: line.split( )/span) \br / nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;.span class=sparkopmap/span(span class=closurelambda word: (word, 1)/span) \br / nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;.span class=sparkopreduceByKey/span(span class=closurelambda a, b: a + b/span)br / counts.span class=sparkopsaveAsTextFile/span(span class=stringhdfs://.../span) @@ -281,8 +281,8 @@ previous ones, and emactions/em, whi /div div class=tab-pane tab-pane-scala div class=code code-tab -span class=keywordval/span file = spark.textFile(span 
class=stringhdfs://.../span)br / -span class=keywordval/span counts = file.span class=sparkopflatMap/span(span class=closureline =gt; line.split( )/span)br / +span class=keywordval/span textFile = spark.textFile(span class=stringhdfs://.../span)br / +span class=keywordval/span counts = textFile.span class=sparkopflatMap/span(span class=closureline =gt; line.split( )/span)br / nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;.span class=sparkopmap/span(span class=closureword =gt; (word, 1)/span)br / nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;.span class=sparkopreduceByKey/span(span class=closure_ + _/span)br / counts.span class=sparkopsaveAsTextFile/span(span class=stringhdfs://.../span) @@ -290,8 +290,8 @@ previous ones, and emactions/em, whi /div div class=tab-pane tab-pane-java div class=code code-tab -JavaRDDlt;Stringgt; file = spark.textFile(span class=stringhdfs://.../span);br / -JavaRDDlt;Stringgt; words = file.span class=sparkopflatMap/span(span class=closurenew FlatMapFunctionlt;String, Stringgt;() {br / +
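For readability, here is the renamed Scala word-count example that the HTML diff above encodes, written out as plain code. The SparkContext construction and the local master setting are added only to make the snippet self-contained; the elided "hdfs://..." paths are kept exactly as in the site example.

```scala
import org.apache.spark.{SparkConf, SparkContext}

// Setup added for self-containment; the site snippet assumes an existing context.
val sc = new SparkContext(new SparkConf().setAppName("WordCount").setMaster("local[*]"))

// The renamed example from the diff: the variable `file` is now `textFile`.
val textFile = sc.textFile("hdfs://...")
val counts = textFile.flatMap(line => line.split(" "))
                     .map(word => (word, 1))
                     .reduceByKey(_ + _)
counts.saveAsTextFile("hdfs://...")
```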
spark git commit: [Minor] Fix doc typo for describing primitiveTerm effectiveness condition
Repository: spark Updated Branches: refs/heads/master 0b472f60c - 3f9def811 [Minor] Fix doc typo for describing primitiveTerm effectiveness condition It should be `true` instead of `false`? Author: Liang-Chi Hsieh vii...@gmail.com Closes #4762 from viirya/doc_fix and squashes the following commits: 2e37482 [Liang-Chi Hsieh] Fix doc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3f9def81 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3f9def81 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3f9def81 Branch: refs/heads/master Commit: 3f9def81170c24f24f4a6b7ca7905de4f75e11e0 Parents: 0b472f6 Author: Liang-Chi Hsieh vii...@gmail.com Authored: Mon Mar 2 13:11:17 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 13:11:17 2015 -0800 -- .../spark/sql/catalyst/expressions/codegen/CodeGenerator.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3f9def81/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 1f80d84..c347780 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -121,7 +121,7 @@ abstract class CodeGenerator[InType : AnyRef, OutType : AnyRef] extends Loggin * @param nullTerm A term that holds a boolean value representing whether the expression evaluated * to null. * @param primitiveTerm A term for a possible primitive value of the result of the evaluation. Not - * valid if `nullTerm` is set to `false`. + * valid if `nullTerm` is set to `true`. * @param objectTerm A possibly boxed version of the result of evaluating this expression. */ protected case class EvaluatedExpression( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
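The corrected comment is easier to see with a tiny, self-contained illustration of the convention it describes: the primitive result slot is only meaningful when the null flag is not set. The names below mirror the scaladoc (nullTerm, primitiveTerm); this is an illustrative sketch, not the generated code itself.

```scala
// Illustrative only: mirrors the contract stated in the fixed scaladoc.
def readResult(nullTerm: Boolean, primitiveTerm: Int): Option[Int] =
  if (nullTerm) None            // primitiveTerm must not be read when nullTerm is true
  else Some(primitiveTerm)      // valid precisely because nullTerm is false
```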
spark git commit: [SPARK-6050] [yarn] Relax matching of vcore count in received containers.
Repository: spark Updated Branches: refs/heads/branch-1.3 a83b9bbb2 - 650d1e7fb [SPARK-6050] [yarn] Relax matching of vcore count in received containers. Some YARN configurations return a vcore count for allocated containers that does not match the requested resource. That means Spark would always ignore those containers. So relax the the matching of the vcore count to allow the Spark jobs to run. Author: Marcelo Vanzin van...@cloudera.com Closes #4818 from vanzin/SPARK-6050 and squashes the following commits: 991c803 [Marcelo Vanzin] Remove config option, standardize on legacy behavior (no vcore matching). 8c9c346 [Marcelo Vanzin] Restrict lax matching to vcores only. 3359692 [Marcelo Vanzin] [SPARK-6050] [yarn] Add config option to do lax resource matching. (cherry picked from commit 6b348d90f475440c285a4b636134ffa9351580b9) Signed-off-by: Thomas Graves tgra...@apache.org Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/650d1e7f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/650d1e7f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/650d1e7f Branch: refs/heads/branch-1.3 Commit: 650d1e7fb13545d0d102de9bb6e11ab4f9ef6359 Parents: a83b9bb Author: Marcelo Vanzin van...@cloudera.com Authored: Mon Mar 2 16:41:43 2015 -0600 Committer: Thomas Graves tgra...@apache.org Committed: Mon Mar 2 16:42:02 2015 -0600 -- .../org/apache/spark/deploy/yarn/YarnAllocator.scala | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/650d1e7f/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 12c62a6..55bfbcd 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -290,8 +290,14 @@ private[yarn] class YarnAllocator( location: String, containersToUse: ArrayBuffer[Container], remaining: ArrayBuffer[Container]): Unit = { +// SPARK-6050: certain Yarn configurations return a virtual core count that doesn't match the +// request; for example, capacity scheduler + DefaultResourceCalculator. So match on requested +// memory, but use the asked vcore count for matching, effectively disabling matching on vcore +// count. +val matchingResource = Resource.newInstance(allocatedContainer.getResource.getMemory, + resource.getVirtualCores) val matchingRequests = amClient.getMatchingRequests(allocatedContainer.getPriority, location, - allocatedContainer.getResource) + matchingResource) // Match the allocation to a request if (!matchingRequests.isEmpty) { @@ -318,7 +324,7 @@ private[yarn] class YarnAllocator( assert(container.getResource.getMemory = resource.getMemory) logInfo(Launching container %s for on host %s.format(containerId, executorHostname)) - executorIdToContainer(executorId) = container + executorIdToContainer(executorId) = container val containerSet = allocatedHostToContainersMap.getOrElseUpdate(executorHostname, new HashSet[ContainerId]) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-5522] Accelerate the History Server start
Repository: spark Updated Branches: refs/heads/master 6b348d90f - 26c1c56de [SPARK-5522] Accelerate the Histroty Server start When starting the history server, all the log files will be fetched and parsed in order to get the applications' meta data e.g. App Name, Start Time, Duration, etc. In our production cluster, there exist 2600 log files (160G) in HDFS and it costs 3 hours to restart the history server, which is a little bit too long for us. It would be better, if the history server can show logs with missing information during start-up and fill the missing information after fetching and parsing a log file. Author: guliangliang guliangli...@qiyi.com Closes #4525 from marsishandsome/Spark5522 and squashes the following commits: a865c11 [guliangliang] fix bug2 4340c2b [guliangliang] fix bug af92a5a [guliangliang] [SPARK-5522] Accelerate the Histroty Server start Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/26c1c56d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/26c1c56d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/26c1c56d Branch: refs/heads/master Commit: 26c1c56dea5d4160913bb65bb743aeb63fee3240 Parents: 6b348d9 Author: guliangliang guliangli...@qiyi.com Authored: Mon Mar 2 15:33:23 2015 -0800 Committer: Andrew Or and...@databricks.com Committed: Mon Mar 2 15:33:23 2015 -0800 -- .../deploy/history/FsHistoryProvider.scala | 115 --- 1 file changed, 74 insertions(+), 41 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/26c1c56d/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 3e3d6ff..c5fab1d 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -18,22 +18,23 @@ package org.apache.spark.deploy.history import java.io.{IOException, BufferedInputStream, FileNotFoundException, InputStream} -import java.util.concurrent.{Executors, TimeUnit} +import java.util.concurrent.{ExecutorService, Executors, TimeUnit} import scala.collection.mutable import scala.concurrent.duration.Duration import com.google.common.util.concurrent.ThreadFactoryBuilder -import org.apache.hadoop.fs.{FileStatus, Path} +import com.google.common.util.concurrent.MoreExecutors import org.apache.hadoop.fs.permission.AccessControlException - -import org.apache.spark.{Logging, SecurityManager, SparkConf} +import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.io.CompressionCodec import org.apache.spark.scheduler._ import org.apache.spark.ui.SparkUI import org.apache.spark.util.Utils +import org.apache.spark.{Logging, SecurityManager, SparkConf} + /** * A class that provides application history from event logs stored in the file system. @@ -98,6 +99,17 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis } } + /** + * An Executor to fetch and parse log files. 
+ */ + private val replayExecutor: ExecutorService = { +if (!conf.contains(spark.testing)) { + Executors.newSingleThreadExecutor(Utils.namedThreadFactory(log-replay-executor)) +} else { + MoreExecutors.sameThreadExecutor() +} + } + initialize() private def initialize(): Unit = { @@ -171,10 +183,10 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis */ private[history] def checkForLogs(): Unit = { try { - var newLastModifiedTime = lastModifiedTime val statusList = Option(fs.listStatus(new Path(logDir))).map(_.toSeq) .getOrElse(Seq[FileStatus]()) - val logInfos = statusList + var newLastModifiedTime = lastModifiedTime + val logInfos: Seq[FileStatus] = statusList .filter { entry = try { getModificationTime(entry).map { time = @@ -189,48 +201,69 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis false } } -.flatMap { entry = - try { -Some(replay(entry, new ReplayListenerBus())) - } catch { -case e: Exception = - logError(sFailed to load application log data from $entry., e) - None - } -} -.sortWith(compareAppInfo) +.flatMap { entry = Some(entry) } +.sortWith { case (entry1, entry2) = + val mod1 = getModificationTime(entry1).getOrElse(-1L) + val
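The core of the change is visible in the diff: log replay moves onto a dedicated single-threaded executor so the application listing can be served before every log is fully parsed. Below is a minimal sketch of that pattern using only JDK executors; the names scheduleReplay and logPath are hypothetical, and this is not the FsHistoryProvider code itself.

```scala
import java.util.concurrent.{ExecutorService, Executors}

// One background thread parses event logs; callers return immediately.
val replayExecutor: ExecutorService = Executors.newSingleThreadExecutor()

// Hypothetical helper, for illustration only.
def scheduleReplay(logPath: String): Unit = {
  replayExecutor.submit(new Runnable {
    override def run(): Unit = {
      // Placeholder body: parse the event log at logPath and update the
      // application listing once the data is available.
      println(s"replaying $logPath")
    }
  })
}
```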
svn commit: r1663449 - /spark/site/index.html
Author: srowen Date: Mon Mar 2 22:18:42 2015 New Revision: 1663449 URL: http://svn.apache.org/r1663449 Log: SPARK-4992 follow-up to inhibit undesired text wrapping in Python example Modified: spark/site/index.html Modified: spark/site/index.html URL: http://svn.apache.org/viewvc/spark/site/index.html?rev=1663449&r1=1663448&r2=1663449&view=diff ============================================================================== --- spark/site/index.html (original) +++ spark/site/index.html Mon Mar 2 22:18:42 2015 @@ -214,7 +214,7 @@ <div class="code"> text_file = spark.textFile(<span class="string">"hdfs://..."</span>)<br /> &nbsp;<br /> -text_file.<span class="sparkop">flatMap</span>(<span class="closure">lambda line: line.split()</span>)<br /> +text_file.<span class="sparkop">flatMap</span>(<span class="closure">lambda&nbsp;line:&nbsp;line.split()</span>)<br /> &nbsp;&nbsp;&nbsp;&nbsp;.<span class="sparkop">map</span>(<span class="closure">lambda word: (word, 1)</span>)<br /> &nbsp;&nbsp;&nbsp;&nbsp;.<span class="sparkop">reduceByKey</span>(<span class="closure">lambda a, b: a+b</span>) </div> - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-6050] [yarn] Relax matching of vcore count in received containers.
Repository: spark Updated Branches: refs/heads/master 582e5a24c - 6b348d90f [SPARK-6050] [yarn] Relax matching of vcore count in received containers. Some YARN configurations return a vcore count for allocated containers that does not match the requested resource. That means Spark would always ignore those containers. So relax the the matching of the vcore count to allow the Spark jobs to run. Author: Marcelo Vanzin van...@cloudera.com Closes #4818 from vanzin/SPARK-6050 and squashes the following commits: 991c803 [Marcelo Vanzin] Remove config option, standardize on legacy behavior (no vcore matching). 8c9c346 [Marcelo Vanzin] Restrict lax matching to vcores only. 3359692 [Marcelo Vanzin] [SPARK-6050] [yarn] Add config option to do lax resource matching. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6b348d90 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6b348d90 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6b348d90 Branch: refs/heads/master Commit: 6b348d90f475440c285a4b636134ffa9351580b9 Parents: 582e5a2 Author: Marcelo Vanzin van...@cloudera.com Authored: Mon Mar 2 16:41:43 2015 -0600 Committer: Thomas Graves tgra...@apache.org Committed: Mon Mar 2 16:41:43 2015 -0600 -- .../org/apache/spark/deploy/yarn/YarnAllocator.scala | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6b348d90/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 12c62a6..55bfbcd 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -290,8 +290,14 @@ private[yarn] class YarnAllocator( location: String, containersToUse: ArrayBuffer[Container], remaining: ArrayBuffer[Container]): Unit = { +// SPARK-6050: certain Yarn configurations return a virtual core count that doesn't match the +// request; for example, capacity scheduler + DefaultResourceCalculator. So match on requested +// memory, but use the asked vcore count for matching, effectively disabling matching on vcore +// count. +val matchingResource = Resource.newInstance(allocatedContainer.getResource.getMemory, + resource.getVirtualCores) val matchingRequests = amClient.getMatchingRequests(allocatedContainer.getPriority, location, - allocatedContainer.getResource) + matchingResource) // Match the allocation to a request if (!matchingRequests.isEmpty) { @@ -318,7 +324,7 @@ private[yarn] class YarnAllocator( assert(container.getResource.getMemory = resource.getMemory) logInfo(Launching container %s for on host %s.format(containerId, executorHostname)) - executorIdToContainer(executorId) = container + executorIdToContainer(executorId) = container val containerSet = allocatedHostToContainersMap.getOrElseUpdate(executorHostname, new HashSet[ContainerId]) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
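To make the fix concrete, the sketch below builds the matching resource the way the diff describes: allocated memory combined with the requested vcore count, so vcores no longer participate in request matching. The numeric values are made-up examples; Resource.newInstance is the Hadoop YARN factory used in the patch.

```scala
import org.apache.hadoop.yarn.api.records.Resource

// Assumed example values: Spark asked for 2048 MB / 4 vcores, but the
// scheduler (e.g. capacity scheduler with DefaultResourceCalculator) reports
// the allocated container with 1 vcore.
val requested = Resource.newInstance(2048, 4)
val allocated = Resource.newInstance(2048, 1)

// Resource used to look up the original request: allocated memory plus the
// requested vcores, which effectively disables vcore matching.
val matchingResource =
  Resource.newInstance(allocated.getMemory, requested.getVirtualCores)
```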
spark git commit: SPARK-5390 [DOCS] Encourage users to post on Stack Overflow in Community Docs
Repository: spark Updated Branches: refs/heads/branch-1.3 54ac24365 - 58e7198f0 SPARK-5390 [DOCS] Encourage users to post on Stack Overflow in Community Docs Point Community to main Spark Community page; mention SO tag apache-spark. Separately, the Apache site can be updated to mention, under Mailing Lists: StackOverflow also has an apache-spark tag for Spark QA. or similar. Author: Sean Owen so...@cloudera.com Closes #4843 from srowen/SPARK-5390 and squashes the following commits: 3508ac6 [Sean Owen] Point Community to main Spark Community page; mention SO tag apache-spark (cherry picked from commit 0b472f60cdf4984ab5e28e6dbf12615e8997a448) Signed-off-by: Sean Owen so...@cloudera.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/58e7198f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/58e7198f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/58e7198f Branch: refs/heads/branch-1.3 Commit: 58e7198f008753ef3e3b81f3ef30080300645d50 Parents: 54ac243 Author: Sean Owen so...@cloudera.com Authored: Mon Mar 2 21:10:08 2015 + Committer: Sean Owen so...@cloudera.com Committed: Mon Mar 2 21:10:19 2015 + -- docs/index.md | 10 ++ 1 file changed, 2 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/58e7198f/docs/index.md -- diff --git a/docs/index.md b/docs/index.md index e006be6..0986398 100644 --- a/docs/index.md +++ b/docs/index.md @@ -115,6 +115,8 @@ options for deployment: * [Spark Homepage](http://spark.apache.org) * [Spark Wiki](https://cwiki.apache.org/confluence/display/SPARK) +* [Spark Community](http://spark.apache.org/community.html) resources, including local meetups +* [StackOverflow tag `apache-spark`](http://stackoverflow.com/questions/tagged/apache-spark) * [Mailing Lists](http://spark.apache.org/mailing-lists.html): ask questions about Spark here * [AMP Camps](http://ampcamp.berkeley.edu/): a series of training camps at UC Berkeley that featured talks and exercises about Spark, Spark Streaming, Mesos, and more. [Videos](http://ampcamp.berkeley.edu/3/), @@ -123,11 +125,3 @@ options for deployment: * [Code Examples](http://spark.apache.org/examples.html): more are also available in the `examples` subfolder of Spark ([Scala]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/scala/org/apache/spark/examples), [Java]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/java/org/apache/spark/examples), [Python]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/python)) - -# Community - -To get help using Spark or keep up with Spark development, sign up for the [user mailing list](http://spark.apache.org/mailing-lists.html). - -If you're in the San Francisco Bay Area, there's a regular [Spark meetup](http://www.meetup.com/spark-users/) every few weeks. Come by to meet the developers and other users. - -Finally, if you'd like to contribute code to Spark, read [how to contribute](contributing-to-spark.html). - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: SPARK-5390 [DOCS] Encourage users to post on Stack Overflow in Community Docs
Repository: spark Updated Branches: refs/heads/master d9a8bae77 - 0b472f60c SPARK-5390 [DOCS] Encourage users to post on Stack Overflow in Community Docs Point Community to main Spark Community page; mention SO tag apache-spark. Separately, the Apache site can be updated to mention, under Mailing Lists: StackOverflow also has an apache-spark tag for Spark QA. or similar. Author: Sean Owen so...@cloudera.com Closes #4843 from srowen/SPARK-5390 and squashes the following commits: 3508ac6 [Sean Owen] Point Community to main Spark Community page; mention SO tag apache-spark Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0b472f60 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0b472f60 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0b472f60 Branch: refs/heads/master Commit: 0b472f60cdf4984ab5e28e6dbf12615e8997a448 Parents: d9a8bae Author: Sean Owen so...@cloudera.com Authored: Mon Mar 2 21:10:08 2015 + Committer: Sean Owen so...@cloudera.com Committed: Mon Mar 2 21:10:08 2015 + -- docs/index.md | 10 ++ 1 file changed, 2 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0b472f60/docs/index.md -- diff --git a/docs/index.md b/docs/index.md index e006be6..0986398 100644 --- a/docs/index.md +++ b/docs/index.md @@ -115,6 +115,8 @@ options for deployment: * [Spark Homepage](http://spark.apache.org) * [Spark Wiki](https://cwiki.apache.org/confluence/display/SPARK) +* [Spark Community](http://spark.apache.org/community.html) resources, including local meetups +* [StackOverflow tag `apache-spark`](http://stackoverflow.com/questions/tagged/apache-spark) * [Mailing Lists](http://spark.apache.org/mailing-lists.html): ask questions about Spark here * [AMP Camps](http://ampcamp.berkeley.edu/): a series of training camps at UC Berkeley that featured talks and exercises about Spark, Spark Streaming, Mesos, and more. [Videos](http://ampcamp.berkeley.edu/3/), @@ -123,11 +125,3 @@ options for deployment: * [Code Examples](http://spark.apache.org/examples.html): more are also available in the `examples` subfolder of Spark ([Scala]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/scala/org/apache/spark/examples), [Java]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/java/org/apache/spark/examples), [Python]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/python)) - -# Community - -To get help using Spark or keep up with Spark development, sign up for the [user mailing list](http://spark.apache.org/mailing-lists.html). - -If you're in the San Francisco Bay Area, there's a regular [Spark meetup](http://www.meetup.com/spark-users/) every few weeks. Come by to meet the developers and other users. - -Finally, if you'd like to contribute code to Spark, read [how to contribute](contributing-to-spark.html). - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [DOCS] Refactored Dataframe join comment to use correct parameter ordering
Repository: spark Updated Branches: refs/heads/master af2effdd7 -> d9a8bae77 [DOCS] Refactored Dataframe join comment to use correct parameter ordering The API signature for join requires the JoinType to be the third parameter. The code examples provided for join show JoinType being provided as the 2nd parameter, resulting in errors (i.e. df1.join(df2, "outer", $"df1Key" === $"df2Key") ). The correct sample code is df1.join(df2, $"df1Key" === $"df2Key", "outer") Author: Paul Power paul.po...@peerside.com Closes #4847 from peerside/master and squashes the following commits: ebc1efa [Paul Power] Merge pull request #1 from peerside/peerside-patch-1 e353340 [Paul Power] Updated comments use correct sample code for Dataframe joins Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d9a8bae7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d9a8bae7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d9a8bae7 Branch: refs/heads/master Commit: d9a8bae77826a0cc77df29d85883e914d0f0b4f3 Parents: af2effd Author: Paul Power paul.po...@peerside.com Authored: Mon Mar 2 13:08:47 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 13:09:35 2015 -0800 -- sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d9a8bae7/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 060ab5e..f3aac08 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -337,11 +337,11 @@ class DataFrame protected[sql]( * {{{ * // Scala: * import org.apache.spark.sql.functions._ - * df1.join(df2, "outer", $"df1Key" === $"df2Key") + * df1.join(df2, $"df1Key" === $"df2Key", "outer") * * // Java: * import static org.apache.spark.sql.functions.*; - * df1.join(df2, "outer", col("df1Key") === col("df2Key")); + * df1.join(df2, col("df1Key").equalTo(col("df2Key")), "outer"); * }}} * * @param right Right side of the join. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
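For completeness, here is a runnable sketch of the corrected call order on Spark 1.3-era APIs: the join condition is the second argument and the join type string is the third. The data, column names, and local-mode setup are illustrative and not part of the commit.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

// Illustrative setup; any existing SparkContext/SQLContext works the same way.
val sc = new SparkContext(new SparkConf().setAppName("join-order").setMaster("local[*]"))
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._

// Tiny invented datasets, just to have something to join.
val df1 = sc.parallelize(Seq((1, "a"), (2, "b"))).toDF("df1Key", "v1")
val df2 = sc.parallelize(Seq((1, "x"), (3, "y"))).toDF("df2Key", "v2")

// Correct ordering: condition second, join type ("outer") third.
val joined = df1.join(df2, $"df1Key" === $"df2Key", "outer")
joined.show()
```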
spark git commit: [SPARK-6040][SQL] Fix the percent bug in tablesample
Repository: spark Updated Branches: refs/heads/branch-1.3 f92876a54 - a83b9bbb2 [SPARK-6040][SQL] Fix the percent bug in tablesample HiveQL expression like `select count(1) from src tablesample(1 percent);` means take 1% sample to select. But it means 100% in the current version of the Spark. Author: q00251598 qiyad...@huawei.com Closes #4789 from watermen/SPARK-6040 and squashes the following commits: 2453ebe [q00251598] check and adjust the fraction. (cherry picked from commit 582e5a24c55e8c876733537c9910001affc8b29b) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a83b9bbb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a83b9bbb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a83b9bbb Branch: refs/heads/branch-1.3 Commit: a83b9bbb242ff6dedf261b8838add5a391d5bd36 Parents: f92876a Author: q00251598 qiyad...@huawei.com Authored: Mon Mar 2 13:16:29 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 13:16:40 2015 -0800 -- .../main/scala/org/apache/spark/sql/hive/HiveQl.scala| 11 ++- .../apache/spark/sql/hive/execution/HiveQuerySuite.scala | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a83b9bbb/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 98263f6..ced99cd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -40,6 +40,7 @@ import org.apache.spark.sql.execution.ExplainCommand import org.apache.spark.sql.sources.DescribeCommand import org.apache.spark.sql.hive.execution.{HiveNativeCommand, DropTable, AnalyzeTable, HiveScriptIOSchema} import org.apache.spark.sql.types._ +import org.apache.spark.util.random.RandomSampler /* Implicit conversions */ import scala.collection.JavaConversions._ @@ -850,7 +851,15 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C case Token(TOK_TABLESPLITSAMPLE, Token(TOK_PERCENT, Nil) :: Token(fraction, Nil) :: Nil) = - Sample(fraction.toDouble, withReplacement = false, (math.random * 1000).toInt, relation) + // The range of fraction accepted by Sample is [0, 1]. Because Hive's block sampling + // function takes X PERCENT as the input and the range of X is [0, 100], we need to + // adjust the fraction. 
+ require( +fraction.toDouble = (0.0 - RandomSampler.roundingEpsilon) + fraction.toDouble = (100.0 + RandomSampler.roundingEpsilon), +sSampling fraction ($fraction) must be on interval [0, 100]) + Sample(fraction.toDouble / 100, withReplacement = false, (math.random * 1000).toInt, +relation) case Token(TOK_TABLEBUCKETSAMPLE, Token(numerator, Nil) :: Token(denominator, Nil) :: Nil) = http://git-wip-us.apache.org/repos/asf/spark/blob/a83b9bbb/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index bb0a67d..c0d21bc 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -467,6 +467,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { test(sampling) { sql(SELECT * FROM src TABLESAMPLE(0.1 PERCENT) s) +sql(SELECT * FROM src TABLESAMPLE(100 PERCENT) s) } test(DataFrame toString) { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
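The essence of the fix is a unit conversion plus a range check, as the new require() shows: Hive's TABLESAMPLE(x PERCENT) takes x in [0, 100], while the catalyst Sample operator expects a fraction in [0, 1]. Below is a minimal standalone sketch; the epsilon constant is illustrative, whereas the patch reuses RandomSampler.roundingEpsilon.

```scala
// Illustrative epsilon; Spark reuses RandomSampler.roundingEpsilon here.
val roundingEpsilon = 1e-6

def percentToFraction(percent: Double): Double = {
  require(
    percent >= 0.0 - roundingEpsilon && percent <= 100.0 + roundingEpsilon,
    s"Sampling fraction ($percent) must be on interval [0, 100]")
  percent / 100.0
}

// percentToFraction(1.0) == 0.01, so "TABLESAMPLE(1 PERCENT)" now samples
// roughly 1% of the table instead of all of it.
```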
spark git commit: [SPARK-6066] Make event log format easier to parse
Repository: spark Updated Branches: refs/heads/master 1a49496b4 - 6776cb33e [SPARK-6066] Make event log format easier to parse Some users have reported difficulty in parsing the new event log format. Since we embed the metadata in the beginning of the file, when we compress the event log we need to skip the metadata because we need that information to parse the log later. This means we'll end up with a partially compressed file if event logging compression is turned on. The old format looks like: ``` sparkVersion = 1.3.0 compressionCodec = org.apache.spark.io.LZFCompressionCodec === LOG_HEADER_END === // actual events, could be compressed bytes ``` The new format in this patch puts the compression codec in the log file name instead. It also removes the metadata header altogether along with the Spark version, which was not needed. The new file name looks something like: ``` app_without_compression app_123.lzf app_456.snappy ``` I tested this with and without compression, using different compression codecs and event logging directories. I verified that both the `Master` and the `HistoryServer` can render both compressed and uncompressed logs as before. Author: Andrew Or and...@databricks.com Closes #4821 from andrewor14/event-log-format and squashes the following commits: 8511141 [Andrew Or] Fix test 654883d [Andrew Or] Add back metadata with Spark version 7f537cd [Andrew Or] Address review feedback 7d6aa61 [Andrew Or] Make codec an extension 59abee9 [Andrew Or] Merge branch 'master' of github.com:apache/spark into event-log-format 27c9a6c [Andrew Or] Address review feedback 519e51a [Andrew Or] Address review feedback ef69276 [Andrew Or] Merge branch 'master' of github.com:apache/spark into event-log-format 88a091d [Andrew Or] Add tests for new format and file name f32d8d2 [Andrew Or] Fix tests 8db5a06 [Andrew Or] Embed metadata in the event log file name instead Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6776cb33 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6776cb33 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6776cb33 Branch: refs/heads/master Commit: 6776cb33ea691f7843b956b3e80979282967e826 Parents: 1a49496 Author: Andrew Or and...@databricks.com Authored: Mon Mar 2 16:34:32 2015 -0800 Committer: Patrick Wendell patr...@databricks.com Committed: Mon Mar 2 16:34:32 2015 -0800 -- .../scala/org/apache/spark/SparkContext.scala | 9 ++ .../spark/deploy/ApplicationDescription.scala | 10 +- .../deploy/history/FsHistoryProvider.scala | 22 +-- .../org/apache/spark/deploy/master/Master.scala | 8 +- .../org/apache/spark/io/CompressionCodec.scala | 21 ++- .../spark/scheduler/EventLoggingListener.scala | 162 +++ .../spark/scheduler/ReplayListenerBus.scala | 3 +- .../apache/spark/scheduler/SparkListener.scala | 5 + .../spark/scheduler/SparkListenerBus.scala | 1 + .../cluster/SparkDeploySchedulerBackend.scala | 2 +- .../org/apache/spark/util/JsonProtocol.scala| 14 ++ .../deploy/history/FsHistoryProviderSuite.scala | 69 +--- .../scheduler/EventLoggingListenerSuite.scala | 62 --- .../spark/scheduler/ReplayListenerSuite.scala | 13 +- 14 files changed, 212 insertions(+), 189 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6776cb33/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 3cd0c21..e231e83 100644 --- 
a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -51,6 +51,7 @@ import org.apache.spark.deploy.{LocalSparkCluster, SparkHadoopUtil} import org.apache.spark.executor.TriggerThreadDump import org.apache.spark.input.{StreamInputFormat, PortableDataStream, WholeTextFileInputFormat, FixedLengthBinaryInputFormat} +import org.apache.spark.io.CompressionCodec import org.apache.spark.partial.{ApproximateEvaluator, PartialResult} import org.apache.spark.rdd._ import org.apache.spark.scheduler._ @@ -233,6 +234,14 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli None } } + private[spark] val eventLogCodec: Option[String] = { +val compress = conf.getBoolean(spark.eventLog.compress, false) +if (compress isEventLogEnabled) { + Some(CompressionCodec.getCodecName(conf)).map(CompressionCodec.getShortName) +} else { + None +} + } // Generate the random name for a temp folder in Tachyon // Add a timestamp as the suffix here to make it more safe
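Since the codec now lives in the file name, a reader only needs the suffix to pick a decompressor. The helper below is an assumed illustration of that idea (codecFromLogName is not a Spark API), using the example names from the commit message.

```scala
// Known short codec names that may appear as a file-name suffix (assumed list).
val knownCodecs = Set("lzf", "snappy")

// Hypothetical helper: recover the codec from an event log file name.
def codecFromLogName(fileName: String): Option[String] = {
  val dot = fileName.lastIndexOf('.')
  if (dot >= 0 && knownCodecs.contains(fileName.substring(dot + 1))) {
    Some(fileName.substring(dot + 1))   // e.g. "app_123.lzf" -> Some("lzf")
  } else {
    None                                // e.g. "app_without_compression"
  }
}
```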
spark git commit: [SPARK-6066] Make event log format easier to parse
Repository: spark Updated Branches: refs/heads/branch-1.3 866f2814a - 8100b79c2 [SPARK-6066] Make event log format easier to parse Some users have reported difficulty in parsing the new event log format. Since we embed the metadata in the beginning of the file, when we compress the event log we need to skip the metadata because we need that information to parse the log later. This means we'll end up with a partially compressed file if event logging compression is turned on. The old format looks like: ``` sparkVersion = 1.3.0 compressionCodec = org.apache.spark.io.LZFCompressionCodec === LOG_HEADER_END === // actual events, could be compressed bytes ``` The new format in this patch puts the compression codec in the log file name instead. It also removes the metadata header altogether along with the Spark version, which was not needed. The new file name looks something like: ``` app_without_compression app_123.lzf app_456.snappy ``` I tested this with and without compression, using different compression codecs and event logging directories. I verified that both the `Master` and the `HistoryServer` can render both compressed and uncompressed logs as before. Author: Andrew Or and...@databricks.com Closes #4821 from andrewor14/event-log-format and squashes the following commits: 8511141 [Andrew Or] Fix test 654883d [Andrew Or] Add back metadata with Spark version 7f537cd [Andrew Or] Address review feedback 7d6aa61 [Andrew Or] Make codec an extension 59abee9 [Andrew Or] Merge branch 'master' of github.com:apache/spark into event-log-format 27c9a6c [Andrew Or] Address review feedback 519e51a [Andrew Or] Address review feedback ef69276 [Andrew Or] Merge branch 'master' of github.com:apache/spark into event-log-format 88a091d [Andrew Or] Add tests for new format and file name f32d8d2 [Andrew Or] Fix tests 8db5a06 [Andrew Or] Embed metadata in the event log file name instead (cherry picked from commit 6776cb33ea691f7843b956b3e80979282967e826) Signed-off-by: Patrick Wendell patr...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8100b79c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8100b79c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8100b79c Branch: refs/heads/branch-1.3 Commit: 8100b79c292349aaeefe7ff9545afb9e526c2bff Parents: 866f281 Author: Andrew Or and...@databricks.com Authored: Mon Mar 2 16:34:32 2015 -0800 Committer: Patrick Wendell patr...@databricks.com Committed: Mon Mar 2 16:34:39 2015 -0800 -- .../scala/org/apache/spark/SparkContext.scala | 9 ++ .../spark/deploy/ApplicationDescription.scala | 10 +- .../deploy/history/FsHistoryProvider.scala | 22 +-- .../org/apache/spark/deploy/master/Master.scala | 8 +- .../org/apache/spark/io/CompressionCodec.scala | 21 ++- .../spark/scheduler/EventLoggingListener.scala | 162 +++ .../spark/scheduler/ReplayListenerBus.scala | 3 +- .../apache/spark/scheduler/SparkListener.scala | 5 + .../spark/scheduler/SparkListenerBus.scala | 1 + .../cluster/SparkDeploySchedulerBackend.scala | 2 +- .../org/apache/spark/util/JsonProtocol.scala| 14 ++ .../deploy/history/FsHistoryProviderSuite.scala | 69 +--- .../scheduler/EventLoggingListenerSuite.scala | 62 --- .../spark/scheduler/ReplayListenerSuite.scala | 13 +- 14 files changed, 212 insertions(+), 189 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8100b79c/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala 
b/core/src/main/scala/org/apache/spark/SparkContext.scala index d59b466..05c3210 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -51,6 +51,7 @@ import org.apache.spark.deploy.{LocalSparkCluster, SparkHadoopUtil} import org.apache.spark.executor.TriggerThreadDump import org.apache.spark.input.{StreamInputFormat, PortableDataStream, WholeTextFileInputFormat, FixedLengthBinaryInputFormat} +import org.apache.spark.io.CompressionCodec import org.apache.spark.partial.{ApproximateEvaluator, PartialResult} import org.apache.spark.rdd._ import org.apache.spark.scheduler._ @@ -233,6 +234,14 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli None } } + private[spark] val eventLogCodec: Option[String] = { +val compress = conf.getBoolean(spark.eventLog.compress, false) +if (compress isEventLogEnabled) { + Some(CompressionCodec.getCodecName(conf)).map(CompressionCodec.getShortName) +} else { + None +} + } // Generate the random name for a temp folder
spark git commit: [SPARK-6121][SQL][MLLIB] simpleString for UDT
Repository: spark Updated Branches: refs/heads/master e3a88d110 - 2db6a853a [SPARK-6121][SQL][MLLIB] simpleString for UDT `df.dtypes` shows `null` for UDTs. This PR uses `udt` by default and `VectorUDT` overwrites it with `vector`. jkbradley davies Author: Xiangrui Meng m...@databricks.com Closes #4858 from mengxr/SPARK-6121 and squashes the following commits: 34f0a77 [Xiangrui Meng] simpleString for UDT Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2db6a853 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2db6a853 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2db6a853 Branch: refs/heads/master Commit: 2db6a853a53b4c25e35983bc489510abb8a73e1d Parents: e3a88d1 Author: Xiangrui Meng m...@databricks.com Authored: Mon Mar 2 17:14:34 2015 -0800 Committer: Xiangrui Meng m...@databricks.com Committed: Mon Mar 2 17:14:34 2015 -0800 -- python/pyspark/mllib/linalg.py | 3 +++ python/pyspark/sql/types.py| 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2db6a853/python/pyspark/mllib/linalg.py -- diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py index 597012b..f5aad28 100644 --- a/python/pyspark/mllib/linalg.py +++ b/python/pyspark/mllib/linalg.py @@ -152,6 +152,9 @@ class VectorUDT(UserDefinedType): else: raise ValueError(do not recognize type %r % tpe) +def simpleString(self): +return vector + class Vector(object): http://git-wip-us.apache.org/repos/asf/spark/blob/2db6a853/python/pyspark/sql/types.py -- diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 31a861e..0169028 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -468,7 +468,7 @@ class UserDefinedType(DataType): raise NotImplementedError(UDT must implement deserialize().) def simpleString(self): -return 'null' +return 'udt' def json(self): return json.dumps(self.jsonValue(), separators=(',', ':'), sort_keys=True) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-6121][SQL][MLLIB] simpleString for UDT
Repository: spark Updated Branches: refs/heads/branch-1.3 ea69cf28e - 1b8ab5752 [SPARK-6121][SQL][MLLIB] simpleString for UDT `df.dtypes` shows `null` for UDTs. This PR uses `udt` by default and `VectorUDT` overwrites it with `vector`. jkbradley davies Author: Xiangrui Meng m...@databricks.com Closes #4858 from mengxr/SPARK-6121 and squashes the following commits: 34f0a77 [Xiangrui Meng] simpleString for UDT (cherry picked from commit 2db6a853a53b4c25e35983bc489510abb8a73e1d) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1b8ab575 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1b8ab575 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1b8ab575 Branch: refs/heads/branch-1.3 Commit: 1b8ab5752fccbc08c3f76c50bc384b89231d0a78 Parents: ea69cf2 Author: Xiangrui Meng m...@databricks.com Authored: Mon Mar 2 17:14:34 2015 -0800 Committer: Xiangrui Meng m...@databricks.com Committed: Mon Mar 2 17:14:43 2015 -0800 -- python/pyspark/mllib/linalg.py | 3 +++ python/pyspark/sql/types.py| 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1b8ab575/python/pyspark/mllib/linalg.py -- diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py index 597012b..f5aad28 100644 --- a/python/pyspark/mllib/linalg.py +++ b/python/pyspark/mllib/linalg.py @@ -152,6 +152,9 @@ class VectorUDT(UserDefinedType): else: raise ValueError(do not recognize type %r % tpe) +def simpleString(self): +return vector + class Vector(object): http://git-wip-us.apache.org/repos/asf/spark/blob/1b8ab575/python/pyspark/sql/types.py -- diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 31a861e..0169028 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -468,7 +468,7 @@ class UserDefinedType(DataType): raise NotImplementedError(UDT must implement deserialize().) def simpleString(self): -return 'null' +return 'udt' def json(self): return json.dumps(self.jsonValue(), separators=(',', ':'), sort_keys=True) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-5537] Add user guide for multinomial logistic regression
Repository: spark Updated Branches: refs/heads/branch-1.3 1b8ab5752 - 11389f026 [SPARK-5537] Add user guide for multinomial logistic regression This is based on #4801 from dbtsai. The linear method guide is re-organized a little bit for this change. Closes #4801 Author: Xiangrui Meng m...@databricks.com Author: DB Tsai dbt...@alpinenow.com Closes #4861 from mengxr/SPARK-5537 and squashes the following commits: 47af0ac [Xiangrui Meng] update user guide for multinomial logistic regression cdc2e15 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into AlpineNow-mlor-doc 096d0ca [DB Tsai] first commit (cherry picked from commit 9d6c5aeebd3c7f8ff6defe3bccd8ff12ed918293) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/11389f02 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/11389f02 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/11389f02 Branch: refs/heads/branch-1.3 Commit: 11389f0262b1a755effc8cbd247aa199c0d8fd9d Parents: 1b8ab57 Author: Xiangrui Meng m...@databricks.com Authored: Mon Mar 2 18:10:50 2015 -0800 Committer: Xiangrui Meng m...@databricks.com Committed: Mon Mar 2 18:10:56 2015 -0800 -- docs/mllib-linear-methods.md | 278 +- 1 file changed, 217 insertions(+), 61 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/11389f02/docs/mllib-linear-methods.md -- diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md index ffbd7ef..03f90d7 100644 --- a/docs/mllib-linear-methods.md +++ b/docs/mllib-linear-methods.md @@ -17,7 +17,7 @@ displayTitle: a href=mllib-guide.htmlMLlib/a - Linear Methods \newcommand{\av}{\mathbf{\alpha}} \newcommand{\bv}{\mathbf{b}} \newcommand{\N}{\mathbb{N}} -\newcommand{\id}{\mathbf{I}} +\newcommand{\id}{\mathbf{I}} \newcommand{\ind}{\mathbf{1}} \newcommand{\0}{\mathbf{0}} \newcommand{\unit}{\mathbf{e}} @@ -114,18 +114,26 @@ especially when the number of training examples is small. Under the hood, linear methods use convex optimization methods to optimize the objective functions. MLlib uses two methods, SGD and L-BFGS, described in the [optimization section](mllib-optimization.html). Currently, most algorithm APIs support Stochastic Gradient Descent (SGD), and a few support L-BFGS. Refer to [this optimization section](mllib-optimization.html#Choosing-an-Optimization-Method) for guidelines on choosing between optimization methods. -## Binary classification - -[Binary classification](http://en.wikipedia.org/wiki/Binary_classification) -aims to divide items into two categories: positive and negative. MLlib -supports two linear methods for binary classification: linear Support Vector -Machines (SVMs) and logistic regression. For both methods, MLlib supports -L1 and L2 regularized variants. The training data set is represented by an RDD -of [LabeledPoint](mllib-data-types.html) in MLlib. Note that, in the -mathematical formulation in this guide, a training label $y$ is denoted as -either $+1$ (positive) or $-1$ (negative), which is convenient for the -formulation. *However*, the negative label is represented by $0$ in MLlib -instead of $-1$, to be consistent with multiclass labeling. +## Classification + +[Classification](http://en.wikipedia.org/wiki/Statistical_classification) aims to divide items into +categories. 
+The most common classification type is +[binary classificaion](http://en.wikipedia.org/wiki/Binary_classification), where there are two +categories, usually named positive and negative. +If there are more than two categories, it is called +[multiclass classification](http://en.wikipedia.org/wiki/Multiclass_classification). +MLlib supports two linear methods for classification: linear Support Vector Machines (SVMs) +and logistic regression. +Linear SVMs supports only binary classification, while logistic regression supports both binary and +multiclass classification problems. +For both methods, MLlib supports L1 and L2 regularized variants. +The training data set is represented by an RDD of [LabeledPoint](mllib-data-types.html) in MLlib, +where labels are class indices starting from zero: $0, 1, 2, \ldots$. +Note that, in the mathematical formulation in this guide, a binary label $y$ is denoted as either +$+1$ (positive) or $-1$ (negative), which is convenient for the formulation. +*However*, the negative label is represented by $0$ in MLlib instead of $-1$, to be consistent with +multiclass labeling. ### Linear Support Vector Machines (SVMs) @@ -144,41 +152,7 @@ denoted by $\x$, the model makes predictions based on the value of $\wv^T \x$. By the default, if $\wv^T \x \geq 0$ then the outcome is
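As a pointer to the API the new guide section covers, here is a hedged sketch of multiclass logistic regression with L-BFGS in MLlib. The tiny inline dataset is invented, labels are zero-based class indices as the guide states, and `sc` is assumed to be an existing SparkContext.

```scala
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

// `sc` is an existing SparkContext (assumed); the data below is a toy
// three-class training set with labels 0.0, 1.0, 2.0.
val training = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(0.0, 1.1)),
  LabeledPoint(1.0, Vectors.dense(2.0, 1.0)),
  LabeledPoint(2.0, Vectors.dense(3.0, 0.5))))

val model = new LogisticRegressionWithLBFGS()
  .setNumClasses(3)
  .run(training)

val prediction = model.predict(Vectors.dense(2.1, 1.0))
```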
spark git commit: [SPARK-6127][Streaming][Docs] Add Kafka to Python api docs
Repository: spark Updated Branches: refs/heads/master 9d6c5aeeb - 9eb22ece1 [SPARK-6127][Streaming][Docs] Add Kafka to Python api docs davies Author: Tathagata Das tathagata.das1...@gmail.com Closes #4860 from tdas/SPARK-6127 and squashes the following commits: 82de92a [Tathagata Das] Add Kafka to Python api docs Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9eb22ece Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9eb22ece Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9eb22ece Branch: refs/heads/master Commit: 9eb22ece115c69899d100cecb8a5e20b3a268649 Parents: 9d6c5ae Author: Tathagata Das tathagata.das1...@gmail.com Authored: Mon Mar 2 18:40:46 2015 -0800 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Mar 2 18:40:46 2015 -0800 -- python/docs/pyspark.streaming.rst | 7 +++ 1 file changed, 7 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9eb22ece/python/docs/pyspark.streaming.rst -- diff --git a/python/docs/pyspark.streaming.rst b/python/docs/pyspark.streaming.rst index f081856..7890d9d 100644 --- a/python/docs/pyspark.streaming.rst +++ b/python/docs/pyspark.streaming.rst @@ -8,3 +8,10 @@ Module contents :members: :undoc-members: :show-inheritance: + +pyspark.streaming.kafka module + +.. automodule:: pyspark.streaming.kafka +:members: +:undoc-members: +:show-inheritance: - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-6114][SQL] Avoid metastore conversions before plan is resolved
Repository: spark Updated Branches: refs/heads/master 26c1c56de - 8223ce6a8 [SPARK-6114][SQL] Avoid metastore conversions before plan is resolved Author: Michael Armbrust mich...@databricks.com Closes #4855 from marmbrus/explodeBug and squashes the following commits: a712249 [Michael Armbrust] [SPARK-6114][SQL] Avoid metastore conversions before plan is resolved Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8223ce6a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8223ce6a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8223ce6a Branch: refs/heads/master Commit: 8223ce6a81e4cc9fdf816892365fcdff4006c35e Parents: 26c1c56 Author: Michael Armbrust mich...@databricks.com Authored: Mon Mar 2 16:10:54 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 16:10:54 2015 -0800 -- .../org/apache/spark/sql/hive/HiveMetastoreCatalog.scala | 4 .../apache/spark/sql/hive/execution/SQLQuerySuite.scala | 10 ++ 2 files changed, 14 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8223ce6a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index d3ad364..74b4e76 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -444,6 +444,10 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with */ object ParquetConversions extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = { + if (!plan.resolved) { +return plan + } + // Collects all `MetastoreRelation`s which should be replaced val toBeReplaced = plan.collect { // Write path http://git-wip-us.apache.org/repos/asf/spark/blob/8223ce6a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index f2bc73b..22ea19b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -31,6 +31,9 @@ case class Nested1(f1: Nested2) case class Nested2(f2: Nested3) case class Nested3(f3: Int) +case class NestedArray2(b: Seq[Int]) +case class NestedArray1(a: NestedArray2) + /** * A collection of hive query tests where we generate the answers ourselves instead of depending on * Hive to generate them (in contrast to HiveQuerySuite). Often this is because the query is @@ -38,6 +41,13 @@ case class Nested3(f3: Int) */ class SQLQuerySuite extends QueryTest { + test(explode nested Field) { + Seq(NestedArray1(NestedArray2(Seq(1,2,3.toDF.registerTempTable(nestedArray) +checkAnswer( + sql(SELECT ints FROM nestedArray LATERAL VIEW explode(a.b) a AS ints), + Row(1) :: Row(2) :: Row(3) :: Nil) + } + test(SPARK-4512 Fix attribute reference resolution error when using SORT BY) { checkAnswer( sql(SELECT * FROM (SELECT key + key AS a FROM src SORT BY value) t ORDER BY t.a), - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
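The one-line fix follows a common analyzer-rule pattern: bail out before the plan is resolved so a relation-rewriting rule cannot fire on attributes that are not bound yet (the LATERAL VIEW explode test above is exactly such a case). Below is a generic, simplified sketch of that guard with stand-in types, not catalyst's own classes.

```scala
// Simplified stand-ins for a logical plan and a rewriting rule.
trait Plan { def resolved: Boolean }

def applyConversion(plan: Plan)(rewrite: Plan => Plan): Plan =
  if (!plan.resolved) plan   // same early return the patch adds to ParquetConversions
  else rewrite(plan)
```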
spark git commit: [SPARK-6048] SparkConf should not translate deprecated configs on set
Repository: spark Updated Branches: refs/heads/master 6776cb33e - 258d154c9 [SPARK-6048] SparkConf should not translate deprecated configs on set There are multiple issues with translating on set outlined in the JIRA. This PR reverts the translation logic added to `SparkConf`. In the future, after the 1.3.0 release we will figure out a way to reorganize the internal structure more elegantly. For now, let's preserve the existing semantics of `SparkConf` since it's a public interface. Unfortunately this means duplicating some code for now, but this is all internal and we can always clean it up later. Author: Andrew Or and...@databricks.com Closes #4799 from andrewor14/conf-set-translate and squashes the following commits: 11c525b [Andrew Or] Move warning to driver 10e77b5 [Andrew Or] Add documentation for deprecation precedence a369cb1 [Andrew Or] Merge branch 'master' of github.com:apache/spark into conf-set-translate c26a9e3 [Andrew Or] Revert all translate logic in SparkConf fef6c9c [Andrew Or] Restore deprecation logic for spark.executor.userClassPathFirst 94b4dfa [Andrew Or] Translate on get, not set Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/258d154c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/258d154c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/258d154c Branch: refs/heads/master Commit: 258d154c9f1afdd52dce19f03d81683ee34effac Parents: 6776cb3 Author: Andrew Or and...@databricks.com Authored: Mon Mar 2 16:36:42 2015 -0800 Committer: Patrick Wendell patr...@databricks.com Committed: Mon Mar 2 16:36:42 2015 -0800 -- core/src/main/scala/org/apache/spark/SparkConf.scala | 15 +++ .../scala/org/apache/spark/executor/Executor.scala | 13 + .../test/scala/org/apache/spark/SparkConfSuite.scala | 12 docs/configuration.md| 4 +++- .../scala/org/apache/spark/deploy/yarn/Client.scala | 3 ++- 5 files changed, 25 insertions(+), 22 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/258d154c/core/src/main/scala/org/apache/spark/SparkConf.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 61b34d5..2ca19f5 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -68,7 +68,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { if (value == null) { throw new NullPointerException(null value for + key) } -settings.put(translateConfKey(key, warn = true), value) +settings.put(key, value) this } @@ -140,7 +140,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { /** Set a parameter if it isn't already configured */ def setIfMissing(key: String, value: String): SparkConf = { -settings.putIfAbsent(translateConfKey(key, warn = true), value) +settings.putIfAbsent(key, value) this } @@ -176,7 +176,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { /** Get a parameter as an Option */ def getOption(key: String): Option[String] = { -Option(settings.get(translateConfKey(key))) +Option(settings.get(key)) } /** Get all parameters as a list of pairs */ @@ -229,7 +229,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { def getAppId: String = get(spark.app.id) /** Does the configuration contain a given parameter? 
*/ - def contains(key: String): Boolean = settings.containsKey(translateConfKey(key)) + def contains(key: String): Boolean = settings.containsKey(key) /** Copy this object */ override def clone: SparkConf = { @@ -343,6 +343,13 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { } } } + +// Warn against the use of deprecated configs +deprecatedConfigs.values.foreach { dc => + if (contains(dc.oldName)) { +dc.warn() + } +} } /** http://git-wip-us.apache.org/repos/asf/spark/blob/258d154c/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index b684fb7..bed0a08 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -92,6 +92,12 @@ private[spark] class Executor( private val executorActor = env.actorSystem.actorOf( Props(new ExecutorActor(executorId)), "ExecutorActor") + //
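The public contract this revert preserves is that SparkConf.set stores exactly the key it is given, and contains/get look up that same key verbatim; deprecated names are only warned about later on the driver instead of being rewritten at set time. A small sketch of that behaviour (treating spark.files.userClassPathFirst as the example old-style key is an assumption for illustration):

import org.apache.spark.SparkConf

// set() keeps the key verbatim instead of translating it, so existing lookups keep working.
val conf = new SparkConf(loadDefaults = false)
conf.set("spark.files.userClassPathFirst", "true")

assert(conf.contains("spark.files.userClassPathFirst"))       // stored under the name that was set
assert(conf.get("spark.files.userClassPathFirst") == "true")
// Any deprecation warning is logged later, when the settings are validated on the driver.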
spark git commit: [SPARK-6048] SparkConf should not translate deprecated configs on set
Repository: spark Updated Branches: refs/heads/branch-1.3 8100b79c2 - ea69cf28e [SPARK-6048] SparkConf should not translate deprecated configs on set There are multiple issues with translating on set outlined in the JIRA. This PR reverts the translation logic added to `SparkConf`. In the future, after the 1.3.0 release we will figure out a way to reorganize the internal structure more elegantly. For now, let's preserve the existing semantics of `SparkConf` since it's a public interface. Unfortunately this means duplicating some code for now, but this is all internal and we can always clean it up later. Author: Andrew Or and...@databricks.com Closes #4799 from andrewor14/conf-set-translate and squashes the following commits: 11c525b [Andrew Or] Move warning to driver 10e77b5 [Andrew Or] Add documentation for deprecation precedence a369cb1 [Andrew Or] Merge branch 'master' of github.com:apache/spark into conf-set-translate c26a9e3 [Andrew Or] Revert all translate logic in SparkConf fef6c9c [Andrew Or] Restore deprecation logic for spark.executor.userClassPathFirst 94b4dfa [Andrew Or] Translate on get, not set (cherry picked from commit 258d154c9f1afdd52dce19f03d81683ee34effac) Signed-off-by: Patrick Wendell patr...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ea69cf28 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ea69cf28 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ea69cf28 Branch: refs/heads/branch-1.3 Commit: ea69cf28e6874d205fca70872a637547407bc08b Parents: 8100b79 Author: Andrew Or and...@databricks.com Authored: Mon Mar 2 16:36:42 2015 -0800 Committer: Patrick Wendell patr...@databricks.com Committed: Mon Mar 2 16:36:50 2015 -0800 -- core/src/main/scala/org/apache/spark/SparkConf.scala | 15 +++ .../scala/org/apache/spark/executor/Executor.scala | 13 + .../test/scala/org/apache/spark/SparkConfSuite.scala | 12 docs/configuration.md| 4 +++- .../scala/org/apache/spark/deploy/yarn/Client.scala | 3 ++- 5 files changed, 25 insertions(+), 22 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ea69cf28/core/src/main/scala/org/apache/spark/SparkConf.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 0dbd261..f3de467 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -68,7 +68,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { if (value == null) { throw new NullPointerException(null value for + key) } -settings.put(translateConfKey(key, warn = true), value) +settings.put(key, value) this } @@ -140,7 +140,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { /** Set a parameter if it isn't already configured */ def setIfMissing(key: String, value: String): SparkConf = { -settings.putIfAbsent(translateConfKey(key, warn = true), value) +settings.putIfAbsent(key, value) this } @@ -176,7 +176,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { /** Get a parameter as an Option */ def getOption(key: String): Option[String] = { -Option(settings.get(translateConfKey(key))) +Option(settings.get(key)) } /** Get all parameters as a list of pairs */ @@ -229,7 +229,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { def getAppId: String = get(spark.app.id) /** Does the configuration contain a given parameter? 
*/ - def contains(key: String): Boolean = settings.containsKey(translateConfKey(key)) + def contains(key: String): Boolean = settings.containsKey(key) /** Copy this object */ override def clone: SparkConf = { @@ -343,6 +343,13 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { } } } + +// Warn against the use of deprecated configs +deprecatedConfigs.values.foreach { dc => + if (contains(dc.oldName)) { +dc.warn() + } +} } /** http://git-wip-us.apache.org/repos/asf/spark/blob/ea69cf28/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index b684fb7..bed0a08 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -92,6 +92,12 @@ private[spark] class
spark git commit: [SPARK-6127][Streaming][Docs] Add Kafka to Python api docs
Repository: spark Updated Branches: refs/heads/branch-1.3 11389f026 - ffd059109 [SPARK-6127][Streaming][Docs] Add Kafka to Python api docs davies Author: Tathagata Das tathagata.das1...@gmail.com Closes #4860 from tdas/SPARK-6127 and squashes the following commits: 82de92a [Tathagata Das] Add Kafka to Python api docs (cherry picked from commit 9eb22ece115c69899d100cecb8a5e20b3a268649) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ffd05910 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ffd05910 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ffd05910 Branch: refs/heads/branch-1.3 Commit: ffd0591094a1a3edafed5c5f3ff1ca1b6048bf46 Parents: 11389f0 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Mon Mar 2 18:40:46 2015 -0800 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Mar 2 18:40:57 2015 -0800 -- python/docs/pyspark.streaming.rst | 7 +++ 1 file changed, 7 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ffd05910/python/docs/pyspark.streaming.rst -- diff --git a/python/docs/pyspark.streaming.rst b/python/docs/pyspark.streaming.rst index f081856..7890d9d 100644 --- a/python/docs/pyspark.streaming.rst +++ b/python/docs/pyspark.streaming.rst @@ -8,3 +8,10 @@ Module contents :members: :undoc-members: :show-inheritance: + +pyspark.streaming.kafka module + +.. automodule:: pyspark.streaming.kafka +:members: +:undoc-members: +:show-inheritance: - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-6082] [SQL] Provides better error message for malformed rows when caching tables
Repository: spark Updated Branches: refs/heads/master 8223ce6a8 - 1a49496b4 [SPARK-6082] [SQL] Provides better error message for malformed rows when caching tables Constructs like Hive `TRANSFORM` may generate malformed rows (via badly authored external scripts for example). I'm a bit hesitant to have this feature, since it introduces per-tuple cost when caching tables. However, considering caching tables is usually a one-time cost, this is probably worth having. !-- Reviewable:start -- [img src=https://reviewable.io/review_button.png; height=40 alt=Review on Reviewable/](https://reviewable.io/reviews/apache/spark/4842) !-- Reviewable:end -- Author: Cheng Lian l...@databricks.com Closes #4842 from liancheng/spark-6082 and squashes the following commits: b05dbff [Cheng Lian] Provides better error message for malformed rows when caching tables Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1a49496b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1a49496b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1a49496b Branch: refs/heads/master Commit: 1a49496b4a9df40c74739fc0fb8a21c88a477075 Parents: 8223ce6 Author: Cheng Lian l...@databricks.com Authored: Mon Mar 2 16:18:00 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 16:18:00 2015 -0800 -- .../spark/sql/columnar/InMemoryColumnarTableScan.scala | 11 +++ 1 file changed, 11 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1a49496b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala index 11d5943..8944a32 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala @@ -119,6 +119,17 @@ private[sql] case class InMemoryRelation( var rowCount = 0 while (rowIterator.hasNext rowCount batchSize) { val row = rowIterator.next() + +// Added for SPARK-6082. This assertion can be useful for scenarios when something +// like Hive TRANSFORM is used. The external data generation script used in TRANSFORM +// may result malformed rows, causing ArrayIndexOutOfBoundsException, which is somewhat +// hard to decipher. +assert( + row.size == columnBuilders.size, + sRow column number mismatch, expected ${output.size} columns, but got ${row.size}. + |Row content: $row + .stripMargin) + var i = 0 while (i row.length) { columnBuilders(i).appendFrom(row, i) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
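The scenario behind the new assertion is easiest to see from the user side: a Hive TRANSFORM query whose external script emits the wrong number of columns, cached as an in-memory table. A hypothetical repro sketch, assuming a HiveContext named hiveCtx, a Hive table src, and a misbehaving script bad_script.py that prints fewer fields than the declared output schema:

// All names here are illustrative; the point is only where the new assertion fires.
val malformed = hiveCtx.sql(
  "SELECT TRANSFORM (key, value) USING 'bad_script.py' AS (k, v) FROM src")
malformed.registerTempTable("transformed")

hiveCtx.cacheTable("transformed")
// Building the columnar cache used to fail with a bare ArrayIndexOutOfBoundsException;
// with this patch the assertion reports the expected vs. actual column count and the row.
hiveCtx.table("transformed").count()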
spark git commit: [SPARK-6082] [SQL] Provides better error message for malformed rows when caching tables
Repository: spark Updated Branches: refs/heads/branch-1.3 3899c7c2c - 866f2814a [SPARK-6082] [SQL] Provides better error message for malformed rows when caching tables Constructs like Hive `TRANSFORM` may generate malformed rows (via badly authored external scripts for example). I'm a bit hesitant to have this feature, since it introduces per-tuple cost when caching tables. However, considering caching tables is usually a one-time cost, this is probably worth having. !-- Reviewable:start -- [img src=https://reviewable.io/review_button.png; height=40 alt=Review on Reviewable/](https://reviewable.io/reviews/apache/spark/4842) !-- Reviewable:end -- Author: Cheng Lian l...@databricks.com Closes #4842 from liancheng/spark-6082 and squashes the following commits: b05dbff [Cheng Lian] Provides better error message for malformed rows when caching tables (cherry picked from commit 1a49496b4a9df40c74739fc0fb8a21c88a477075) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/866f2814 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/866f2814 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/866f2814 Branch: refs/heads/branch-1.3 Commit: 866f2814a48a34820da9069378c2cbbb3589fb0f Parents: 3899c7c Author: Cheng Lian l...@databricks.com Authored: Mon Mar 2 16:18:00 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 16:18:10 2015 -0800 -- .../spark/sql/columnar/InMemoryColumnarTableScan.scala | 11 +++ 1 file changed, 11 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/866f2814/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala index 11d5943..8944a32 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala @@ -119,6 +119,17 @@ private[sql] case class InMemoryRelation( var rowCount = 0 while (rowIterator.hasNext rowCount batchSize) { val row = rowIterator.next() + +// Added for SPARK-6082. This assertion can be useful for scenarios when something +// like Hive TRANSFORM is used. The external data generation script used in TRANSFORM +// may result malformed rows, causing ArrayIndexOutOfBoundsException, which is somewhat +// hard to decipher. +assert( + row.size == columnBuilders.size, + sRow column number mismatch, expected ${output.size} columns, but got ${row.size}. + |Row content: $row + .stripMargin) + var i = 0 while (i row.length) { columnBuilders(i).appendFrom(row, i) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-5950][SQL]Insert array into a metastore table saved as parquet should work when using datasource api
Repository: spark Updated Branches: refs/heads/master 9eb22ece1 - 12599942e [SPARK-5950][SQL]Insert array into a metastore table saved as parquet should work when using datasource api This PR contains the following changes: 1. Add a new method, `DataType.equalsIgnoreCompatibleNullability`, which is the middle ground between DataType's equality check and `DataType.equalsIgnoreNullability`. For two data types `from` and `to`, it does `equalsIgnoreNullability` as well as if the nullability of `from` is compatible with that of `to`. For example, the nullability of `ArrayType(IntegerType, containsNull = false)` is compatible with that of `ArrayType(IntegerType, containsNull = true)` (for an array without null values, we can always say it may contain null values). However, the nullability of `ArrayType(IntegerType, containsNull = true)` is incompatible with that of `ArrayType(IntegerType, containsNull = false)` (for an array that may have null values, we cannot say it does not have null values). 2. For the `resolved` field of `InsertIntoTable`, use `equalsIgnoreCompatibleNullability` to replace the equality check of the data types. 3. For our data source write path, when appending data, we always use the schema of existing table to write the data. This is important for parquet, since nullability direct impacts the way to encode/decode values. If we do not do this, we may see corrupted values when reading values from a set of parquet files generated with different nullability settings. 4. When generating a new parquet table, we always set nullable/containsNull/valueContainsNull to true. So, we will not face situations that we cannot append data because containsNull/valueContainsNull in an Array/Map column of the existing table has already been set to `false`. This change makes the whole data pipeline more robust. 5. Update the equality check of JSON relation. Since JSON does not really cares nullability, `equalsIgnoreNullability` seems a better choice to compare schemata from to JSON tables. JIRA: https://issues.apache.org/jira/browse/SPARK-5950 Thanks viirya for the initial work in #4729. cc marmbrus liancheng Author: Yin Huai yh...@databricks.com Closes #4826 from yhuai/insertNullabilityCheck and squashes the following commits: 3b61a04 [Yin Huai] Revert change on equals. 80e487e [Yin Huai] asNullable in UDT. 587d88b [Yin Huai] Make methods private. 0cb7ea2 [Yin Huai] marmbrus's comments. 3cec464 [Yin Huai] Cheng's comments. 486ed08 [Yin Huai] Merge remote-tracking branch 'upstream/master' into insertNullabilityCheck d3747d1 [Yin Huai] Remove unnecessary change. 8360817 [Yin Huai] Merge remote-tracking branch 'upstream/master' into insertNullabilityCheck 8a3f237 [Yin Huai] Use equalsIgnoreNullability instead of equality check. 0eb5578 [Yin Huai] Fix tests. f6ed813 [Yin Huai] Update old parquet path. e4f397c [Yin Huai] Unit tests. b2c06f8 [Yin Huai] Ignore nullability in JSON relation's equality check. 8bd008b [Yin Huai] nullable, containsNull, and valueContainsNull will be always true for parquet data. bf50d73 [Yin Huai] When appending data, we use the schema of the existing table instead of the schema of the new data. 0a703e7 [Yin Huai] Test failed again since we cannot read correct content. 9a26611 [Yin Huai] Make InsertIntoTable happy. 8f19fe5 [Yin Huai] equalsIgnoreCompatibleNullability 4ec17fd [Yin Huai] Failed test. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/12599942 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/12599942 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/12599942 Branch: refs/heads/master Commit: 12599942e69e4d73040f3a8611661a0862514ffc Parents: 9eb22ec Author: Yin Huai yh...@databricks.com Authored: Mon Mar 2 19:31:55 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 19:31:55 2015 -0800 -- .../org/apache/spark/mllib/linalg/Vectors.scala | 2 + .../apache/spark/mllib/util/modelSaveLoad.scala | 2 +- .../catalyst/plans/logical/basicOperators.scala | 3 +- .../org/apache/spark/sql/types/dataTypes.scala | 97 +++- .../apache/spark/sql/types/DataTypeSuite.scala | 83 + .../apache/spark/sql/json/JSONRelation.scala| 4 +- .../spark/sql/parquet/ParquetRelation.scala | 9 +- .../sql/parquet/ParquetTableOperations.scala| 5 +- .../apache/spark/sql/parquet/newParquet.scala | 10 +- .../org/apache/spark/sql/sources/commands.scala | 7 +- .../org/apache/spark/sql/sources/rules.scala| 2 +- .../apache/spark/sql/test/ExamplePointUDT.scala | 2 + .../apache/spark/sql/UserDefinedTypeSuite.scala | 2 + .../spark/sql/hive/HiveMetastoreCatalog.scala | 5 +- .../spark/sql/hive/execution/commands.scala | 33 +++ .../sql/hive/MetastoreDataSourcesSuite.scala| 71 +-
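The compatibility rule from point 1 is small enough to restate directly. The real check, DataType.equalsIgnoreCompatibleNullability, is internal to Spark SQL, so the sketch below only re-derives the ArrayType case to make the direction of the check concrete:

import org.apache.spark.sql.types._

// "from" may be written where "to" is expected only if "from" never allows nulls
// in a position where "to" forbids them.
def arrayNullabilityCompatible(from: ArrayType, to: ArrayType): Boolean =
  from.elementType == to.elementType && (!from.containsNull || to.containsNull)

val noNulls   = ArrayType(IntegerType, containsNull = false)
val mayBeNull = ArrayType(IntegerType, containsNull = true)

assert(arrayNullabilityCompatible(noNulls, mayBeNull))   // ok: widening to "may contain null"
assert(!arrayNullabilityCompatible(mayBeNull, noNulls))  // not ok: cannot promise "no nulls"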
spark git commit: [SPARK-5950][SQL]Insert array into a metastore table saved as parquet should work when using datasource api
Repository: spark Updated Branches: refs/heads/branch-1.3 ffd059109 - 1b490e91f [SPARK-5950][SQL]Insert array into a metastore table saved as parquet should work when using datasource api This PR contains the following changes: 1. Add a new method, `DataType.equalsIgnoreCompatibleNullability`, which is the middle ground between DataType's equality check and `DataType.equalsIgnoreNullability`. For two data types `from` and `to`, it does `equalsIgnoreNullability` as well as if the nullability of `from` is compatible with that of `to`. For example, the nullability of `ArrayType(IntegerType, containsNull = false)` is compatible with that of `ArrayType(IntegerType, containsNull = true)` (for an array without null values, we can always say it may contain null values). However, the nullability of `ArrayType(IntegerType, containsNull = true)` is incompatible with that of `ArrayType(IntegerType, containsNull = false)` (for an array that may have null values, we cannot say it does not have null values). 2. For the `resolved` field of `InsertIntoTable`, use `equalsIgnoreCompatibleNullability` to replace the equality check of the data types. 3. For our data source write path, when appending data, we always use the schema of existing table to write the data. This is important for parquet, since nullability direct impacts the way to encode/decode values. If we do not do this, we may see corrupted values when reading values from a set of parquet files generated with different nullability settings. 4. When generating a new parquet table, we always set nullable/containsNull/valueContainsNull to true. So, we will not face situations that we cannot append data because containsNull/valueContainsNull in an Array/Map column of the existing table has already been set to `false`. This change makes the whole data pipeline more robust. 5. Update the equality check of JSON relation. Since JSON does not really cares nullability, `equalsIgnoreNullability` seems a better choice to compare schemata from to JSON tables. JIRA: https://issues.apache.org/jira/browse/SPARK-5950 Thanks viirya for the initial work in #4729. cc marmbrus liancheng Author: Yin Huai yh...@databricks.com Closes #4826 from yhuai/insertNullabilityCheck and squashes the following commits: 3b61a04 [Yin Huai] Revert change on equals. 80e487e [Yin Huai] asNullable in UDT. 587d88b [Yin Huai] Make methods private. 0cb7ea2 [Yin Huai] marmbrus's comments. 3cec464 [Yin Huai] Cheng's comments. 486ed08 [Yin Huai] Merge remote-tracking branch 'upstream/master' into insertNullabilityCheck d3747d1 [Yin Huai] Remove unnecessary change. 8360817 [Yin Huai] Merge remote-tracking branch 'upstream/master' into insertNullabilityCheck 8a3f237 [Yin Huai] Use equalsIgnoreNullability instead of equality check. 0eb5578 [Yin Huai] Fix tests. f6ed813 [Yin Huai] Update old parquet path. e4f397c [Yin Huai] Unit tests. b2c06f8 [Yin Huai] Ignore nullability in JSON relation's equality check. 8bd008b [Yin Huai] nullable, containsNull, and valueContainsNull will be always true for parquet data. bf50d73 [Yin Huai] When appending data, we use the schema of the existing table instead of the schema of the new data. 0a703e7 [Yin Huai] Test failed again since we cannot read correct content. 9a26611 [Yin Huai] Make InsertIntoTable happy. 8f19fe5 [Yin Huai] equalsIgnoreCompatibleNullability 4ec17fd [Yin Huai] Failed test. 
(cherry picked from commit 12599942e69e4d73040f3a8611661a0862514ffc) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1b490e91 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1b490e91 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1b490e91 Branch: refs/heads/branch-1.3 Commit: 1b490e91fd6b5d06d9caeb50e597639ccfc0bc3b Parents: ffd0591 Author: Yin Huai yh...@databricks.com Authored: Mon Mar 2 19:31:55 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 19:32:08 2015 -0800 -- .../org/apache/spark/mllib/linalg/Vectors.scala | 2 + .../apache/spark/mllib/util/modelSaveLoad.scala | 2 +- .../catalyst/plans/logical/basicOperators.scala | 3 +- .../org/apache/spark/sql/types/dataTypes.scala | 97 +++- .../apache/spark/sql/types/DataTypeSuite.scala | 83 + .../apache/spark/sql/json/JSONRelation.scala| 4 +- .../spark/sql/parquet/ParquetRelation.scala | 9 +- .../sql/parquet/ParquetTableOperations.scala| 5 +- .../apache/spark/sql/parquet/newParquet.scala | 10 +- .../org/apache/spark/sql/sources/commands.scala | 7 +- .../org/apache/spark/sql/sources/rules.scala| 2 +- .../apache/spark/sql/test/ExamplePointUDT.scala | 2 + .../apache/spark/sql/UserDefinedTypeSuite.scala | 2 + .../spark/sql/hive/HiveMetastoreCatalog.scala | 5 +-
[2/2] spark git commit: [SPARK-5310][SQL] Fixes to Docs and Datasources API
[SPARK-5310][SQL] Fixes to Docs and Datasources API - Various Fixes to docs - Make data source traits actually interfaces Based on #4862 but with fixed conflicts. Author: Reynold Xin r...@databricks.com Author: Michael Armbrust mich...@databricks.com Closes #4868 from marmbrus/pr/4862 and squashes the following commits: fe091ea [Michael Armbrust] Merge remote-tracking branch 'origin/master' into pr/4862 0208497 [Reynold Xin] Test fixes. 34e0a28 [Reynold Xin] [SPARK-5310][SQL] Various fixes to Spark SQL docs. (cherry picked from commit 54d19689ff8d786acde5b8ada6741854ffadadea) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4e6e0086 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4e6e0086 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4e6e0086 Branch: refs/heads/branch-1.3 Commit: 4e6e0086c27e3f37eda6391c063b481896a69476 Parents: 1b490e9 Author: Reynold Xin r...@databricks.com Authored: Mon Mar 2 22:14:08 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 22:14:20 2015 -0800 -- project/SparkBuild.scala| 29 +- .../scala/org/apache/spark/sql/DataFrame.scala | 36 +- .../scala/org/apache/spark/sql/RDDApi.scala | 4 +- .../apache/spark/sql/jdbc/JDBCRelation.scala| 3 +- .../apache/spark/sql/json/JSONRelation.scala| 5 +- .../apache/spark/sql/parquet/newParquet.scala | 3 +- .../apache/spark/sql/sources/interfaces.scala | 43 +- .../apache/spark/sql/sources/DDLTestSuite.scala | 2 +- .../spark/sql/sources/FilteredScanSuite.scala | 3 +- .../spark/sql/sources/PrunedScanSuite.scala | 3 +- .../spark/sql/sources/TableScanSuite.scala | 11 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 3 +- .../hive/execution/CreateTableAsSelect.scala| 3 +- .../execution/DescribeHiveTableCommand.scala| 4 +- .../sql/hive/execution/HiveNativeCommand.scala | 6 +- .../sql/hive/execution/HiveTableScan.scala | 4 +- .../hive/execution/InsertIntoHiveTable.scala| 6 +- .../hive/execution/ScriptTransformation.scala | 15 +- .../spark/sql/hive/execution/commands.scala | 27 +- .../spark/sql/hive/execution/package.scala | 25 - .../spark/sql/hive/HiveParquetSuite.scala | 92 +++ .../apache/spark/sql/hive/parquetSuites.scala | 767 +++ .../spark/sql/parquet/HiveParquetSuite.scala| 91 --- .../spark/sql/parquet/parquetSuites.scala | 766 -- 24 files changed, 965 insertions(+), 986 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4e6e0086/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index e4b1b96..4f17df5 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -357,6 +357,21 @@ object Unidoc { names.map(s = org.apache.spark. 
+ s).mkString(:) } + private def ignoreUndocumentedPackages(packages: Seq[Seq[File]]): Seq[Seq[File]] = { +packages + .map(_.filterNot(_.getName.contains($))) + .map(_.filterNot(_.getCanonicalPath.contains(akka))) + .map(_.filterNot(_.getCanonicalPath.contains(deploy))) + .map(_.filterNot(_.getCanonicalPath.contains(network))) + .map(_.filterNot(_.getCanonicalPath.contains(shuffle))) + .map(_.filterNot(_.getCanonicalPath.contains(executor))) + .map(_.filterNot(_.getCanonicalPath.contains(python))) + .map(_.filterNot(_.getCanonicalPath.contains(collection))) + .map(_.filterNot(_.getCanonicalPath.contains(sql/catalyst))) + .map(_.filterNot(_.getCanonicalPath.contains(sql/execution))) + .map(_.filterNot(_.getCanonicalPath.contains(sql/hive/test))) + } + lazy val settings = scalaJavaUnidocSettings ++ Seq ( publish := {}, @@ -368,22 +383,12 @@ object Unidoc { // Skip actual catalyst, but include the subproject. // Catalyst is not public API and contains quasiquotes which break scaladoc. unidocAllSources in (ScalaUnidoc, unidoc) := { - (unidocAllSources in (ScalaUnidoc, unidoc)).value -.map(_.filterNot(_.getCanonicalPath.contains(sql/catalyst))) + ignoreUndocumentedPackages((unidocAllSources in (ScalaUnidoc, unidoc)).value) }, // Skip class names containing $ and some internal packages in Javadocs unidocAllSources in (JavaUnidoc, unidoc) := { - (unidocAllSources in (JavaUnidoc, unidoc)).value -.map(_.filterNot(_.getName.contains($))) -.map(_.filterNot(_.getCanonicalPath.contains(akka))) -.map(_.filterNot(_.getCanonicalPath.contains(deploy))) -.map(_.filterNot(_.getCanonicalPath.contains(network))) -
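Besides the doc-build cleanup, the commit firms up the public org.apache.spark.sql.sources interfaces. A minimal external data source written against them looks roughly like the sketch below; the class names and the "to" option are made up for illustration, and the shape follows the style of the TableScanSuite tests touched by this patch:

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, TableScan}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

// Relation producing the integers 1..to as a single non-nullable column "i".
case class SimpleNumbersRelation(to: Int)(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan {

  override def schema: StructType =
    StructType(StructField("i", IntegerType, nullable = false) :: Nil)

  override def buildScan(): RDD[Row] =
    sqlContext.sparkContext.parallelize(1 to to).map(Row(_))
}

// Entry point the data source resolver instantiates for this (hypothetical) format.
class DefaultSource extends RelationProvider {
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation =
    SimpleNumbersRelation(parameters("to").toInt)(sqlContext)
}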
spark git commit: [SPARK-6097][MLLIB] Support tree model save/load in PySpark/MLlib
Repository: spark Updated Branches: refs/heads/master 54d19689f - 7e53a79c3 [SPARK-6097][MLLIB] Support tree model save/load in PySpark/MLlib Similar to `MatrixFactorizaionModel`, we only need wrappers to support save/load for tree models in Python. jkbradley Author: Xiangrui Meng m...@databricks.com Closes #4854 from mengxr/SPARK-6097 and squashes the following commits: 4586a4d [Xiangrui Meng] fix more typos 8ebcac2 [Xiangrui Meng] fix python style 91172d8 [Xiangrui Meng] fix typos 201b3b9 [Xiangrui Meng] update user guide b5158e2 [Xiangrui Meng] support tree model save/load in PySpark/MLlib Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7e53a79c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7e53a79c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7e53a79c Branch: refs/heads/master Commit: 7e53a79c30511dbd0e5d9878a4b8b0f5bc94e68b Parents: 54d1968 Author: Xiangrui Meng m...@databricks.com Authored: Mon Mar 2 22:27:01 2015 -0800 Committer: Xiangrui Meng m...@databricks.com Committed: Mon Mar 2 22:27:01 2015 -0800 -- docs/mllib-decision-tree.md| 16 - docs/mllib-ensembles.md| 32 +++-- python/pyspark/mllib/recommendation.py | 9 +++ python/pyspark/mllib/tests.py | 27 - python/pyspark/mllib/tree.py | 21 python/pyspark/mllib/util.py | 37 + 6 files changed, 109 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7e53a79c/docs/mllib-decision-tree.md -- diff --git a/docs/mllib-decision-tree.md b/docs/mllib-decision-tree.md index 8e478ab..c1d0f8a 100644 --- a/docs/mllib-decision-tree.md +++ b/docs/mllib-decision-tree.md @@ -293,11 +293,9 @@ DecisionTreeModel sameModel = DecisionTreeModel.load(sc.sc(), myModelPath); div data-lang=python -Note that the Python API does not yet support model save/load but will in the future. - {% highlight python %} from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.tree import DecisionTree +from pyspark.mllib.tree import DecisionTree, DecisionTreeModel from pyspark.mllib.util import MLUtils # Load and parse the data file into an RDD of LabeledPoint. @@ -317,6 +315,10 @@ testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(tes print('Test Error = ' + str(testErr)) print('Learned classification tree model:') print(model.toDebugString()) + +# Save and load model +model.save(sc, myModelPath) +sameModel = DecisionTreeModel.load(sc, myModelPath) {% endhighlight %} /div @@ -440,11 +442,9 @@ DecisionTreeModel sameModel = DecisionTreeModel.load(sc.sc(), myModelPath); div data-lang=python -Note that the Python API does not yet support model save/load but will in the future. - {% highlight python %} from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.tree import DecisionTree +from pyspark.mllib.tree import DecisionTree, DecisionTreeModel from pyspark.mllib.util import MLUtils # Load and parse the data file into an RDD of LabeledPoint. 
@@ -464,6 +464,10 @@ testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / flo print('Test Mean Squared Error = ' + str(testMSE)) print('Learned regression tree model:') print(model.toDebugString()) + +# Save and load model +model.save(sc, myModelPath) +sameModel = DecisionTreeModel.load(sc, myModelPath) {% endhighlight %} /div http://git-wip-us.apache.org/repos/asf/spark/blob/7e53a79c/docs/mllib-ensembles.md -- diff --git a/docs/mllib-ensembles.md b/docs/mllib-ensembles.md index ec1ef38..cbfb682 100644 --- a/docs/mllib-ensembles.md +++ b/docs/mllib-ensembles.md @@ -202,10 +202,8 @@ RandomForestModel sameModel = RandomForestModel.load(sc.sc(), myModelPath); div data-lang=python -Note that the Python API does not yet support model save/load but will in the future. - {% highlight python %} -from pyspark.mllib.tree import RandomForest +from pyspark.mllib.tree import RandomForest, RandomForestModel from pyspark.mllib.util import MLUtils # Load and parse the data file into an RDD of LabeledPoint. @@ -228,6 +226,10 @@ testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(tes print('Test Error = ' + str(testErr)) print('Learned classification forest model:') print(model.toDebugString()) + +# Save and load model +model.save(sc, myModelPath) +sameModel = RandomForestModel.load(sc, myModelPath) {% endhighlight %} /div @@ -354,10 +356,8 @@ RandomForestModel sameModel = RandomForestModel.load(sc.sc(), myModelPath); div data-lang=python
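The Python wrappers added here delegate to the existing JVM implementations rather than re-implementing persistence, so the calls they mirror are the Scala ones. For reference, assuming a SparkContext sc and an already trained model, the JVM side looks roughly like:

import org.apache.spark.mllib.tree.model.DecisionTreeModel

// Writes model metadata and data under the given path; the path is illustrative.
model.save(sc, "myModelPath")
val sameModel = DecisionTreeModel.load(sc, "myModelPath")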
[2/2] spark git commit: [SPARK-5310][SQL] Fixes to Docs and Datasources API
[SPARK-5310][SQL] Fixes to Docs and Datasources API - Various Fixes to docs - Make data source traits actually interfaces Based on #4862 but with fixed conflicts. Author: Reynold Xin r...@databricks.com Author: Michael Armbrust mich...@databricks.com Closes #4868 from marmbrus/pr/4862 and squashes the following commits: fe091ea [Michael Armbrust] Merge remote-tracking branch 'origin/master' into pr/4862 0208497 [Reynold Xin] Test fixes. 34e0a28 [Reynold Xin] [SPARK-5310][SQL] Various fixes to Spark SQL docs. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/54d19689 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/54d19689 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/54d19689 Branch: refs/heads/master Commit: 54d19689ff8d786acde5b8ada6741854ffadadea Parents: 1259994 Author: Reynold Xin r...@databricks.com Authored: Mon Mar 2 22:14:08 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 22:14:08 2015 -0800 -- project/SparkBuild.scala| 29 +- .../scala/org/apache/spark/sql/DataFrame.scala | 36 +- .../scala/org/apache/spark/sql/RDDApi.scala | 4 +- .../apache/spark/sql/jdbc/JDBCRelation.scala| 3 +- .../apache/spark/sql/json/JSONRelation.scala| 5 +- .../apache/spark/sql/parquet/newParquet.scala | 3 +- .../apache/spark/sql/sources/interfaces.scala | 43 +- .../apache/spark/sql/sources/DDLTestSuite.scala | 2 +- .../spark/sql/sources/FilteredScanSuite.scala | 3 +- .../spark/sql/sources/PrunedScanSuite.scala | 3 +- .../spark/sql/sources/TableScanSuite.scala | 11 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 3 +- .../hive/execution/CreateTableAsSelect.scala| 3 +- .../execution/DescribeHiveTableCommand.scala| 4 +- .../sql/hive/execution/HiveNativeCommand.scala | 6 +- .../sql/hive/execution/HiveTableScan.scala | 4 +- .../hive/execution/InsertIntoHiveTable.scala| 6 +- .../hive/execution/ScriptTransformation.scala | 15 +- .../spark/sql/hive/execution/commands.scala | 27 +- .../spark/sql/hive/execution/package.scala | 25 - .../spark/sql/hive/HiveParquetSuite.scala | 92 +++ .../apache/spark/sql/hive/parquetSuites.scala | 767 +++ .../spark/sql/parquet/HiveParquetSuite.scala| 91 --- .../spark/sql/parquet/parquetSuites.scala | 766 -- 24 files changed, 965 insertions(+), 986 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/54d19689/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index e4b1b96..4f17df5 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -357,6 +357,21 @@ object Unidoc { names.map(s = org.apache.spark. 
+ s).mkString(:) } + private def ignoreUndocumentedPackages(packages: Seq[Seq[File]]): Seq[Seq[File]] = { +packages + .map(_.filterNot(_.getName.contains($))) + .map(_.filterNot(_.getCanonicalPath.contains(akka))) + .map(_.filterNot(_.getCanonicalPath.contains(deploy))) + .map(_.filterNot(_.getCanonicalPath.contains(network))) + .map(_.filterNot(_.getCanonicalPath.contains(shuffle))) + .map(_.filterNot(_.getCanonicalPath.contains(executor))) + .map(_.filterNot(_.getCanonicalPath.contains(python))) + .map(_.filterNot(_.getCanonicalPath.contains(collection))) + .map(_.filterNot(_.getCanonicalPath.contains(sql/catalyst))) + .map(_.filterNot(_.getCanonicalPath.contains(sql/execution))) + .map(_.filterNot(_.getCanonicalPath.contains(sql/hive/test))) + } + lazy val settings = scalaJavaUnidocSettings ++ Seq ( publish := {}, @@ -368,22 +383,12 @@ object Unidoc { // Skip actual catalyst, but include the subproject. // Catalyst is not public API and contains quasiquotes which break scaladoc. unidocAllSources in (ScalaUnidoc, unidoc) := { - (unidocAllSources in (ScalaUnidoc, unidoc)).value -.map(_.filterNot(_.getCanonicalPath.contains(sql/catalyst))) + ignoreUndocumentedPackages((unidocAllSources in (ScalaUnidoc, unidoc)).value) }, // Skip class names containing $ and some internal packages in Javadocs unidocAllSources in (JavaUnidoc, unidoc) := { - (unidocAllSources in (JavaUnidoc, unidoc)).value -.map(_.filterNot(_.getName.contains($))) -.map(_.filterNot(_.getCanonicalPath.contains(akka))) -.map(_.filterNot(_.getCanonicalPath.contains(deploy))) -.map(_.filterNot(_.getCanonicalPath.contains(network))) -.map(_.filterNot(_.getCanonicalPath.contains(shuffle))) -.map(_.filterNot(_.getCanonicalPath.contains(executor))) -
[1/2] spark git commit: [SPARK-5310][SQL] Fixes to Docs and Datasources API
Repository: spark Updated Branches: refs/heads/master 12599942e - 54d19689f http://git-wip-us.apache.org/repos/asf/spark/blob/54d19689/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala deleted file mode 100644 index 89b943f..000 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala +++ /dev/null @@ -1,766 +0,0 @@ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the License); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - *http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.parquet - -import java.io.File - -import org.scalatest.BeforeAndAfterAll - -import org.apache.spark.sql.{SQLConf, QueryTest} -import org.apache.spark.sql.catalyst.expressions.Row -import org.apache.spark.sql.execution.{ExecutedCommand, PhysicalRDD} -import org.apache.spark.sql.hive.execution.{InsertIntoHiveTable, HiveTableScan} -import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.sql.hive.test.TestHive.implicits._ -import org.apache.spark.sql.sources.{InsertIntoDataSource, LogicalRelation} -import org.apache.spark.sql.SaveMode -import org.apache.spark.sql.types._ - -// The data where the partitioning key exists only in the directory structure. -case class ParquetData(intField: Int, stringField: String) -// The data that also includes the partitioning key -case class ParquetDataWithKey(p: Int, intField: Int, stringField: String) - -case class StructContainer(intStructField :Int, stringStructField: String) - -case class ParquetDataWithComplexTypes( -intField: Int, -stringField: String, -structField: StructContainer, -arrayField: Seq[Int]) - -case class ParquetDataWithKeyAndComplexTypes( -p: Int, -intField: Int, -stringField: String, -structField: StructContainer, -arrayField: Seq[Int]) - -/** - * A suite to test the automatic conversion of metastore tables with parquet data to use the - * built in parquet support. 
- */ -class ParquetMetastoreSuiteBase extends ParquetPartitioningTest { - override def beforeAll(): Unit = { -super.beforeAll() - -sql(s - create external table partitioned_parquet - ( -intField INT, -stringField STRING - ) - PARTITIONED BY (p int) - ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - STORED AS - INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - location '${partitionedTableDir.getCanonicalPath}' -) - -sql(s - create external table partitioned_parquet_with_key - ( -intField INT, -stringField STRING - ) - PARTITIONED BY (p int) - ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - STORED AS - INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - location '${partitionedTableDirWithKey.getCanonicalPath}' -) - -sql(s - create external table normal_parquet - ( -intField INT, -stringField STRING - ) - ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - STORED AS - INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - location '${new File(normalTableDir, normal).getCanonicalPath}' -) - -sql(s - CREATE EXTERNAL TABLE partitioned_parquet_with_complextypes - ( -intField INT, -stringField STRING, -structField STRUCTintStructField: INT, stringStructField: STRING, -arrayField ARRAYINT - ) - PARTITIONED BY (p int) - ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - STORED AS - INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - OUTPUTFORMAT
spark git commit: [SPARK-5537][MLlib][Docs] Add user guide for multinomial logistic regression
Repository: spark Updated Branches: refs/heads/master c2fe3a6ff - b19605619 [SPARK-5537][MLlib][Docs] Add user guide for multinomial logistic regression Adding more description on top of #4861. Author: DB Tsai dbt...@alpinenow.com Closes #4866 from dbtsai/doc and squashes the following commits: 37e9d07 [DB Tsai] doc Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b1960561 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b1960561 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b1960561 Branch: refs/heads/master Commit: b196056190c569505cc32669d1aec30ed9d70665 Parents: c2fe3a6 Author: DB Tsai dbt...@alpinenow.com Authored: Mon Mar 2 22:37:12 2015 -0800 Committer: Xiangrui Meng m...@databricks.com Committed: Mon Mar 2 22:37:12 2015 -0800 -- docs/mllib-linear-methods.md | 10 ++ 1 file changed, 10 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b1960561/docs/mllib-linear-methods.md -- diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md index 03f90d7..9270741 100644 --- a/docs/mllib-linear-methods.md +++ b/docs/mllib-linear-methods.md @@ -784,9 +784,19 @@ regularization parameter (`regParam`) along with various parameters associated w gradient descent (`stepSize`, `numIterations`, `miniBatchFraction`). For each of them, we support all three possible regularizations (none, L1 or L2). +For Logistic Regression, [L-BFGS](api/scala/index.html#org.apache.spark.mllib.optimization.LBFGS) +version is implemented under [LogisticRegressionWithLBFGS] +(api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS), and this +version supports both binary and multinomial Logistic Regression while SGD version only supports +binary Logistic Regression. However, L-BFGS version doesn't support L1 regularization but SGD one +supports L1 regularization. When L1 regularization is not required, L-BFGS version is strongly +recommended since it converges faster and more accurately compared to SGD by approximating the +inverse Hessian matrix using quasi-Newton method. + Algorithms are all implemented in Scala: * [SVMWithSGD](api/scala/index.html#org.apache.spark.mllib.classification.SVMWithSGD) +* [LogisticRegressionWithLBFGS](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS) * [LogisticRegressionWithSGD](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithSGD) * [LinearRegressionWithSGD](api/scala/index.html#org.apache.spark.mllib.regression.LinearRegressionWithSGD) * [RidgeRegressionWithSGD](api/scala/index.html#org.apache.spark.mllib.regression.RidgeRegressionWithSGD) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
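The guidance added above reduces to a simple choice: prefer LogisticRegressionWithLBFGS unless L1 regularization is required, and use its multiclass mode when there are more than two labels. A short sketch of the multinomial case, assuming a SparkContext sc and a LIBSVM-formatted file with three label classes at the path shown (both are assumptions for illustration):

import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.util.MLUtils

val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt")

val model = new LogisticRegressionWithLBFGS()
  .setNumClasses(3)   // more than 2 classes trains a multinomial model; the SGD version is binary only
  .run(data)

val predictionAndLabels = data.map(p => (model.predict(p.features), p.label))
val accuracy = predictionAndLabels.filter(x => x._1 == x._2).count().toDouble / data.count()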
spark git commit: [SPARK-6097][MLLIB] Support tree model save/load in PySpark/MLlib
Repository: spark Updated Branches: refs/heads/branch-1.3 4e6e0086c - 62c53be2a [SPARK-6097][MLLIB] Support tree model save/load in PySpark/MLlib Similar to `MatrixFactorizaionModel`, we only need wrappers to support save/load for tree models in Python. jkbradley Author: Xiangrui Meng m...@databricks.com Closes #4854 from mengxr/SPARK-6097 and squashes the following commits: 4586a4d [Xiangrui Meng] fix more typos 8ebcac2 [Xiangrui Meng] fix python style 91172d8 [Xiangrui Meng] fix typos 201b3b9 [Xiangrui Meng] update user guide b5158e2 [Xiangrui Meng] support tree model save/load in PySpark/MLlib (cherry picked from commit 7e53a79c30511dbd0e5d9878a4b8b0f5bc94e68b) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/62c53be2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/62c53be2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/62c53be2 Branch: refs/heads/branch-1.3 Commit: 62c53be2a9ef805545c314ffbbfafdcf2fced9f2 Parents: 4e6e008 Author: Xiangrui Meng m...@databricks.com Authored: Mon Mar 2 22:27:01 2015 -0800 Committer: Xiangrui Meng m...@databricks.com Committed: Mon Mar 2 22:27:23 2015 -0800 -- docs/mllib-decision-tree.md| 16 - docs/mllib-ensembles.md| 32 +++-- python/pyspark/mllib/recommendation.py | 9 +++ python/pyspark/mllib/tests.py | 27 - python/pyspark/mllib/tree.py | 21 python/pyspark/mllib/util.py | 37 + 6 files changed, 109 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/62c53be2/docs/mllib-decision-tree.md -- diff --git a/docs/mllib-decision-tree.md b/docs/mllib-decision-tree.md index 8e478ab..c1d0f8a 100644 --- a/docs/mllib-decision-tree.md +++ b/docs/mllib-decision-tree.md @@ -293,11 +293,9 @@ DecisionTreeModel sameModel = DecisionTreeModel.load(sc.sc(), myModelPath); div data-lang=python -Note that the Python API does not yet support model save/load but will in the future. - {% highlight python %} from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.tree import DecisionTree +from pyspark.mllib.tree import DecisionTree, DecisionTreeModel from pyspark.mllib.util import MLUtils # Load and parse the data file into an RDD of LabeledPoint. @@ -317,6 +315,10 @@ testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(tes print('Test Error = ' + str(testErr)) print('Learned classification tree model:') print(model.toDebugString()) + +# Save and load model +model.save(sc, myModelPath) +sameModel = DecisionTreeModel.load(sc, myModelPath) {% endhighlight %} /div @@ -440,11 +442,9 @@ DecisionTreeModel sameModel = DecisionTreeModel.load(sc.sc(), myModelPath); div data-lang=python -Note that the Python API does not yet support model save/load but will in the future. - {% highlight python %} from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.tree import DecisionTree +from pyspark.mllib.tree import DecisionTree, DecisionTreeModel from pyspark.mllib.util import MLUtils # Load and parse the data file into an RDD of LabeledPoint. 
@@ -464,6 +464,10 @@ testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / flo print('Test Mean Squared Error = ' + str(testMSE)) print('Learned regression tree model:') print(model.toDebugString()) + +# Save and load model +model.save(sc, myModelPath) +sameModel = DecisionTreeModel.load(sc, myModelPath) {% endhighlight %} /div http://git-wip-us.apache.org/repos/asf/spark/blob/62c53be2/docs/mllib-ensembles.md -- diff --git a/docs/mllib-ensembles.md b/docs/mllib-ensembles.md index d66b419..84f3b3b 100644 --- a/docs/mllib-ensembles.md +++ b/docs/mllib-ensembles.md @@ -202,10 +202,8 @@ RandomForestModel sameModel = RandomForestModel.load(sc.sc(), myModelPath); div data-lang=python -Note that the Python API does not yet support model save/load but will in the future. - {% highlight python %} -from pyspark.mllib.tree import RandomForest +from pyspark.mllib.tree import RandomForest, RandomForestModel from pyspark.mllib.util import MLUtils # Load and parse the data file into an RDD of LabeledPoint. @@ -228,6 +226,10 @@ testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(tes print('Test Error = ' + str(testErr)) print('Learned classification forest model:') print(model.toDebugString()) + +# Save and load model +model.save(sc, myModelPath) +sameModel = RandomForestModel.load(sc, myModelPath) {% endhighlight %} /div
spark git commit: [SPARK-6073][SQL] Need to refresh metastore cache after append data in CreateMetastoreDataSourceAsSelect
Repository: spark Updated Branches: refs/heads/branch-1.3 1fe677a36 - c59871cca [SPARK-6073][SQL] Need to refresh metastore cache after append data in CreateMetastoreDataSourceAsSelect JIRA: https://issues.apache.org/jira/browse/SPARK-6073 liancheng Author: Yin Huai yh...@databricks.com Closes #4824 from yhuai/refreshCache and squashes the following commits: b9542ef [Yin Huai] Refresh metadata cache in the Catalog in CreateMetastoreDataSourceAsSelect. (cherry picked from commit 39a54b40aff66816f8b8f5c6133eaaad6eaecae1) Signed-off-by: Cheng Lian l...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c59871cc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c59871cc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c59871cc Branch: refs/heads/branch-1.3 Commit: c59871cca5695402e13a89344bd12f5952704019 Parents: 1fe677a Author: Yin Huai yh...@databricks.com Authored: Mon Mar 2 22:42:18 2015 +0800 Committer: Cheng Lian l...@databricks.com Committed: Mon Mar 2 22:44:05 2015 +0800 -- .../spark/sql/hive/execution/commands.scala | 2 + .../sql/hive/MetastoreDataSourcesSuite.scala| 52 2 files changed, 54 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c59871cc/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index 9934a5d..ffaef8e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -248,6 +248,8 @@ case class CreateMetastoreDataSourceAsSelect( isExternal) } +// Refresh the cache of the table in the catalog. 
+hiveContext.refreshTable(tableName) Seq.empty[Row] } } http://git-wip-us.apache.org/repos/asf/spark/blob/c59871cc/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 00306f1..868c35f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -612,4 +612,56 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { val actualSchema = table(wide_schema).schema assert(schema === actualSchema) } + + test(insert into a table) { +def createDF(from: Int, to: Int): DataFrame = + createDataFrame((from to to).map(i = Tuple2(i, sstr$i))).toDF(c1, c2) + +createDF(0, 9).saveAsTable(insertParquet, parquet) +checkAnswer( + sql(SELECT p.c1, p.c2 FROM insertParquet p WHERE p.c1 5), + (6 to 9).map(i = Row(i, sstr$i))) + +intercept[AnalysisException] { + createDF(10, 19).saveAsTable(insertParquet, parquet) +} + +createDF(10, 19).saveAsTable(insertParquet, parquet, SaveMode.Append) +checkAnswer( + sql(SELECT p.c1, p.c2 FROM insertParquet p WHERE p.c1 5), + (6 to 19).map(i = Row(i, sstr$i))) + +createDF(20, 29).saveAsTable(insertParquet, parquet, SaveMode.Append) +checkAnswer( + sql(SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 5 AND p.c1 25), + (6 to 24).map(i = Row(i, sstr$i))) + +intercept[AnalysisException] { + createDF(30, 39).saveAsTable(insertParquet) +} + +createDF(30, 39).saveAsTable(insertParquet, SaveMode.Append) +checkAnswer( + sql(SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 5 AND p.c1 35), + (6 to 34).map(i = Row(i, sstr$i))) + +createDF(40, 49).insertInto(insertParquet) +checkAnswer( + sql(SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 5 AND p.c1 45), + (6 to 44).map(i = Row(i, sstr$i))) + +createDF(50, 59).saveAsTable(insertParquet, SaveMode.Overwrite) +checkAnswer( + sql(SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 51 AND p.c1 55), + (52 to 54).map(i = Row(i, sstr$i))) +createDF(60, 69).saveAsTable(insertParquet, SaveMode.Ignore) +checkAnswer( + sql(SELECT p.c1, c2 FROM insertParquet p), + (50 to 59).map(i = Row(i, sstr$i))) + +createDF(70, 79).insertInto(insertParquet, overwrite = true) +checkAnswer( + sql(SELECT p.c1, c2 FROM insertParquet p), + (70 to 79).map(i = Row(i, sstr$i))) + } } - To
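The case the new refreshTable call covers is an append through saveAsTable: the catalog may already hold cached metadata (including the file listing) for the target table, so without a refresh the next read can go through stale state. A sketch of the user-facing flow, assuming a HiveContext named hiveCtx and two DataFrames df1 and df2 with the same schema (all names illustrative):

import org.apache.spark.sql.SaveMode

df1.saveAsTable("insertParquet", "parquet", SaveMode.ErrorIfExists)
df2.saveAsTable("insertParquet", "parquet", SaveMode.Append)

// Before this fix the read below could hit the stale cached relation for the table;
// CreateMetastoreDataSourceAsSelect now calls refreshTable("insertParquet") itself
// once the append finishes, so the query sees both batches of rows.
hiveCtx.table("insertParquet").count()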
spark git commit: [SPARK-6073][SQL] Need to refresh metastore cache after append data in CreateMetastoreDataSourceAsSelect
Repository: spark Updated Branches: refs/heads/master 49c7a8f6f - 39a54b40a [SPARK-6073][SQL] Need to refresh metastore cache after append data in CreateMetastoreDataSourceAsSelect JIRA: https://issues.apache.org/jira/browse/SPARK-6073 liancheng Author: Yin Huai yh...@databricks.com Closes #4824 from yhuai/refreshCache and squashes the following commits: b9542ef [Yin Huai] Refresh metadata cache in the Catalog in CreateMetastoreDataSourceAsSelect. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/39a54b40 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/39a54b40 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/39a54b40 Branch: refs/heads/master Commit: 39a54b40aff66816f8b8f5c6133eaaad6eaecae1 Parents: 49c7a8f Author: Yin Huai yh...@databricks.com Authored: Mon Mar 2 22:42:18 2015 +0800 Committer: Cheng Lian l...@databricks.com Committed: Mon Mar 2 22:42:18 2015 +0800 -- .../spark/sql/hive/execution/commands.scala | 2 + .../sql/hive/MetastoreDataSourcesSuite.scala| 52 2 files changed, 54 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/39a54b40/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index 9934a5d..ffaef8e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -248,6 +248,8 @@ case class CreateMetastoreDataSourceAsSelect( isExternal) } +// Refresh the cache of the table in the catalog. +hiveContext.refreshTable(tableName) Seq.empty[Row] } } http://git-wip-us.apache.org/repos/asf/spark/blob/39a54b40/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 00306f1..868c35f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -612,4 +612,56 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { val actualSchema = table(wide_schema).schema assert(schema === actualSchema) } + + test(insert into a table) { +def createDF(from: Int, to: Int): DataFrame = + createDataFrame((from to to).map(i = Tuple2(i, sstr$i))).toDF(c1, c2) + +createDF(0, 9).saveAsTable(insertParquet, parquet) +checkAnswer( + sql(SELECT p.c1, p.c2 FROM insertParquet p WHERE p.c1 5), + (6 to 9).map(i = Row(i, sstr$i))) + +intercept[AnalysisException] { + createDF(10, 19).saveAsTable(insertParquet, parquet) +} + +createDF(10, 19).saveAsTable(insertParquet, parquet, SaveMode.Append) +checkAnswer( + sql(SELECT p.c1, p.c2 FROM insertParquet p WHERE p.c1 5), + (6 to 19).map(i = Row(i, sstr$i))) + +createDF(20, 29).saveAsTable(insertParquet, parquet, SaveMode.Append) +checkAnswer( + sql(SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 5 AND p.c1 25), + (6 to 24).map(i = Row(i, sstr$i))) + +intercept[AnalysisException] { + createDF(30, 39).saveAsTable(insertParquet) +} + +createDF(30, 39).saveAsTable(insertParquet, SaveMode.Append) +checkAnswer( + sql(SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 5 AND p.c1 35), + (6 to 34).map(i = Row(i, sstr$i))) + 
+    createDF(40, 49).insertInto("insertParquet")
+    checkAnswer(
+      sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 45"),
+      (6 to 44).map(i => Row(i, s"str$i")))
+
+    createDF(50, 59).saveAsTable("insertParquet", SaveMode.Overwrite)
+    checkAnswer(
+      sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 51 AND p.c1 < 55"),
+      (52 to 54).map(i => Row(i, s"str$i")))
+    createDF(60, 69).saveAsTable("insertParquet", SaveMode.Ignore)
+    checkAnswer(
+      sql("SELECT p.c1, c2 FROM insertParquet p"),
+      (50 to 59).map(i => Row(i, s"str$i")))
+
+    createDF(70, 79).insertInto("insertParquet", overwrite = true)
+    checkAnswer(
+      sql("SELECT p.c1, c2 FROM insertParquet p"),
+      (70 to 79).map(i => Row(i, s"str$i")))
+  }
 }

- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
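For context, a minimal sketch of the behaviour this commit guards (assuming Spark 1.3-era APIs, an existing SparkContext `sc`, and invented table/column names — this is not the patch itself): after appending to a metastore-backed data source table, the catalog's cached relation can still describe the pre-append file set, which is why the command now refreshes it.

```
// Hedged sketch, not the commit's code: append, then refresh the catalog cache.
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.hive.HiveContext

val hiveContext = new HiveContext(sc)
import hiveContext.implicits._

val df = sc.parallelize(1 to 10).map(i => (i, s"str$i")).toDF("c1", "c2")
df.saveAsTable("events", "parquet", SaveMode.Append)

// CreateMetastoreDataSourceAsSelect now performs the equivalent of this call itself,
// so follow-up queries see the newly appended files instead of stale cached metadata.
hiveContext.refreshTable("events")
hiveContext.sql("SELECT COUNT(*) FROM events").show()
```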
spark git commit: [Streaming][Minor]Fix some error docs in streaming examples
Repository: spark Updated Branches: refs/heads/branch-1.3 6a2fc85e0 - 1fe677a36 [Streaming][Minor]Fix some error docs in streaming examples Small changes, please help to review, thanks a lot. Author: Saisai Shao saisai.s...@intel.com Closes #4837 from jerryshao/doc-fix and squashes the following commits: 545291a [Saisai Shao] Fix some error docs in streaming examples (cherry picked from commit d8fb40edea7c8c811814f1ff288d59178928964b) Signed-off-by: Sean Owen so...@cloudera.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1fe677a3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1fe677a3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1fe677a3 Branch: refs/heads/branch-1.3 Commit: 1fe677a36d96dfa50b7fb5520b102452a6e97091 Parents: 6a2fc85 Author: Saisai Shao saisai.s...@intel.com Authored: Mon Mar 2 08:49:19 2015 + Committer: Sean Owen so...@cloudera.com Committed: Mon Mar 2 08:49:28 2015 + -- .../apache/spark/examples/streaming/DirectKafkaWordCount.scala| 3 ++- examples/src/main/python/streaming/kafka_wordcount.py | 2 +- python/pyspark/streaming/kafka.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1fe677a3/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala -- diff --git a/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala b/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala index deb08fd..1c8a20b 100644 --- a/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala +++ b/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala @@ -30,7 +30,8 @@ import org.apache.spark.SparkConf * topics is a list of one or more kafka topics to consume from * * Example: - *$ bin/run-example streaming.KafkaWordCount broker1-host:port,broker2-host:port topic1,topic2 + *$ bin/run-example streaming.DirectKafkaWordCount broker1-host:port,broker2-host:port \ + *topic1,topic2 */ object DirectKafkaWordCount { def main(args: Array[String]) { http://git-wip-us.apache.org/repos/asf/spark/blob/1fe677a3/examples/src/main/python/streaming/kafka_wordcount.py -- diff --git a/examples/src/main/python/streaming/kafka_wordcount.py b/examples/src/main/python/streaming/kafka_wordcount.py index ed398a8..f82f161 100644 --- a/examples/src/main/python/streaming/kafka_wordcount.py +++ b/examples/src/main/python/streaming/kafka_wordcount.py @@ -23,7 +23,7 @@ http://kafka.apache.org/documentation.html#quickstart and then run the example -`$ bin/spark-submit --driver-class-path external/kafka-assembly/target/scala-*/\ +`$ bin/spark-submit --jars external/kafka-assembly/target/scala-*/\ spark-streaming-kafka-assembly-*.jar examples/src/main/python/streaming/kafka_wordcount.py \ localhost:2181 test` http://git-wip-us.apache.org/repos/asf/spark/blob/1fe677a3/python/pyspark/streaming/kafka.py -- diff --git a/python/pyspark/streaming/kafka.py b/python/pyspark/streaming/kafka.py index 0002dc1..f083ed1 100644 --- a/python/pyspark/streaming/kafka.py +++ b/python/pyspark/streaming/kafka.py @@ -82,7 +82,7 @@ 2. Download the JAR of the artifact from Maven Central http://search.maven.org/, Group Id = org.apache.spark, Artifact Id = spark-streaming-kafka-assembly, Version = %s. 
- Then, innclude the jar in the spark-submit command as + Then, include the jar in the spark-submit command as $ bin/spark-submit --jars spark-streaming-kafka-assembly.jar ... - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [Streaming][Minor]Fix some error docs in streaming examples
Repository: spark Updated Branches: refs/heads/master 3f00bb3ef - d8fb40ede [Streaming][Minor]Fix some error docs in streaming examples Small changes, please help to review, thanks a lot. Author: Saisai Shao saisai.s...@intel.com Closes #4837 from jerryshao/doc-fix and squashes the following commits: 545291a [Saisai Shao] Fix some error docs in streaming examples Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d8fb40ed Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d8fb40ed Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d8fb40ed Branch: refs/heads/master Commit: d8fb40edea7c8c811814f1ff288d59178928964b Parents: 3f00bb3 Author: Saisai Shao saisai.s...@intel.com Authored: Mon Mar 2 08:49:19 2015 + Committer: Sean Owen so...@cloudera.com Committed: Mon Mar 2 08:49:19 2015 + -- .../apache/spark/examples/streaming/DirectKafkaWordCount.scala| 3 ++- examples/src/main/python/streaming/kafka_wordcount.py | 2 +- python/pyspark/streaming/kafka.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d8fb40ed/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala -- diff --git a/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala b/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala index deb08fd..1c8a20b 100644 --- a/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala +++ b/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala @@ -30,7 +30,8 @@ import org.apache.spark.SparkConf * topics is a list of one or more kafka topics to consume from * * Example: - *$ bin/run-example streaming.KafkaWordCount broker1-host:port,broker2-host:port topic1,topic2 + *$ bin/run-example streaming.DirectKafkaWordCount broker1-host:port,broker2-host:port \ + *topic1,topic2 */ object DirectKafkaWordCount { def main(args: Array[String]) { http://git-wip-us.apache.org/repos/asf/spark/blob/d8fb40ed/examples/src/main/python/streaming/kafka_wordcount.py -- diff --git a/examples/src/main/python/streaming/kafka_wordcount.py b/examples/src/main/python/streaming/kafka_wordcount.py index ed398a8..f82f161 100644 --- a/examples/src/main/python/streaming/kafka_wordcount.py +++ b/examples/src/main/python/streaming/kafka_wordcount.py @@ -23,7 +23,7 @@ http://kafka.apache.org/documentation.html#quickstart and then run the example -`$ bin/spark-submit --driver-class-path external/kafka-assembly/target/scala-*/\ +`$ bin/spark-submit --jars external/kafka-assembly/target/scala-*/\ spark-streaming-kafka-assembly-*.jar examples/src/main/python/streaming/kafka_wordcount.py \ localhost:2181 test` http://git-wip-us.apache.org/repos/asf/spark/blob/d8fb40ed/python/pyspark/streaming/kafka.py -- diff --git a/python/pyspark/streaming/kafka.py b/python/pyspark/streaming/kafka.py index 0002dc1..f083ed1 100644 --- a/python/pyspark/streaming/kafka.py +++ b/python/pyspark/streaming/kafka.py @@ -82,7 +82,7 @@ 2. Download the JAR of the artifact from Maven Central http://search.maven.org/, Group Id = org.apache.spark, Artifact Id = spark-streaming-kafka-assembly, Version = %s. - Then, innclude the jar in the spark-submit command as + Then, include the jar in the spark-submit command as $ bin/spark-submit --jars spark-streaming-kafka-assembly.jar ... 
- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
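Both messages above only correct usage strings; as a hedged sketch, this is roughly the direct-stream program those strings invoke (broker and topic arguments are placeholders, and the code mirrors the example's intent rather than quoting it).

```
// Sketch of a DirectKafkaWordCount-style job against the Spark 1.3 streaming-kafka API.
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

object DirectKafkaWordCountSketch {
  def main(args: Array[String]): Unit = {
    // e.g. args = Array("broker1-host:port,broker2-host:port", "topic1,topic2")
    val Array(brokers, topics) = args
    val ssc = new StreamingContext(new SparkConf().setAppName("DirectKafkaWordCount"), Seconds(2))

    val kafkaParams = Map("metadata.broker.list" -> brokers)
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topics.split(",").toSet)

    messages.map(_._2)                 // message values
      .flatMap(_.split(" "))
      .map(word => (word, 1L))
      .reduceByKey(_ + _)
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
```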
spark git commit: SPARK-3357 [CORE] Internal log messages should be set at DEBUG level instead of INFO
Repository: spark Updated Branches: refs/heads/master d8fb40ede - 948c2390a SPARK-3357 [CORE] Internal log messages should be set at DEBUG level instead of INFO Demote some 'noisy' log messages to debug level. I added a few more, to include everything that gets logged in stanzas like this: ``` 15/03/01 00:03:54 INFO BlockManager: Removing broadcast 0 15/03/01 00:03:54 INFO BlockManager: Removing block broadcast_0_piece0 15/03/01 00:03:54 INFO MemoryStore: Block broadcast_0_piece0 of size 839 dropped from memory (free 277976091) 15/03/01 00:03:54 INFO BlockManagerInfo: Removed broadcast_0_piece0 on localhost:49524 in memory (size: 839.0 B, free: 265.1 MB) 15/03/01 00:03:54 INFO BlockManagerMaster: Updated info of block broadcast_0_piece0 15/03/01 00:03:54 INFO BlockManager: Removing block broadcast_0 15/03/01 00:03:54 INFO MemoryStore: Block broadcast_0 of size 1088 dropped from memory (free 277977179) 15/03/01 00:03:54 INFO ContextCleaner: Cleaned broadcast 0 ``` as well as regular messages like ``` 15/03/01 00:02:33 INFO MemoryStore: ensureFreeSpace(2640) called with curMem=47322, maxMem=278019440 ``` WDYT? good or should some be left alone? CC mengxr who suggested some of this. Author: Sean Owen so...@cloudera.com Closes #4838 from srowen/SPARK-3357 and squashes the following commits: dce75c1 [Sean Owen] Back out some debug level changes d9b784d [Sean Owen] Demote some 'noisy' log messages to debug level Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/948c2390 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/948c2390 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/948c2390 Branch: refs/heads/master Commit: 948c2390ab004ad5cf3876d87c05d3e43a9aa9e0 Parents: d8fb40e Author: Sean Owen so...@cloudera.com Authored: Mon Mar 2 08:51:03 2015 + Committer: Sean Owen so...@cloudera.com Committed: Mon Mar 2 08:51:03 2015 + -- core/src/main/scala/org/apache/spark/ContextCleaner.scala| 4 ++-- core/src/main/scala/org/apache/spark/storage/BlockManager.scala | 4 ++-- .../main/scala/org/apache/spark/storage/BlockManagerMaster.scala | 2 +- core/src/main/scala/org/apache/spark/storage/MemoryStore.scala | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/948c2390/core/src/main/scala/org/apache/spark/ContextCleaner.scala -- diff --git a/core/src/main/scala/org/apache/spark/ContextCleaner.scala b/core/src/main/scala/org/apache/spark/ContextCleaner.scala index 434f1e4..4a9d007 100644 --- a/core/src/main/scala/org/apache/spark/ContextCleaner.scala +++ b/core/src/main/scala/org/apache/spark/ContextCleaner.scala @@ -188,10 +188,10 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { /** Perform broadcast cleanup. 
 */
  def doCleanupBroadcast(broadcastId: Long, blocking: Boolean) {
    try {
-      logDebug("Cleaning broadcast " + broadcastId)
+      logDebug(s"Cleaning broadcast $broadcastId")
      broadcastManager.unbroadcast(broadcastId, true, blocking)
      listeners.foreach(_.broadcastCleaned(broadcastId))
-      logInfo("Cleaned broadcast " + broadcastId)
+      logDebug(s"Cleaned broadcast $broadcastId")
    } catch {
      case e: Exception => logError("Error cleaning broadcast " + broadcastId, e)
    }

http://git-wip-us.apache.org/repos/asf/spark/blob/948c2390/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
--
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
index 86dbd89..c8b7763 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
@@ -1074,7 +1074,7 @@ private[spark] class BlockManager(
   * Remove all blocks belonging to the given broadcast.
   */
  def removeBroadcast(broadcastId: Long, tellMaster: Boolean): Int = {
-    logInfo(s"Removing broadcast $broadcastId")
+    logDebug(s"Removing broadcast $broadcastId")
    val blocksToRemove = blockInfo.keys.collect {
      case bid @ BroadcastBlockId(`broadcastId`, _) => bid
    }
@@ -1086,7 +1086,7 @@
   * Remove a block from both memory and disk.
   */
  def removeBlock(blockId: BlockId, tellMaster: Boolean = true): Unit = {
-    logInfo(s"Removing block $blockId")
+    logDebug(s"Removing block $blockId")
    val info = blockInfo.get(blockId).orNull
    if (info != null) {
      info.synchronized {

http://git-wip-us.apache.org/repos/asf/spark/blob/948c2390/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala
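A small illustrative sketch of the convention this change enforces — routine per-block bookkeeping at logDebug, failures at logError. The class below is invented; only the Logging calls mirror the diff. Users who still want the chatty messages can re-enable them per class, e.g. log4j.logger.org.apache.spark.storage.BlockManager=DEBUG in log4j.properties.

```
// Illustrative only: noisy internal bookkeeping goes to logDebug, not logInfo.
import org.apache.spark.Logging

class BroadcastCleanupSketch extends Logging {
  def cleanupBroadcast(broadcastId: Long): Unit = {
    try {
      logDebug(s"Cleaning broadcast $broadcastId")
      // ... drop the broadcast's blocks from memory and disk ...
      logDebug(s"Cleaned broadcast $broadcastId")   // demoted from logInfo by this commit
    } catch {
      case e: Exception => logError(s"Error cleaning broadcast $broadcastId", e)
    }
  }
}
```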
spark git commit: [SPARK-6103][Graphx]remove unused class to import in EdgeRDDImpl
Repository: spark Updated Branches: refs/heads/master 948c2390a - 49c7a8f6f [SPARK-6103][Graphx]remove unused class to import in EdgeRDDImpl Class TaskContext is unused in EdgeRDDImpl, so we need to remove it from import list. Author: Lianhui Wang lianhuiwan...@gmail.com Closes #4846 from lianhuiwang/SPARK-6103 and squashes the following commits: 31aed64 [Lianhui Wang] remove unused class to import in EdgeRDDImpl Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/49c7a8f6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/49c7a8f6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/49c7a8f6 Branch: refs/heads/master Commit: 49c7a8f6f33d64d7e6c35391f83121440844a41d Parents: 948c239 Author: Lianhui Wang lianhuiwan...@gmail.com Authored: Mon Mar 2 09:06:56 2015 + Committer: Sean Owen so...@cloudera.com Committed: Mon Mar 2 09:06:56 2015 + -- .../src/main/scala/org/apache/spark/graphx/impl/EdgeRDDImpl.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/49c7a8f6/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeRDDImpl.scala -- diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeRDDImpl.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeRDDImpl.scala index 56cb416..43a3aea 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeRDDImpl.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeRDDImpl.scala @@ -19,7 +19,7 @@ package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} -import org.apache.spark.{OneToOneDependency, HashPartitioner, TaskContext} +import org.apache.spark.{OneToOneDependency, HashPartitioner} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-6052][SQL]In JSON schema inference, we should always set containsNull of an ArrayType to true
Repository: spark Updated Branches: refs/heads/master 39a54b40a - 3efd8bb6c [SPARK-6052][SQL]In JSON schema inference, we should always set containsNull of an ArrayType to true Always set `containsNull = true` when infer the schema of JSON datasets. If we set `containsNull` based on records we scanned, we may miss arrays with null values when we do sampling. Also, because future data can have arrays with null values, if we convert JSON data to parquet, always setting `containsNull = true` is a more robust way to go. JIRA: https://issues.apache.org/jira/browse/SPARK-6052 Author: Yin Huai yh...@databricks.com Closes #4806 from yhuai/jsonArrayContainsNull and squashes the following commits: 05eab9d [Yin Huai] Change containsNull to true. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3efd8bb6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3efd8bb6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3efd8bb6 Branch: refs/heads/master Commit: 3efd8bb6cf139ce094ff631c7a9c1eb93fdcd566 Parents: 39a54b4 Author: Yin Huai yh...@databricks.com Authored: Mon Mar 2 23:18:07 2015 +0800 Committer: Cheng Lian l...@databricks.com Committed: Mon Mar 2 23:18:07 2015 +0800 -- .../org/apache/spark/sql/json/JsonRDD.scala | 9 +++-- .../org/apache/spark/sql/json/JsonSuite.scala | 38 ++-- 2 files changed, 23 insertions(+), 24 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3efd8bb6/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala index d83bdc2..e54a2a3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala @@ -199,13 +199,12 @@ private[sql] object JsonRDD extends Logging { * type conflicts. */ private def typeOfArray(l: Seq[Any]): ArrayType = { -val containsNull = l.exists(v = v == null) val elements = l.flatMap(v = Option(v)) if (elements.isEmpty) { // If this JSON array is empty, we use NullType as a placeholder. // If this array is not empty in other JSON objects, we can resolve // the type after we have passed through all JSON objects. - ArrayType(NullType, containsNull) + ArrayType(NullType, containsNull = true) } else { val elementType = elements.map { e = e match { @@ -217,7 +216,7 @@ private[sql] object JsonRDD extends Logging { } }.reduce((type1: DataType, type2: DataType) = compatibleType(type1, type2)) - ArrayType(elementType, containsNull) + ArrayType(elementType, containsNull = true) } } @@ -245,7 +244,7 @@ private[sql] object JsonRDD extends Logging { // The value associated with the key is an array. // Handle inner structs of an array. def buildKeyPathForInnerStructs(v: Any, t: DataType): Seq[(String, DataType)] = t match { - case ArrayType(e: StructType, containsNull) = { + case ArrayType(e: StructType, _) = { // The elements of this arrays are structs. 
            v.asInstanceOf[Seq[Map[String, Any]]].flatMap(Option(_)).flatMap {
              element => allKeysWithValueTypes(element)
@@ -253,7 +252,7 @@ private[sql] object JsonRDD extends Logging {
              case (k, t) => (s"$key.$k", t)
            }
          }
-      case ArrayType(t1, containsNull) =>
+      case ArrayType(t1, _) =>
        v.asInstanceOf[Seq[Any]].flatMap(Option(_)).flatMap { element =>
          buildKeyPathForInnerStructs(element, t1)
        }

http://git-wip-us.apache.org/repos/asf/spark/blob/3efd8bb6/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
index 005f20b..9d94d34 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
@@ -248,26 +248,26 @@ class JsonSuite extends QueryTest {
    val jsonDF = jsonRDD(complexFieldAndType1)

    val expectedSchema = StructType(
-      StructField("arrayOfArray1", ArrayType(ArrayType(StringType, false), false), true) ::
-      StructField("arrayOfArray2", ArrayType(ArrayType(DoubleType, false), false), true) ::
-      StructField("arrayOfBigInteger", ArrayType(DecimalType.Unlimited, false), true) ::
-      StructField("arrayOfBoolean", ArrayType(BooleanType, false), true) ::
-
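To make the sampling concern concrete, a hedged sketch (Spark 1.3-era jsonRDD API, an assumed existing SparkContext `sc`, invented records): if inference only scanned the first record, a containsNull derived from observed data would be false, and later null elements — or a JSON-to-Parquet conversion — could break.

```
// Hedged illustration of why containsNull is now always true for inferred arrays.
import org.apache.spark.sql.SQLContext

val sqlContext = new SQLContext(sc)
val records = sc.parallelize(Seq(
  """{"tags": ["a", "b"]}""",   // a sample made up only of rows like this one...
  """{"tags": ["c", null]}"""   // ...would never see this null element
))

val df = sqlContext.jsonRDD(records)
df.printSchema()
// tags is inferred as ArrayType(StringType, containsNull = true) after this change,
// regardless of which records the inference pass happened to scan.
```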
spark git commit: [SPARK-6111] Fixed usage string in documentation.
Repository: spark Updated Branches: refs/heads/branch-1.3 a3fef2c02 - b2b7f011e [SPARK-6111] Fixed usage string in documentation. Usage info in documentation does not match actual usage info. Doc string usage says ```Usage: network_wordcount.py zk topic``` whereas the actual usage is ```Usage: kafka_wordcount.py zk topic``` Author: Kenneth Myers myer...@us.ibm.com Closes #4852 from kennethmyers/kafka_wordcount_documentation_fix and squashes the following commits: 3855325 [Kenneth Myers] Fixed usage string in documentation. (cherry picked from commit 95ac68bf127b5370c13d6bc15adbda78228829cc) Signed-off-by: Sean Owen so...@cloudera.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b2b7f011 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b2b7f011 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b2b7f011 Branch: refs/heads/branch-1.3 Commit: b2b7f011e3e17ac6263b5cac926a48fe89d4638a Parents: a3fef2c Author: Kenneth Myers myer...@us.ibm.com Authored: Mon Mar 2 17:25:24 2015 + Committer: Sean Owen so...@cloudera.com Committed: Mon Mar 2 17:25:37 2015 + -- examples/src/main/python/streaming/kafka_wordcount.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b2b7f011/examples/src/main/python/streaming/kafka_wordcount.py -- diff --git a/examples/src/main/python/streaming/kafka_wordcount.py b/examples/src/main/python/streaming/kafka_wordcount.py index f82f161..51e1ff8 100644 --- a/examples/src/main/python/streaming/kafka_wordcount.py +++ b/examples/src/main/python/streaming/kafka_wordcount.py @@ -17,7 +17,7 @@ Counts words in UTF8 encoded, '\n' delimited text received from the network every second. - Usage: network_wordcount.py zk topic + Usage: kafka_wordcount.py zk topic To run this on your local machine, you need to setup Kafka and create a producer first, see http://kafka.apache.org/documentation.html#quickstart - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-6080] [PySpark] correct LogisticRegressionWithLBFGS regType parameter for pyspark
Repository: spark Updated Branches: refs/heads/branch-1.3 f47610890 - 4ffaf8568 [SPARK-6080] [PySpark] correct LogisticRegressionWithLBFGS regType parameter for pyspark Currently LogisticRegressionWithLBFGS in python/pyspark/mllib/classification.py will invoke callMLlibFunc with a wrong regType parameter. It was assigned to str(regType) which translate None(Python) to None(Java/Scala). The right way should be translate None(Python) to null(Java/Scala) just as what we did at LogisticRegressionWithSGD. Author: Yanbo Liang yblia...@gmail.com Closes #4831 from yanboliang/pyspark_classification and squashes the following commits: 12db65a [Yanbo Liang] correct LogisticRegressionWithLBFGS regType parameter for pyspark (cherry picked from commit af2effdd7b54316af0c02e781911acfb148b962b) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4ffaf856 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4ffaf856 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4ffaf856 Branch: refs/heads/branch-1.3 Commit: 4ffaf856882fb1f4a5bfc24e5a05c74ba950e282 Parents: f476108 Author: Yanbo Liang yblia...@gmail.com Authored: Mon Mar 2 10:17:24 2015 -0800 Committer: Xiangrui Meng m...@databricks.com Committed: Mon Mar 2 10:17:32 2015 -0800 -- python/pyspark/mllib/classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4ffaf856/python/pyspark/mllib/classification.py -- diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index 00e2e76..e476517 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -207,7 +207,7 @@ class LogisticRegressionWithLBFGS(object): def train(rdd, i): return callMLlibFunc(trainLogisticRegressionModelWithLBFGS, rdd, int(iterations), i, - float(regParam), str(regType), bool(intercept), int(corrections), + float(regParam), regType, bool(intercept), int(corrections), float(tolerance)) return _regression_train_wrapper(train, LogisticRegressionModel, data, initialWeights) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: aggregateMessages example in graphX doc
Repository: spark Updated Branches: refs/heads/master 9ce12aaf2 - e7d8ae444 aggregateMessages example in graphX doc The examples illustrating the difference between the legacy mapReduceTriplets usage and the aggregateMessages usage had type issues in the reduce function for both operators. Being just an example, it now reduces the message String by concatenation, although that is non-optimal for performance. Author: DEBORAH SIEGEL deborahsiegel@DEBORAHs-MacBook-Pro.local Closes #4853 from d3borah/master and squashes the following commits: db54173 [DEBORAH SIEGEL] fixed aggregateMessages example in graphX doc Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e7d8ae44 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e7d8ae44 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e7d8ae44 Branch: refs/heads/master Commit: e7d8ae444fead27fe85a879f2f7a4cfdd8c47b16 Parents: 9ce12aa Author: DEBORAH SIEGEL deborahsiegel@DEBORAHs-MacBook-Pro.local Authored: Mon Mar 2 10:15:32 2015 -0800 Committer: Reynold Xin r...@databricks.com Committed: Mon Mar 2 10:15:32 2015 -0800 -- docs/graphx-programming-guide.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e7d8ae44/docs/graphx-programming-guide.md -- diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index 28bdf81..c601d79 100644
--- a/docs/graphx-programming-guide.md
+++ b/docs/graphx-programming-guide.md
@@ -663,7 +663,7 @@ val graph: Graph[Int, Float] = ...
 def msgFun(triplet: Triplet[Int, Float]): Iterator[(Int, String)] = {
   Iterator((triplet.dstId, "Hi"))
 }
-def reduceFun(a: Int, b: Int): Int = a + b
+def reduceFun(a: String, b: String): String = a + " " + b
 val result = graph.mapReduceTriplets[String](msgFun, reduceFun)
 {% endhighlight %}
@@ -674,7 +674,7 @@ val graph: Graph[Int, Float] = ...
 def msgFun(triplet: EdgeContext[Int, Float, String]) {
   triplet.sendToDst("Hi")
 }
-def reduceFun(a: Int, b: Int): Int = a + b
+def reduceFun(a: String, b: String): String = a + " " + b
 val result = graph.aggregateMessages[String](msgFun, reduceFun)
 {% endhighlight %}

- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
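A small end-to-end sketch of the corrected aggregateMessages snippet, with a tiny invented graph so the String message types line up (assumes a GraphX-enabled SparkContext `sc`; the doc itself starts from an existing Graph[Int, Float]).

```
// Sketch only: build a toy graph and aggregate greeting strings at the destinations.
import org.apache.spark.graphx._

val vertices = sc.parallelize(Seq((1L, 10), (2L, 20), (3L, 30)))
val edges = sc.parallelize(Seq(Edge(1L, 2L, 1.0f), Edge(3L, 2L, 2.0f)))
val graph: Graph[Int, Float] = Graph(vertices, edges)

def msgFun(triplet: EdgeContext[Int, Float, String]): Unit = {
  triplet.sendToDst("Hi")
}
def reduceFun(a: String, b: String): String = a + " " + b

val result = graph.aggregateMessages[String](msgFun, reduceFun)
result.collect().foreach(println)   // vertex 2 receives the merged message "Hi Hi"
```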
spark git commit: [SPARK-5741][SQL] Support the path contains comma in HiveContext
Repository: spark Updated Branches: refs/heads/branch-1.3 b2b7f011e - f47610890 [SPARK-5741][SQL] Support the path contains comma in HiveContext When run ```select * from nzhang_part where hr = 'file,';```, it throws exception ```java.lang.IllegalArgumentException: Can not create a Path from an empty string``` . Because the path of hdfs contains comma, and FileInputFormat.setInputPaths will split path by comma. ### SQL ``` set hive.merge.mapfiles=true; set hive.merge.mapredfiles=true; set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat; set hive.exec.dynamic.partition=true; set hive.exec.dynamic.partition.mode=nonstrict; create table nzhang_part like srcpart; insert overwrite table nzhang_part partition (ds='2010-08-15', hr) select key, value, hr from srcpart where ds='2008-04-08'; insert overwrite table nzhang_part partition (ds='2010-08-15', hr=11) select key, value from srcpart where ds='2008-04-08'; insert overwrite table nzhang_part partition (ds='2010-08-15', hr) select * from ( select key, value, hr from srcpart where ds='2008-04-08' union all select '1' as key, '1' as value, 'file,' as hr from src limit 1) s; select * from nzhang_part where hr = 'file,'; ``` ### Error Log ``` 15/02/10 14:33:16 ERROR SparkSQLDriver: Failed in [select * from nzhang_part where hr = 'file,'] java.lang.IllegalArgumentException: Can not create a Path from an empty string at org.apache.hadoop.fs.Path.checkPathArg(Path.java:127) at org.apache.hadoop.fs.Path.init(Path.java:135) at org.apache.hadoop.util.StringUtils.stringToPath(StringUtils.java:241) at org.apache.hadoop.mapred.FileInputFormat.setInputPaths(FileInputFormat.java:400) at org.apache.spark.sql.hive.HadoopTableReader$.initializeLocalJobConfFunc(TableReader.scala:251) at org.apache.spark.sql.hive.HadoopTableReader$$anonfun$11.apply(TableReader.scala:229) at org.apache.spark.sql.hive.HadoopTableReader$$anonfun$11.apply(TableReader.scala:229) at org.apache.spark.rdd.HadoopRDD$$anonfun$getJobConf$6.apply(HadoopRDD.scala:172) at org.apache.spark.rdd.HadoopRDD$$anonfun$getJobConf$6.apply(HadoopRDD.scala:172) at scala.Option.map(Option.scala:145) at org.apache.spark.rdd.HadoopRDD.getJobConf(HadoopRDD.scala:172) at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:196) Author: q00251598 qiyad...@huawei.com Closes #4532 from watermen/SPARK-5741 and squashes the following commits: 9758ab1 [q00251598] fix bug 1db1a1c [q00251598] use setInputPaths(Job job, Path... 
inputPaths) b788a72 [q00251598] change FileInputFormat.setInputPaths to jobConf.set and add test suite (cherry picked from commit 9ce12aaf283a2793e719bdc956dd858922636e8d) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f4761089 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f4761089 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f4761089 Branch: refs/heads/branch-1.3 Commit: f476108901c42ea61873f02dc2fee15896550d30 Parents: b2b7f01 Author: q00251598 qiyad...@huawei.com Authored: Mon Mar 2 10:13:11 2015 -0800 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Mar 2 10:13:30 2015 -0800 -- .../hive/execution/HiveCompatibilitySuite.scala |1 + .../org/apache/spark/sql/hive/TableReader.scala |2 +- .../merge4-0-b12e5c70d6d29757471b900b6160fa8a |1 + .../merge4-1-593999fae618b6b38322bc9ae4e0c027 |1 + .../merge4-10-692a197bd688b48f762e72978f54aa32 |0 .../merge4-11-f407e661307b23a5d52a08a3e7af19b | 1500 ++ .../merge4-12-62541540a18d68a3cb8497a741061d11 |0 .../merge4-13-ed1103f06609365b40e78d13c654cc71 |0 .../merge4-14-ba5dbcd0527b8ddab284bc322255bfc7 |3 + .../merge4-15-68f50dc2ad6ff803a372bdd88dd8e19a |1 + .../merge4-2-43d53504df013e6b35f81811138a167a |1 + .../merge4-3-a4fb8359a2179ec70777aad6366071b7 |1 + .../merge4-4-16367c381d4b189b3640c92511244bfe |1 + .../merge4-5-3d24d877366c42030f6d9a596665720d |0 .../merge4-6-b3a76420183795720ab3a384046e5af|0 .../merge4-7-631a45828eae3f5f562d992efe4cd56d |0 .../merge4-8-f407e661307b23a5d52a08a3e7af19b| 1000 .../merge4-9-ad3dc168c8b6f048717e39ab16b0a319 |0 18 files changed, 2511 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f4761089/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala -- diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index c6ead45..6126ce7 100644 ---
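The message is truncated before the patch body, so as a hedged aside on the underlying Hadoop behaviour the patch works around (the path below is invented, and this is not the fix in TableReader.scala): the String overload of FileInputFormat.setInputPaths treats its argument as a comma-separated list, which is exactly what the stack trace above shows.

```
// Hedged illustration of the root cause, not the patch itself.
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred.{FileInputFormat, JobConf}

val jobConf = new JobConf()
val partitionPath = "/warehouse/nzhang_part/ds=2010-08-15/hr=file,"

// The String overload splits on commas, so the trailing "," leaves an empty component
// and Path's constructor fails with "Can not create a Path from an empty string":
// FileInputFormat.setInputPaths(jobConf, partitionPath)

// Passing Path objects keeps the partition location intact (commas are escaped when
// the property is written), which is the approach the commit message points at:
FileInputFormat.setInputPaths(jobConf, new Path(partitionPath))
```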