spark git commit: [SPARK-15553][SQL] Dataset.createTempView should use CreateViewCommand

2016-05-27 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 ada319844 -> 36045106d


[SPARK-15553][SQL] Dataset.createTempView should use CreateViewCommand

## What changes were proposed in this pull request?

Let `Dataset.createTempView` and `Dataset.createOrReplaceTempView` use
`CreateViewCommand` rather than calling `SparkSession.createTempView`.
This patch also removes `SparkSession.createTempView`.

## How was this patch tested?
Existing tests.
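
For context, a minimal sketch of the user-facing API whose implementation this patch reroutes through `CreateViewCommand`; the local session, view name, and sample data below are illustrative, not part of the patch:

```
import org.apache.spark.sql.SparkSession

// Illustrative only: build a local session and a small DataFrame.
val spark = SparkSession.builder().master("local[*]").appName("createTempView-sketch").getOrCreate()
import spark.implicits._
val df = Seq((1, "a"), (2, "b")).toDF("id", "value")

// Both entry points now build a CatalogTable and run CreateViewCommand internally.
df.createTempView("people")            // throws AnalysisException if "people" already exists
df.createOrReplaceTempView("people")   // replaces the existing temporary view
spark.sql("SELECT count(*) FROM people").show()
```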

Author: Liang-Chi Hsieh 

Closes #13327 from viirya/dataset-createtempview.

(cherry picked from commit f1b220d1d4d12121fe0b3b175da44488da68)
Signed-off-by: Reynold Xin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/36045106
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/36045106
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/36045106

Branch: refs/heads/branch-2.0
Commit: 36045106d43b3952c55bae4439dbc86892399b3c
Parents: ada3198
Author: Liang-Chi Hsieh 
Authored: Fri May 27 21:24:08 2016 -0700
Committer: Reynold Xin 
Committed: Fri May 27 21:24:14 2016 -0700

--
 .../spark/sql/catalyst/catalog/interface.scala  |  5 +
 .../scala/org/apache/spark/sql/Dataset.scala| 23 +++-
 .../scala/org/apache/spark/sql/SQLContext.scala |  2 +-
 .../org/apache/spark/sql/SparkSession.scala | 11 --
 .../spark/sql/execution/SparkSqlParser.scala| 18 +++
 .../spark/sql/execution/command/cache.scala |  3 +--
 .../spark/sql/execution/command/views.scala |  8 +--
 7 files changed, 39 insertions(+), 31 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/36045106/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 4a073d1..77731b1 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -50,6 +50,11 @@ case class CatalogStorageFormat(
 compressed: Boolean,
 serdeProperties: Map[String, String])
 
+object CatalogStorageFormat {
+  /** Empty storage format for default values and copies. */
+  val EmptyStorageFormat = CatalogStorageFormat(locationUri = None, inputFormat = None,
+    outputFormat = None, serde = None, compressed = false, serdeProperties = Map.empty)
+}
 
 /**
  * A column in a table.

http://git-wip-us.apache.org/repos/asf/spark/blob/36045106/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index abd16f2..7aeec20 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -35,6 +35,7 @@ import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst._
 import org.apache.spark.sql.catalyst.analysis._
+import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.encoders._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.aggregate._
@@ -44,7 +45,7 @@ import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.util.usePrettyExpression
 import org.apache.spark.sql.execution.{FileRelation, LogicalRDD, 
QueryExecution, SQLExecution}
-import org.apache.spark.sql.execution.command.ExplainCommand
+import org.apache.spark.sql.execution.command.{CreateViewCommand, 
ExplainCommand}
 import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, 
LogicalRelation}
 import org.apache.spark.sql.execution.datasources.json.JacksonGenerator
 import org.apache.spark.sql.execution.python.EvaluatePython
@@ -2329,8 +2330,14 @@ class Dataset[T] private[sql](
* @since 2.0.0
*/
   @throws[AnalysisException]
-  def createTempView(viewName: String): Unit = {
-    sparkSession.createTempView(viewName, toDF(), replaceIfExists = false)
+  def createTempView(viewName: String): Unit = withPlan {
+    val tableDesc = CatalogTable(
+      identifier = sparkSession.sessionState.sqlParser.parseTableIdentifier(viewName),
+      tableType = CatalogTableType.VIEW,
+      schema = Seq.empty[CatalogColumn],
+      storage = CatalogStorageFormat.EmptyStorageFormat)
+

spark git commit: [SPARK-15553][SQL] Dataset.createTempView should use CreateViewCommand

2016-05-27 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/master 73178c755 -> f1b220eee


[SPARK-15553][SQL] Dataset.createTempView should use CreateViewCommand

## What changes were proposed in this pull request?

Let `Dataset.createTempView` and `Dataset.createOrReplaceTempView` use
`CreateViewCommand` rather than calling `SparkSession.createTempView`.
This patch also removes `SparkSession.createTempView`.

## How was this patch tested?
Existing tests.

Author: Liang-Chi Hsieh 

Closes #13327 from viirya/dataset-createtempview.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f1b220ee
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f1b220ee
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f1b220ee

Branch: refs/heads/master
Commit: f1b220d1d4d12121fe0b3b175da44488da68
Parents: 73178c7
Author: Liang-Chi Hsieh 
Authored: Fri May 27 21:24:08 2016 -0700
Committer: Reynold Xin 
Committed: Fri May 27 21:24:08 2016 -0700

--
 .../spark/sql/catalyst/catalog/interface.scala  |  5 +
 .../scala/org/apache/spark/sql/Dataset.scala| 23 +++-
 .../scala/org/apache/spark/sql/SQLContext.scala |  2 +-
 .../org/apache/spark/sql/SparkSession.scala | 11 --
 .../spark/sql/execution/SparkSqlParser.scala| 18 +++
 .../spark/sql/execution/command/cache.scala |  3 +--
 .../spark/sql/execution/command/views.scala |  8 +--
 7 files changed, 39 insertions(+), 31 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f1b220ee/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 4a073d1..77731b1 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -50,6 +50,11 @@ case class CatalogStorageFormat(
 compressed: Boolean,
 serdeProperties: Map[String, String])
 
+object CatalogStorageFormat {
+  /** Empty storage format for default values and copies. */
+  val EmptyStorageFormat = CatalogStorageFormat(locationUri = None, inputFormat = None,
+    outputFormat = None, serde = None, compressed = false, serdeProperties = Map.empty)
+}
 
 /**
  * A column in a table.

http://git-wip-us.apache.org/repos/asf/spark/blob/f1b220ee/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index abd16f2..7aeec20 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -35,6 +35,7 @@ import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst._
 import org.apache.spark.sql.catalyst.analysis._
+import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.encoders._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.aggregate._
@@ -44,7 +45,7 @@ import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.util.usePrettyExpression
 import org.apache.spark.sql.execution.{FileRelation, LogicalRDD, 
QueryExecution, SQLExecution}
-import org.apache.spark.sql.execution.command.ExplainCommand
+import org.apache.spark.sql.execution.command.{CreateViewCommand, 
ExplainCommand}
 import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, 
LogicalRelation}
 import org.apache.spark.sql.execution.datasources.json.JacksonGenerator
 import org.apache.spark.sql.execution.python.EvaluatePython
@@ -2329,8 +2330,14 @@ class Dataset[T] private[sql](
* @since 2.0.0
*/
   @throws[AnalysisException]
-  def createTempView(viewName: String): Unit = {
-    sparkSession.createTempView(viewName, toDF(), replaceIfExists = false)
+  def createTempView(viewName: String): Unit = withPlan {
+    val tableDesc = CatalogTable(
+      identifier = sparkSession.sessionState.sqlParser.parseTableIdentifier(viewName),
+      tableType = CatalogTableType.VIEW,
+      schema = Seq.empty[CatalogColumn],
+      storage = CatalogStorageFormat.EmptyStorageFormat)
+    CreateViewCommand(tableDesc, logicalPlan, allowExisting = false, replace = false,
+      isTemporary = true, sql = "")
   }
 
   /**
@@ 

[1/2] spark git commit: [SPARK-15633][MINOR] Make package name for Java tests consistent

2016-05-27 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 3801fb4f3 -> ada319844


http://git-wip-us.apache.org/repos/asf/spark/blob/ada31984/external/java8-tests/src/test/java/test/org/apache/spark/java8/dstream/Java8APISuite.java
--
diff --git 
a/external/java8-tests/src/test/java/test/org/apache/spark/java8/dstream/Java8APISuite.java
 
b/external/java8-tests/src/test/java/test/org/apache/spark/java8/dstream/Java8APISuite.java
new file mode 100644
index 000..cf5607f
--- /dev/null
+++ 
b/external/java8-tests/src/test/java/test/org/apache/spark/java8/dstream/Java8APISuite.java
@@ -0,0 +1,910 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package test.org.apache.spark.java8.dstream;
+
+import java.io.Serializable;
+import java.util.*;
+
+import scala.Tuple2;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.spark.Accumulator;
+import org.apache.spark.HashPartitioner;
+import org.apache.spark.api.java.Optional;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.streaming.*;
+import org.apache.spark.streaming.api.java.JavaDStream;
+import org.apache.spark.streaming.api.java.JavaPairDStream;
+import org.apache.spark.streaming.api.java.JavaMapWithStateDStream;
+
+/**
+ * Most of these tests replicate org.apache.spark.streaming.JavaAPISuite using 
java 8
+ * lambda syntax.
+ */
+@SuppressWarnings("unchecked")
+public class Java8APISuite extends LocalJavaStreamingContext implements Serializable {
+
+  @Test
+  public void testMap() {
+    List<List<String>> inputData = Arrays.asList(
+      Arrays.asList("hello", "world"),
+      Arrays.asList("goodnight", "moon"));
+
+    List<List<Integer>> expected = Arrays.asList(
+      Arrays.asList(5, 5),
+      Arrays.asList(9, 4));
+
+    JavaDStream<String> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
+    JavaDStream<Integer> letterCount = stream.map(String::length);
+    JavaTestUtils.attachTestOutputStream(letterCount);
+    List<List<Integer>> result = JavaTestUtils.runStreams(ssc, 2, 2);
+
+    assertOrderInvariantEquals(expected, result);
+  }
+
+  @Test
+  public void testFilter() {
+    List<List<String>> inputData = Arrays.asList(
+      Arrays.asList("giants", "dodgers"),
+      Arrays.asList("yankees", "red sox"));
+
+    List<List<String>> expected = Arrays.asList(
+      Arrays.asList("giants"),
+      Arrays.asList("yankees"));
+
+    JavaDStream<String> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
+    JavaDStream<String> filtered = stream.filter(s -> s.contains("a"));
+    JavaTestUtils.attachTestOutputStream(filtered);
+    List<List<String>> result = JavaTestUtils.runStreams(ssc, 2, 2);
+
+    assertOrderInvariantEquals(expected, result);
+  }
+
+  @Test
+  public void testMapPartitions() {
+    List<List<String>> inputData = Arrays.asList(
+      Arrays.asList("giants", "dodgers"),
+      Arrays.asList("yankees", "red sox"));
+
+    List<List<String>> expected = Arrays.asList(
+      Arrays.asList("GIANTSDODGERS"),
+      Arrays.asList("YANKEESRED SOX"));
+
+    JavaDStream<String> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
+    JavaDStream<String> mapped = stream.mapPartitions(in -> {
+      String out = "";
+      while (in.hasNext()) {
+        out = out + in.next().toUpperCase();
+      }
+      return Lists.newArrayList(out).iterator();
+    });
+    JavaTestUtils.attachTestOutputStream(mapped);
+    List<List<String>> result = JavaTestUtils.runStreams(ssc, 2, 2);
+
+    Assert.assertEquals(expected, result);
+  }
+
+  @Test
+  public void testReduce() {
+    List<List<Integer>> inputData = Arrays.asList(
+      Arrays.asList(1, 2, 3),
+      Arrays.asList(4, 5, 6),
+      Arrays.asList(7, 8, 9));
+
+    List<List<Integer>> expected = Arrays.asList(
+      Arrays.asList(6),
+      Arrays.asList(15),
+      Arrays.asList(24));
+
+    JavaDStream<Integer> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
+    JavaDStream<Integer> reduced = stream.reduce((x, y) -> x + y);
+    JavaTestUtils.attachTestOutputStream(reduced);
+    List<List<Integer>> result =

[2/2] spark git commit: [SPARK-15633][MINOR] Make package name for Java tests consistent

2016-05-27 Thread rxin
[SPARK-15633][MINOR] Make package name for Java tests consistent

## What changes were proposed in this pull request?
This is a simple patch that makes package names for Java 8 test suites
consistent. I moved everything to test.org.apache.spark so we can test
package-private APIs properly. Also added "java8" to the package name so we can
easily run all the tests related to Java 8.

## How was this patch tested?
This is a test only change.
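
For illustration, a hypothetical Scala suite in the new layout; after this patch `SparkFunSuite` is no longer `private[spark]` and JDK8-oriented tests live under a `java8` package. The suite name and assertion below are made up, not part of the patch:

```
package org.apache.spark.java8

import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}

// Hypothetical suite showing the new package layout; not part of the patch.
class ExampleJdk8Suite extends SparkFunSuite {
  test("lambda-style code against the Scala API") {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("jdk8-example"))
    try {
      // 2 * (1 + 2 + ... + 10) = 110
      assert(sc.parallelize(1 to 10).map(_ * 2).sum() == 110)
    } finally {
      sc.stop()
    }
  }
}
```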

Author: Reynold Xin 

Closes #13364 from rxin/SPARK-15633.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/73178c75
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/73178c75
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/73178c75

Branch: refs/heads/master
Commit: 73178c75565e20f53e6ee1478f3d976732c64438
Parents: 9893dc9
Author: Reynold Xin 
Authored: Fri May 27 21:20:02 2016 -0700
Committer: Reynold Xin 
Committed: Fri May 27 21:20:02 2016 -0700

--
 .../scala/org/apache/spark/SparkFunSuite.scala  |   2 +-
 .../java/org/apache/spark/Java8APISuite.java| 393 
 .../spark/sql/Java8DatasetAggregatorSuite.java  |  61 --
 .../apache/spark/streaming/Java8APISuite.java   | 909 --
 .../apache/spark/java8/Java8RDDAPISuite.java| 395 
 .../spark/java8/dstream/Java8APISuite.java  | 910 +++
 .../java8/sql/Java8DatasetAggregatorSuite.java  |  62 ++
 .../scala/org/apache/spark/JDK8ScalaSuite.scala |  27 -
 .../org/apache/spark/java8/JDK8ScalaSuite.scala |  30 +
 .../spark/sql/JavaDatasetAggregatorSuite.java   | 134 +++
 .../sql/JavaDatasetAggregatorSuiteBase.java |  75 ++
 .../org/apache/spark/sql/JavaSaveLoadSuite.java | 106 +++
 .../sql/sources/JavaDatasetAggregatorSuite.java | 134 ---
 .../sources/JavaDatasetAggregatorSuiteBase.java |  75 --
 .../spark/sql/sources/JavaSaveLoadSuite.java| 106 ---
 15 files changed, 1713 insertions(+), 1706 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/73178c75/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
--
diff --git a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala 
b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
index 0081bca..cd87680 100644
--- a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
@@ -26,7 +26,7 @@ import org.apache.spark.util.AccumulatorContext
 /**
  * Base abstract class for all unit tests in Spark for handling common 
functionality.
  */
-private[spark] abstract class SparkFunSuite
+abstract class SparkFunSuite
   extends FunSuite
   with BeforeAndAfterAll
   with Logging {

http://git-wip-us.apache.org/repos/asf/spark/blob/73178c75/external/java8-tests/src/test/java/org/apache/spark/Java8APISuite.java
--
diff --git 
a/external/java8-tests/src/test/java/org/apache/spark/Java8APISuite.java 
b/external/java8-tests/src/test/java/org/apache/spark/Java8APISuite.java
deleted file mode 100644
index 6ac5ca9..000
--- a/external/java8-tests/src/test/java/org/apache/spark/Java8APISuite.java
+++ /dev/null
@@ -1,393 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark;
-
-import java.io.File;
-import java.io.Serializable;
-import java.util.*;
-
-import scala.Tuple2;
-
-import com.google.common.collect.Iterables;
-import com.google.common.io.Files;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapred.SequenceFileOutputFormat;
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-import org.apache.spark.api.java.JavaDoubleRDD;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.Optional;
-import 

[1/2] spark git commit: [SPARK-15633][MINOR] Make package name for Java tests consistent

2016-05-27 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/master 9893dc975 -> 73178c755


http://git-wip-us.apache.org/repos/asf/spark/blob/73178c75/external/java8-tests/src/test/java/test/org/apache/spark/java8/dstream/Java8APISuite.java
--
diff --git 
a/external/java8-tests/src/test/java/test/org/apache/spark/java8/dstream/Java8APISuite.java
 
b/external/java8-tests/src/test/java/test/org/apache/spark/java8/dstream/Java8APISuite.java
new file mode 100644
index 000..cf5607f
--- /dev/null
+++ 
b/external/java8-tests/src/test/java/test/org/apache/spark/java8/dstream/Java8APISuite.java
@@ -0,0 +1,910 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package test.org.apache.spark.java8.dstream;
+
+import java.io.Serializable;
+import java.util.*;
+
+import scala.Tuple2;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.spark.Accumulator;
+import org.apache.spark.HashPartitioner;
+import org.apache.spark.api.java.Optional;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.streaming.*;
+import org.apache.spark.streaming.api.java.JavaDStream;
+import org.apache.spark.streaming.api.java.JavaPairDStream;
+import org.apache.spark.streaming.api.java.JavaMapWithStateDStream;
+
+/**
+ * Most of these tests replicate org.apache.spark.streaming.JavaAPISuite using 
java 8
+ * lambda syntax.
+ */
+@SuppressWarnings("unchecked")
+public class Java8APISuite extends LocalJavaStreamingContext implements Serializable {
+
+  @Test
+  public void testMap() {
+    List<List<String>> inputData = Arrays.asList(
+      Arrays.asList("hello", "world"),
+      Arrays.asList("goodnight", "moon"));
+
+    List<List<Integer>> expected = Arrays.asList(
+      Arrays.asList(5, 5),
+      Arrays.asList(9, 4));
+
+    JavaDStream<String> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
+    JavaDStream<Integer> letterCount = stream.map(String::length);
+    JavaTestUtils.attachTestOutputStream(letterCount);
+    List<List<Integer>> result = JavaTestUtils.runStreams(ssc, 2, 2);
+
+    assertOrderInvariantEquals(expected, result);
+  }
+
+  @Test
+  public void testFilter() {
+    List<List<String>> inputData = Arrays.asList(
+      Arrays.asList("giants", "dodgers"),
+      Arrays.asList("yankees", "red sox"));
+
+    List<List<String>> expected = Arrays.asList(
+      Arrays.asList("giants"),
+      Arrays.asList("yankees"));
+
+    JavaDStream<String> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
+    JavaDStream<String> filtered = stream.filter(s -> s.contains("a"));
+    JavaTestUtils.attachTestOutputStream(filtered);
+    List<List<String>> result = JavaTestUtils.runStreams(ssc, 2, 2);
+
+    assertOrderInvariantEquals(expected, result);
+  }
+
+  @Test
+  public void testMapPartitions() {
+    List<List<String>> inputData = Arrays.asList(
+      Arrays.asList("giants", "dodgers"),
+      Arrays.asList("yankees", "red sox"));
+
+    List<List<String>> expected = Arrays.asList(
+      Arrays.asList("GIANTSDODGERS"),
+      Arrays.asList("YANKEESRED SOX"));
+
+    JavaDStream<String> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
+    JavaDStream<String> mapped = stream.mapPartitions(in -> {
+      String out = "";
+      while (in.hasNext()) {
+        out = out + in.next().toUpperCase();
+      }
+      return Lists.newArrayList(out).iterator();
+    });
+    JavaTestUtils.attachTestOutputStream(mapped);
+    List<List<String>> result = JavaTestUtils.runStreams(ssc, 2, 2);
+
+    Assert.assertEquals(expected, result);
+  }
+
+  @Test
+  public void testReduce() {
+    List<List<Integer>> inputData = Arrays.asList(
+      Arrays.asList(1, 2, 3),
+      Arrays.asList(4, 5, 6),
+      Arrays.asList(7, 8, 9));
+
+    List<List<Integer>> expected = Arrays.asList(
+      Arrays.asList(6),
+      Arrays.asList(15),
+      Arrays.asList(24));
+
+    JavaDStream<Integer> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
+    JavaDStream<Integer> reduced = stream.reduce((x, y) -> x + y);
+    JavaTestUtils.attachTestOutputStream(reduced);
+    List<List<Integer>> result =

spark git commit: [SPARK-15610][ML] update error message for k in pca

2016-05-27 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/master 88c9c467a -> 9893dc975


[SPARK-15610][ML] update error message for k in pca

## What changes were proposed in this pull request?
Fix the misleading error message for the bound of `k` in `PCA`: the check is
`require(k <= sources.first().size, ...)`, but the old message said the source vector
size "must be greater than k". The message now states that the source vector size
must be no less than k.

Also remove an unused import in `ml.ElementwiseProduct`.

## How was this patch tested?

manual tests
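
For reference, a minimal sketch of how the corrected message surfaces to a caller; the local context and toy data are illustrative:

```
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors

val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("pca-k-check"))
// Each source vector has 3 features.
val sources = sc.parallelize(Seq(
  Vectors.dense(1.0, 2.0, 3.0),
  Vectors.dense(4.0, 5.0, 6.0)))

val model = new PCA(2).fit(sources)   // fine: k = 2 <= 3
// new PCA(4).fit(sources)            // fails with "source vector size 3 must be no less than k=4"
```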

Author: Zheng RuiFeng 

Closes #13356 from zhengruifeng/fix_pca.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9893dc97
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9893dc97
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9893dc97

Branch: refs/heads/master
Commit: 9893dc975784551a62f65bbd709f8972e0204b2a
Parents: 88c9c46
Author: Zheng RuiFeng 
Authored: Fri May 27 21:57:41 2016 -0500
Committer: Sean Owen 
Committed: Fri May 27 21:57:41 2016 -0500

--
 .../scala/org/apache/spark/ml/feature/ElementwiseProduct.scala | 1 -
 mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala  | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9893dc97/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
index 91989c3..9d2e60f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
@@ -23,7 +23,6 @@ import org.apache.spark.ml.linalg.{Vector, VectorUDT}
 import org.apache.spark.ml.param.Param
 import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, 
Identifiable}
 import org.apache.spark.mllib.feature
-import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
 import org.apache.spark.mllib.linalg.VectorImplicits._
 import org.apache.spark.sql.types.DataType
 

http://git-wip-us.apache.org/repos/asf/spark/blob/9893dc97/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
index 30c403e..15b7220 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
@@ -40,8 +40,9 @@ class PCA @Since("1.4.0") (@Since("1.4.0") val k: Int) {
*/
   @Since("1.4.0")
   def fit(sources: RDD[Vector]): PCAModel = {
-    require(k <= sources.first().size,
-      s"source vector size is ${sources.first().size} must be greater than k=$k")
+    val numFeatures = sources.first().size
+    require(k <= numFeatures,
+      s"source vector size $numFeatures must be no less than k=$k")
 
 val mat = new RowMatrix(sources)
 val (pc, explainedVariance) = 
mat.computePrincipalComponentsAndExplainedVariance(k)
@@ -58,7 +59,6 @@ class PCA @Since("1.4.0") (@Since("1.4.0") val k: Int) {
   case m =>
 throw new IllegalArgumentException("Unsupported matrix format. 
Expected " +
   s"SparseMatrix or DenseMatrix. Instead got: ${m.getClass}")
-
 }
 val denseExplainedVariance = explainedVariance match {
   case dv: DenseVector =>


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-15610][ML] update error message for k in pca

2016-05-27 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 6d82e0c1b -> 3801fb4f3


[SPARK-15610][ML] update error message for k in pca

## What changes were proposed in this pull request?
Fix the misleading error message for the bound of `k` in `PCA`: the check is
`require(k <= sources.first().size, ...)`, but the old message said the source vector
size "must be greater than k". The message now states that the source vector size
must be no less than k.

Also remove an unused import in `ml.ElementwiseProduct`.

## How was this patch tested?

manual tests

Author: Zheng RuiFeng 

Closes #13356 from zhengruifeng/fix_pca.

(cherry picked from commit 9893dc975784551a62f65bbd709f8972e0204b2a)
Signed-off-by: Sean Owen 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3801fb4f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3801fb4f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3801fb4f

Branch: refs/heads/branch-2.0
Commit: 3801fb4f35ba1ffb8dbaf8326eff927b738551f2
Parents: 6d82e0c
Author: Zheng RuiFeng 
Authored: Fri May 27 21:57:41 2016 -0500
Committer: Sean Owen 
Committed: Fri May 27 21:57:48 2016 -0500

--
 .../scala/org/apache/spark/ml/feature/ElementwiseProduct.scala | 1 -
 mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala  | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/3801fb4f/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
index 91989c3..9d2e60f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
@@ -23,7 +23,6 @@ import org.apache.spark.ml.linalg.{Vector, VectorUDT}
 import org.apache.spark.ml.param.Param
 import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, 
Identifiable}
 import org.apache.spark.mllib.feature
-import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
 import org.apache.spark.mllib.linalg.VectorImplicits._
 import org.apache.spark.sql.types.DataType
 

http://git-wip-us.apache.org/repos/asf/spark/blob/3801fb4f/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
index 30c403e..15b7220 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
@@ -40,8 +40,9 @@ class PCA @Since("1.4.0") (@Since("1.4.0") val k: Int) {
*/
   @Since("1.4.0")
   def fit(sources: RDD[Vector]): PCAModel = {
-    require(k <= sources.first().size,
-      s"source vector size is ${sources.first().size} must be greater than k=$k")
+    val numFeatures = sources.first().size
+    require(k <= numFeatures,
+      s"source vector size $numFeatures must be no less than k=$k")
 
 val mat = new RowMatrix(sources)
 val (pc, explainedVariance) = 
mat.computePrincipalComponentsAndExplainedVariance(k)
@@ -58,7 +59,6 @@ class PCA @Since("1.4.0") (@Since("1.4.0") val k: Int) {
   case m =>
 throw new IllegalArgumentException("Unsupported matrix format. 
Expected " +
   s"SparseMatrix or DenseMatrix. Instead got: ${m.getClass}")
-
 }
 val denseExplainedVariance = explainedVariance match {
   case dv: DenseVector =>


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-15562][ML] Delete temp directory after program exit in DataFrameExample

2016-05-27 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/master 5d4dafe8f -> 88c9c467a


[SPARK-15562][ML] Delete temp directory after program exit in DataFrameExample

## What changes were proposed in this pull request?
The temp directory used to save records is not deleted after program exit in
DataFrameExample. Although deleteOnExit is called, it has no effect because the
directory is not empty. The same issue exists in ContextCleanerSuite. Update
the code to make sure the temp directory is deleted after program exit.

## How was this patch tested?

unit tests and local build.
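
A minimal sketch of the pattern being adopted, under the assumption that `Utils.createTempDir()` registers the directory for recursive deletion on JVM shutdown (it is `private[spark]`, so this only compiles inside an `org.apache.spark` subpackage); the package name and file contents are illustrative:

```
package org.apache.spark.examples.sketch

import java.io.File
import com.google.common.io.Files
import org.apache.spark.util.Utils

object TempDirCleanupSketch {
  def main(args: Array[String]): Unit = {
    // Before: deleteOnExit() on a temp dir is a no-op once files are written into it.
    val leaky = Files.createTempDir()
    leaky.deleteOnExit()

    // After: Utils.createTempDir() is removed recursively by a shutdown hook, even when non-empty.
    val tmpDir = Utils.createTempDir()
    Files.write("some records".getBytes("UTF-8"), new File(tmpDir, "records.txt"))
    println(s"wrote into $tmpDir; it will be deleted on exit")
  }
}
```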

Author: dding3 

Closes #13328 from dding3/master.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/88c9c467
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/88c9c467
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/88c9c467

Branch: refs/heads/master
Commit: 88c9c467a31630c558719679ca0894873a268b27
Parents: 5d4dafe
Author: dding3 
Authored: Fri May 27 21:01:50 2016 -0500
Committer: Sean Owen 
Committed: Fri May 27 21:01:50 2016 -0500

--
 core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala   | 4 ++--
 .../scala/org/apache/spark/examples/ml/DataFrameExample.scala| 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/88c9c467/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
--
diff --git a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala 
b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
index 69ff6c7..6724af9 100644
--- a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
@@ -32,6 +32,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.{RDD, ReliableRDDCheckpointData}
 import org.apache.spark.shuffle.sort.SortShuffleManager
 import org.apache.spark.storage._
+import org.apache.spark.util.Utils
 
 /**
  * An abstract base class for context cleaner tests, which sets up a context 
with a config
@@ -206,8 +207,7 @@ class ContextCleanerSuite extends ContextCleanerSuiteBase {
   }
 
   test("automatically cleanup normal checkpoint") {
-val checkpointDir = java.io.File.createTempFile("temp", "")
-checkpointDir.deleteOnExit()
+val checkpointDir = Utils.createTempDir()
 checkpointDir.delete()
 var rdd = newPairRDD()
 sc.setCheckpointDir(checkpointDir.toString)

http://git-wip-us.apache.org/repos/asf/spark/blob/88c9c467/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala 
b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
index c69027b..11faa61 100644
--- 
a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
@@ -28,6 +28,7 @@ import org.apache.spark.ml.linalg.Vector
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
 import org.apache.spark.sql.{DataFrame, Row, SparkSession}
+import org.apache.spark.util.Utils
 
 /**
  * An example of how to use [[org.apache.spark.sql.DataFrame]] for ML. Run with
@@ -86,8 +87,7 @@ object DataFrameExample {
 println(s"Selected features column with average values:\n 
${featureSummary.mean.toString}")
 
 // Save the records in a parquet file.
-val tmpDir = Files.createTempDir()
-tmpDir.deleteOnExit()
+val tmpDir = Utils.createTempDir()
 val outputDir = new File(tmpDir, "dataframe").toString
 println(s"Saving to $outputDir as Parquet file.")
 df.write.parquet(outputDir)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-15562][ML] Delete temp directory after program exit in DataFrameExample

2016-05-27 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 8467e2102 -> 6d82e0c1b


[SPARK-15562][ML] Delete temp directory after program exit in DataFrameExample

## What changes were proposed in this pull request?
The temp directory used to save records is not deleted after program exit in
DataFrameExample. Although deleteOnExit is called, it has no effect because the
directory is not empty. The same issue exists in ContextCleanerSuite. Update
the code to make sure the temp directory is deleted after program exit.

## How was this patch tested?

unit tests and local build.

Author: dding3 

Closes #13328 from dding3/master.

(cherry picked from commit 88c9c467a31630c558719679ca0894873a268b27)
Signed-off-by: Sean Owen 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6d82e0c1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6d82e0c1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6d82e0c1

Branch: refs/heads/branch-2.0
Commit: 6d82e0c1b8b4368e91aeebfc80430a61762c7e88
Parents: 8467e21
Author: dding3 
Authored: Fri May 27 21:01:50 2016 -0500
Committer: Sean Owen 
Committed: Fri May 27 21:01:56 2016 -0500

--
 core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala   | 4 ++--
 .../scala/org/apache/spark/examples/ml/DataFrameExample.scala| 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6d82e0c1/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
--
diff --git a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala 
b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
index 69ff6c7..6724af9 100644
--- a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
@@ -32,6 +32,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.{RDD, ReliableRDDCheckpointData}
 import org.apache.spark.shuffle.sort.SortShuffleManager
 import org.apache.spark.storage._
+import org.apache.spark.util.Utils
 
 /**
  * An abstract base class for context cleaner tests, which sets up a context 
with a config
@@ -206,8 +207,7 @@ class ContextCleanerSuite extends ContextCleanerSuiteBase {
   }
 
   test("automatically cleanup normal checkpoint") {
-val checkpointDir = java.io.File.createTempFile("temp", "")
-checkpointDir.deleteOnExit()
+val checkpointDir = Utils.createTempDir()
 checkpointDir.delete()
 var rdd = newPairRDD()
 sc.setCheckpointDir(checkpointDir.toString)

http://git-wip-us.apache.org/repos/asf/spark/blob/6d82e0c1/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala 
b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
index c69027b..11faa61 100644
--- 
a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
@@ -28,6 +28,7 @@ import org.apache.spark.ml.linalg.Vector
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
 import org.apache.spark.sql.{DataFrame, Row, SparkSession}
+import org.apache.spark.util.Utils
 
 /**
  * An example of how to use [[org.apache.spark.sql.DataFrame]] for ML. Run with
@@ -86,8 +87,7 @@ object DataFrameExample {
 println(s"Selected features column with average values:\n 
${featureSummary.mean.toString}")
 
 // Save the records in a parquet file.
-val tmpDir = Files.createTempDir()
-tmpDir.deleteOnExit()
+val tmpDir = Utils.createTempDir()
 val outputDir = new File(tmpDir, "dataframe").toString
 println(s"Saving to $outputDir as Parquet file.")
 df.write.parquet(outputDir)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-15449][MLLIB][EXAMPLE] Wrong Data Format - Documentation Issue

2016-05-27 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 80a40e8e2 -> 8467e2102


[SPARK-15449][MLLIB][EXAMPLE] Wrong Data Format - Documentation Issue

## What changes were proposed in this pull request?

In the MLlib naive Bayes example, the Scala and Python versions do not use libsvm
data, but the Java version does.

This change updates the Scala and Python examples to use the same libsvm data as
the Java example.

## How was this patch tested?

Manual tests
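
For reference, a minimal Scala sketch of the libsvm-based flow the Scala and Python examples now share with the Java one; the local master and accuracy computation are illustrative:

```
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils

val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("NaiveBayesLibsvmSketch"))

// Load and parse the data file in LIBSVM format, as the Java example already does.
val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

// Split the data approximately into training (60%) and test (40%).
val Array(training, test) = data.randomSplit(Array(0.6, 0.4))

val model = NaiveBayes.train(training, lambda = 1.0)
val accuracy = test.map(p => (model.predict(p.features), p.label))
  .filter { case (prediction, label) => prediction == label }
  .count().toDouble / test.count()
println(s"Test accuracy = $accuracy")
```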

Author: wm...@hotmail.com 

Closes #13301 from wangmiao1981/example.

(cherry picked from commit 5d4dafe8fdea49dcbd6b0e4c23e3791fa30c8911)
Signed-off-by: Sean Owen 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8467e210
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8467e210
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8467e210

Branch: refs/heads/branch-2.0
Commit: 8467e2102886da1cefb43f2aaa69864375fe91bc
Parents: 80a40e8
Author: wm...@hotmail.com 
Authored: Fri May 27 20:59:24 2016 -0500
Committer: Sean Owen 
Committed: Fri May 27 20:59:34 2016 -0500

--
 data/mllib/sample_naive_bayes_data.txt| 12 
 .../spark/examples/mllib/JavaNaiveBayesExample.java   |  4 ++--
 examples/src/main/python/mllib/naive_bayes_example.py | 13 -
 .../spark/examples/mllib/NaiveBayesExample.scala  | 14 --
 4 files changed, 10 insertions(+), 33 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/8467e210/data/mllib/sample_naive_bayes_data.txt
--
diff --git a/data/mllib/sample_naive_bayes_data.txt 
b/data/mllib/sample_naive_bayes_data.txt
deleted file mode 100644
index bd22bea..000
--- a/data/mllib/sample_naive_bayes_data.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-0,1 0 0
-0,2 0 0
-0,3 0 0
-0,4 0 0
-1,0 1 0
-1,0 2 0
-1,0 3 0
-1,0 4 0
-2,0 0 1
-2,0 0 2
-2,0 0 3
-2,0 0 4
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/8467e210/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
index 2b17dbb..f4ec04b 100644
--- 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
+++ 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
@@ -36,9 +36,9 @@ public class JavaNaiveBayesExample {
     SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample");
     JavaSparkContext jsc = new JavaSparkContext(sparkConf);
     // $example on$
-    String path = "data/mllib/sample_naive_bayes_data.txt";
+    String path = "data/mllib/sample_libsvm_data.txt";
     JavaRDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
-    JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4}, 12345);
+    JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4});
     JavaRDD<LabeledPoint> training = tmp[0]; // training set
     JavaRDD<LabeledPoint> test = tmp[1]; // test set
     final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);

http://git-wip-us.apache.org/repos/asf/spark/blob/8467e210/examples/src/main/python/mllib/naive_bayes_example.py
--
diff --git a/examples/src/main/python/mllib/naive_bayes_example.py 
b/examples/src/main/python/mllib/naive_bayes_example.py
index 35724f7..749353b 100644
--- a/examples/src/main/python/mllib/naive_bayes_example.py
+++ b/examples/src/main/python/mllib/naive_bayes_example.py
@@ -29,15 +29,9 @@ import shutil
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
-from pyspark.mllib.linalg import Vectors
-from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.util import MLUtils
 
 
-def parseLine(line):
-parts = line.split(',')
-label = float(parts[0])
-features = Vectors.dense([float(x) for x in parts[1].split(' ')])
-return LabeledPoint(label, features)
 # $example off$
 
 if __name__ == "__main__":
@@ -45,10 +39,11 @@ if __name__ == "__main__":
 sc = SparkContext(appName="PythonNaiveBayesExample")
 
 # $example on$
-data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine)
+# Load and parse the data file.
+data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
 
 # Split data approximately into training (60%) and test (40%)
-training, test = data.randomSplit([0.6, 0.4], seed=0)
+  

spark git commit: [SPARK-15449][MLLIB][EXAMPLE] Wrong Data Format - Documentation Issue

2016-05-27 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/master 4a2fb8b87 -> 5d4dafe8f


[SPARK-15449][MLLIB][EXAMPLE] Wrong Data Format - Documentation Issue

## What changes were proposed in this pull request?

In the MLlib naive Bayes example, the Scala and Python versions do not use libsvm
data, but the Java version does.

This change updates the Scala and Python examples to use the same libsvm data as
the Java example.

## How was this patch tested?

Manual tests

Author: wm...@hotmail.com 

Closes #13301 from wangmiao1981/example.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5d4dafe8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5d4dafe8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5d4dafe8

Branch: refs/heads/master
Commit: 5d4dafe8fdea49dcbd6b0e4c23e3791fa30c8911
Parents: 4a2fb8b
Author: wm...@hotmail.com 
Authored: Fri May 27 20:59:24 2016 -0500
Committer: Sean Owen 
Committed: Fri May 27 20:59:24 2016 -0500

--
 data/mllib/sample_naive_bayes_data.txt| 12 
 .../spark/examples/mllib/JavaNaiveBayesExample.java   |  4 ++--
 examples/src/main/python/mllib/naive_bayes_example.py | 13 -
 .../spark/examples/mllib/NaiveBayesExample.scala  | 14 --
 4 files changed, 10 insertions(+), 33 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5d4dafe8/data/mllib/sample_naive_bayes_data.txt
--
diff --git a/data/mllib/sample_naive_bayes_data.txt 
b/data/mllib/sample_naive_bayes_data.txt
deleted file mode 100644
index bd22bea..000
--- a/data/mllib/sample_naive_bayes_data.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-0,1 0 0
-0,2 0 0
-0,3 0 0
-0,4 0 0
-1,0 1 0
-1,0 2 0
-1,0 3 0
-1,0 4 0
-2,0 0 1
-2,0 0 2
-2,0 0 3
-2,0 0 4
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/5d4dafe8/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
index 2b17dbb..f4ec04b 100644
--- 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
+++ 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
@@ -36,9 +36,9 @@ public class JavaNaiveBayesExample {
     SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample");
     JavaSparkContext jsc = new JavaSparkContext(sparkConf);
     // $example on$
-    String path = "data/mllib/sample_naive_bayes_data.txt";
+    String path = "data/mllib/sample_libsvm_data.txt";
     JavaRDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
-    JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4}, 12345);
+    JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4});
     JavaRDD<LabeledPoint> training = tmp[0]; // training set
     JavaRDD<LabeledPoint> test = tmp[1]; // test set
     final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);

http://git-wip-us.apache.org/repos/asf/spark/blob/5d4dafe8/examples/src/main/python/mllib/naive_bayes_example.py
--
diff --git a/examples/src/main/python/mllib/naive_bayes_example.py 
b/examples/src/main/python/mllib/naive_bayes_example.py
index 35724f7..749353b 100644
--- a/examples/src/main/python/mllib/naive_bayes_example.py
+++ b/examples/src/main/python/mllib/naive_bayes_example.py
@@ -29,15 +29,9 @@ import shutil
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
-from pyspark.mllib.linalg import Vectors
-from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.util import MLUtils
 
 
-def parseLine(line):
-parts = line.split(',')
-label = float(parts[0])
-features = Vectors.dense([float(x) for x in parts[1].split(' ')])
-return LabeledPoint(label, features)
 # $example off$
 
 if __name__ == "__main__":
@@ -45,10 +39,11 @@ if __name__ == "__main__":
 sc = SparkContext(appName="PythonNaiveBayesExample")
 
 # $example on$
-data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine)
+# Load and parse the data file.
+data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
 
 # Split data approximately into training (60%) and test (40%)
-training, test = data.randomSplit([0.6, 0.4], seed=0)
+training, test = data.randomSplit([0.6, 0.4])
 
 # Train a naive Bayes model.
 model = NaiveBayes.train(training, 

spark git commit: [SPARK-15594][SQL] ALTER TABLE SERDEPROPERTIES does not respect partition spec

2016-05-27 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/master 776d183c8 -> 4a2fb8b87


[SPARK-15594][SQL] ALTER TABLE SERDEPROPERTIES does not respect partition spec

## What changes were proposed in this pull request?

These commands ignore the partition spec and change the storage properties of 
the table itself:
```
ALTER TABLE table_name PARTITION (a=1, b=2) SET SERDE 'my_serde'
ALTER TABLE table_name PARTITION (a=1, b=2) SET SERDEPROPERTIES ('key1'='val1')
```
Now they change the storage properties of the specified partition.

## How was this patch tested?

DDLSuite
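
A minimal sketch of exercising the corrected behaviour through the SQL API; the table, partition, and serde property names are illustrative, and Hive support is assumed since serde changes are rejected for datasource tables:

```
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("alter-partition-serde-sketch")
  .enableHiveSupport()   // serde changes are not supported for datasource tables
  .getOrCreate()

spark.sql("CREATE TABLE logs (msg STRING) PARTITIONED BY (a INT, b INT)")
spark.sql("ALTER TABLE logs ADD PARTITION (a=1, b=2)")

// Before this patch the partition spec was ignored and the table's storage changed;
// now only partition (a=1, b=2) gets the new serde properties.
spark.sql("ALTER TABLE logs PARTITION (a=1, b=2) SET SERDEPROPERTIES ('key1'='val1')")
```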

Author: Andrew Or 

Closes #13343 from andrewor14/alter-table-serdeproperties.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4a2fb8b8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4a2fb8b8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4a2fb8b8

Branch: refs/heads/master
Commit: 4a2fb8b87ca4517e0f4a1d7a1a1b3c08c1c1294d
Parents: 776d183
Author: Andrew Or 
Authored: Fri May 27 17:27:24 2016 -0700
Committer: Yin Huai 
Committed: Fri May 27 17:27:24 2016 -0700

--
 .../spark/sql/execution/command/ddl.scala   | 26 ++--
 .../spark/sql/execution/command/DDLSuite.scala  | 64 
 2 files changed, 84 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4a2fb8b8/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index 95bac94..5fd0b83 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -293,7 +293,7 @@ case class AlterTableSerDePropertiesCommand(
 tableName: TableIdentifier,
 serdeClassName: Option[String],
 serdeProperties: Option[Map[String, String]],
-partition: Option[Map[String, String]])
+partSpec: Option[TablePartitionSpec])
   extends RunnableCommand {
 
   // should never happen if we parsed things correctly
@@ -306,15 +306,29 @@ case class AlterTableSerDePropertiesCommand(
   "ALTER TABLE SERDEPROPERTIES")
 val catalog = sparkSession.sessionState.catalog
 val table = catalog.getTableMetadata(tableName)
-// Do not support setting serde for datasource tables
+// For datasource tables, disallow setting serde or specifying partition
+if (partSpec.isDefined && DDLUtils.isDatasourceTable(table)) {
+  throw new AnalysisException("Operation not allowed: ALTER TABLE SET " +
+"[SERDE | SERDEPROPERTIES] for a specific partition is not supported " 
+
+"for tables created with the datasource API")
+}
 if (serdeClassName.isDefined && DDLUtils.isDatasourceTable(table)) {
   throw new AnalysisException("Operation not allowed: ALTER TABLE SET 
SERDE is " +
 "not supported for tables created with the datasource API")
 }
-val newTable = table.withNewStorage(
-  serde = serdeClassName.orElse(table.storage.serde),
-  serdeProperties = table.storage.serdeProperties ++ 
serdeProperties.getOrElse(Map()))
-catalog.alterTable(newTable)
+if (partSpec.isEmpty) {
+  val newTable = table.withNewStorage(
+serde = serdeClassName.orElse(table.storage.serde),
+serdeProperties = table.storage.serdeProperties ++ 
serdeProperties.getOrElse(Map()))
+  catalog.alterTable(newTable)
+} else {
+  val spec = partSpec.get
+  val part = catalog.getPartition(tableName, spec)
+  val newPart = part.copy(storage = part.storage.copy(
+serde = serdeClassName.orElse(part.storage.serde),
+serdeProperties = part.storage.serdeProperties ++ 
serdeProperties.getOrElse(Map(
+  catalog.alterPartitions(tableName, Seq(newPart))
+}
 Seq.empty[Row]
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/4a2fb8b8/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
--
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
index ccb4006..5d45cfb 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
@@ -538,6 +538,14 @@ class DDLSuite extends QueryTest with SharedSQLContext 
with BeforeAndAfterEach {
 testSetSerde(isDatasourceTable = true)
   }
 
+  test("alter table: set serde 

spark git commit: [SPARK-15594][SQL] ALTER TABLE SERDEPROPERTIES does not respect partition spec

2016-05-27 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 dc6e94157 -> 80a40e8e2


[SPARK-15594][SQL] ALTER TABLE SERDEPROPERTIES does not respect partition spec

## What changes were proposed in this pull request?

These commands ignore the partition spec and change the storage properties of 
the table itself:
```
ALTER TABLE table_name PARTITION (a=1, b=2) SET SERDE 'my_serde'
ALTER TABLE table_name PARTITION (a=1, b=2) SET SERDEPROPERTIES ('key1'='val1')
```
Now they change the storage properties of the specified partition.

## How was this patch tested?

DDLSuite

Author: Andrew Or 

Closes #13343 from andrewor14/alter-table-serdeproperties.

(cherry picked from commit 4a2fb8b87ca4517e0f4a1d7a1a1b3c08c1c1294d)
Signed-off-by: Yin Huai 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/80a40e8e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/80a40e8e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/80a40e8e

Branch: refs/heads/branch-2.0
Commit: 80a40e8e2cc198c34dabbc431d4ca302319fbbad
Parents: dc6e941
Author: Andrew Or 
Authored: Fri May 27 17:27:24 2016 -0700
Committer: Yin Huai 
Committed: Fri May 27 17:27:38 2016 -0700

--
 .../spark/sql/execution/command/ddl.scala   | 26 ++--
 .../spark/sql/execution/command/DDLSuite.scala  | 64 
 2 files changed, 84 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/80a40e8e/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index 95bac94..5fd0b83 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -293,7 +293,7 @@ case class AlterTableSerDePropertiesCommand(
 tableName: TableIdentifier,
 serdeClassName: Option[String],
 serdeProperties: Option[Map[String, String]],
-partition: Option[Map[String, String]])
+partSpec: Option[TablePartitionSpec])
   extends RunnableCommand {
 
   // should never happen if we parsed things correctly
@@ -306,15 +306,29 @@ case class AlterTableSerDePropertiesCommand(
   "ALTER TABLE SERDEPROPERTIES")
 val catalog = sparkSession.sessionState.catalog
 val table = catalog.getTableMetadata(tableName)
-// Do not support setting serde for datasource tables
+// For datasource tables, disallow setting serde or specifying partition
+if (partSpec.isDefined && DDLUtils.isDatasourceTable(table)) {
+  throw new AnalysisException("Operation not allowed: ALTER TABLE SET " +
+"[SERDE | SERDEPROPERTIES] for a specific partition is not supported " 
+
+"for tables created with the datasource API")
+}
 if (serdeClassName.isDefined && DDLUtils.isDatasourceTable(table)) {
   throw new AnalysisException("Operation not allowed: ALTER TABLE SET 
SERDE is " +
 "not supported for tables created with the datasource API")
 }
-val newTable = table.withNewStorage(
-  serde = serdeClassName.orElse(table.storage.serde),
-  serdeProperties = table.storage.serdeProperties ++ 
serdeProperties.getOrElse(Map()))
-catalog.alterTable(newTable)
+if (partSpec.isEmpty) {
+  val newTable = table.withNewStorage(
+serde = serdeClassName.orElse(table.storage.serde),
+serdeProperties = table.storage.serdeProperties ++ 
serdeProperties.getOrElse(Map()))
+  catalog.alterTable(newTable)
+} else {
+  val spec = partSpec.get
+  val part = catalog.getPartition(tableName, spec)
+  val newPart = part.copy(storage = part.storage.copy(
+serde = serdeClassName.orElse(part.storage.serde),
+serdeProperties = part.storage.serdeProperties ++ 
serdeProperties.getOrElse(Map(
+  catalog.alterPartitions(tableName, Seq(newPart))
+}
 Seq.empty[Row]
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/80a40e8e/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
--
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
index ccb4006..5d45cfb 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
@@ -538,6 +538,14 @@ class DDLSuite extends QueryTest with 

spark git commit: [SPARK-9876][SQL] Update Parquet to 1.8.1.

2016-05-27 Thread lian
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 9c137b2e3 -> dc6e94157


[SPARK-9876][SQL] Update Parquet to 1.8.1.

## What changes were proposed in this pull request?

This includes minimal changes to get Spark using the current release of 
Parquet, 1.8.1.

## How was this patch tested?

This uses the existing Parquet tests.
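As a side note for downstream builds, here is a minimal sketch (an assumption about a hypothetical sbt 0.13 project, not part of this patch) of pinning the Parquet artifacts so they match the 1.8.1 entries in the dependency manifests below:

```scala
// build.sbt fragment for a hypothetical downstream project: keep Parquet on the
// same 1.8.1 line that Spark now ships, to avoid mixing 1.7.0 and 1.8.1 jars.
dependencyOverrides ++= Set(
  "org.apache.parquet" % "parquet-column" % "1.8.1",
  "org.apache.parquet" % "parquet-hadoop" % "1.8.1"
)
```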

Author: Ryan Blue 

Closes #13280 from rdblue/SPARK-9876-update-parquet.

(cherry picked from commit 776d183c82b424ef7c3cae30537d8afe9b9eee83)
Signed-off-by: Cheng Lian 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dc6e9415
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dc6e9415
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dc6e9415

Branch: refs/heads/branch-2.0
Commit: dc6e94157ce08df91aa1a31db8e5ec733a1ab0c5
Parents: 9c137b2
Author: Ryan Blue 
Authored: Fri May 27 16:59:38 2016 -0700
Committer: Cheng Lian 
Committed: Fri May 27 16:59:50 2016 -0700

--
 dev/deps/spark-deps-hadoop-2.2  | 11 ++-
 dev/deps/spark-deps-hadoop-2.3  | 11 ++-
 dev/deps/spark-deps-hadoop-2.4  | 11 ++-
 dev/deps/spark-deps-hadoop-2.6  | 11 ++-
 dev/deps/spark-deps-hadoop-2.7  | 11 ++-
 pom.xml |  2 +-
 .../SpecificParquetRecordReaderBase.java| 20 +++--
 .../parquet/CatalystReadSupport.scala   | 12 ++-
 .../parquet/CatalystSchemaConverter.scala   | 16 
 .../datasources/parquet/ParquetFilters.scala| 83 
 .../parquet/ParquetSchemaSuite.scala| 20 +++--
 11 files changed, 91 insertions(+), 117 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/dc6e9415/dev/deps/spark-deps-hadoop-2.2
--
diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index 578691c..deec033 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -129,14 +129,13 @@ opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
-parquet-column-1.7.0.jar
-parquet-common-1.7.0.jar
-parquet-encoding-1.7.0.jar
+parquet-column-1.8.1.jar
+parquet-common-1.8.1.jar
+parquet-encoding-1.8.1.jar
 parquet-format-2.3.0-incubating.jar
-parquet-generator-1.7.0.jar
-parquet-hadoop-1.7.0.jar
+parquet-hadoop-1.8.1.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.7.0.jar
+parquet-jackson-1.8.1.jar
 pmml-model-1.2.15.jar
 pmml-schema-1.2.15.jar
 protobuf-java-2.5.0.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/dc6e9415/dev/deps/spark-deps-hadoop-2.3
--
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index fc6306f..43c7dd3 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -136,14 +136,13 @@ opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
-parquet-column-1.7.0.jar
-parquet-common-1.7.0.jar
-parquet-encoding-1.7.0.jar
+parquet-column-1.8.1.jar
+parquet-common-1.8.1.jar
+parquet-encoding-1.8.1.jar
 parquet-format-2.3.0-incubating.jar
-parquet-generator-1.7.0.jar
-parquet-hadoop-1.7.0.jar
+parquet-hadoop-1.8.1.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.7.0.jar
+parquet-jackson-1.8.1.jar
 pmml-model-1.2.15.jar
 pmml-schema-1.2.15.jar
 protobuf-java-2.5.0.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/dc6e9415/dev/deps/spark-deps-hadoop-2.4
--
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index dee1417..7186b30 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -136,14 +136,13 @@ opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
-parquet-column-1.7.0.jar
-parquet-common-1.7.0.jar
-parquet-encoding-1.7.0.jar
+parquet-column-1.8.1.jar
+parquet-common-1.8.1.jar
+parquet-encoding-1.8.1.jar
 parquet-format-2.3.0-incubating.jar
-parquet-generator-1.7.0.jar
-parquet-hadoop-1.7.0.jar
+parquet-hadoop-1.8.1.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.7.0.jar
+parquet-jackson-1.8.1.jar
 pmml-model-1.2.15.jar
 pmml-schema-1.2.15.jar
 protobuf-java-2.5.0.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/dc6e9415/dev/deps/spark-deps-hadoop-2.6
--
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index 9695661..3e4ed74 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -144,14 +144,13 @@ opencsv-2.3.jar
 oro-2.0.8.jar
 

spark git commit: [SPARK-9876][SQL] Update Parquet to 1.8.1.

2016-05-27 Thread lian
Repository: spark
Updated Branches:
  refs/heads/master 019afd9c7 -> 776d183c8


[SPARK-9876][SQL] Update Parquet to 1.8.1.

## What changes were proposed in this pull request?

This includes minimal changes to get Spark using the current release of 
Parquet, 1.8.1.

## How was this patch tested?

This uses the existing Parquet tests.

Author: Ryan Blue 

Closes #13280 from rdblue/SPARK-9876-update-parquet.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/776d183c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/776d183c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/776d183c

Branch: refs/heads/master
Commit: 776d183c82b424ef7c3cae30537d8afe9b9eee83
Parents: 019afd9
Author: Ryan Blue 
Authored: Fri May 27 16:59:38 2016 -0700
Committer: Cheng Lian 
Committed: Fri May 27 16:59:38 2016 -0700

--
 dev/deps/spark-deps-hadoop-2.2  | 11 ++-
 dev/deps/spark-deps-hadoop-2.3  | 11 ++-
 dev/deps/spark-deps-hadoop-2.4  | 11 ++-
 dev/deps/spark-deps-hadoop-2.6  | 11 ++-
 dev/deps/spark-deps-hadoop-2.7  | 11 ++-
 pom.xml |  2 +-
 .../SpecificParquetRecordReaderBase.java| 20 +++--
 .../parquet/CatalystReadSupport.scala   | 12 ++-
 .../parquet/CatalystSchemaConverter.scala   | 16 
 .../datasources/parquet/ParquetFilters.scala| 83 
 .../parquet/ParquetSchemaSuite.scala| 20 +++--
 11 files changed, 91 insertions(+), 117 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/776d183c/dev/deps/spark-deps-hadoop-2.2
--
diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index 578691c..deec033 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -129,14 +129,13 @@ opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
-parquet-column-1.7.0.jar
-parquet-common-1.7.0.jar
-parquet-encoding-1.7.0.jar
+parquet-column-1.8.1.jar
+parquet-common-1.8.1.jar
+parquet-encoding-1.8.1.jar
 parquet-format-2.3.0-incubating.jar
-parquet-generator-1.7.0.jar
-parquet-hadoop-1.7.0.jar
+parquet-hadoop-1.8.1.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.7.0.jar
+parquet-jackson-1.8.1.jar
 pmml-model-1.2.15.jar
 pmml-schema-1.2.15.jar
 protobuf-java-2.5.0.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/776d183c/dev/deps/spark-deps-hadoop-2.3
--
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index fc6306f..43c7dd3 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -136,14 +136,13 @@ opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
-parquet-column-1.7.0.jar
-parquet-common-1.7.0.jar
-parquet-encoding-1.7.0.jar
+parquet-column-1.8.1.jar
+parquet-common-1.8.1.jar
+parquet-encoding-1.8.1.jar
 parquet-format-2.3.0-incubating.jar
-parquet-generator-1.7.0.jar
-parquet-hadoop-1.7.0.jar
+parquet-hadoop-1.8.1.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.7.0.jar
+parquet-jackson-1.8.1.jar
 pmml-model-1.2.15.jar
 pmml-schema-1.2.15.jar
 protobuf-java-2.5.0.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/776d183c/dev/deps/spark-deps-hadoop-2.4
--
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index dee1417..7186b30 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -136,14 +136,13 @@ opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
-parquet-column-1.7.0.jar
-parquet-common-1.7.0.jar
-parquet-encoding-1.7.0.jar
+parquet-column-1.8.1.jar
+parquet-common-1.8.1.jar
+parquet-encoding-1.8.1.jar
 parquet-format-2.3.0-incubating.jar
-parquet-generator-1.7.0.jar
-parquet-hadoop-1.7.0.jar
+parquet-hadoop-1.8.1.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.7.0.jar
+parquet-jackson-1.8.1.jar
 pmml-model-1.2.15.jar
 pmml-schema-1.2.15.jar
 protobuf-java-2.5.0.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/776d183c/dev/deps/spark-deps-hadoop-2.6
--
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index 9695661..3e4ed74 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -144,14 +144,13 @@ opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
-parquet-column-1.7.0.jar
-parquet-common-1.7.0.jar
-parquet-encoding-1.7.0.jar

spark git commit: [SPARK-15431][SQL][BRANCH-2.0-TEST] rework the clisuite test cases

2016-05-27 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/master 21b2605dc -> 019afd9c7


[SPARK-15431][SQL][BRANCH-2.0-TEST] rework the clisuite test cases

## What changes were proposed in this pull request?
This PR reworks the CliSuite test cases for the `LIST FILES/JARS` commands.

CC yhuai Thanks!

Author: Xin Wu 

Closes #13361 from xwu0226/SPARK-15431-clisuite-new.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/019afd9c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/019afd9c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/019afd9c

Branch: refs/heads/master
Commit: 019afd9c78a9f40e1d07f0a74868010206e90ed5
Parents: 21b2605
Author: Xin Wu 
Authored: Fri May 27 14:07:12 2016 -0700
Committer: Yin Huai 
Committed: Fri May 27 14:07:12 2016 -0700

--
 .../spark/sql/hive/thriftserver/CliSuite.scala  | 37 ++--
 1 file changed, 26 insertions(+), 11 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/019afd9c/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
--
diff --git 
a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
 
b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
index 656fe97..75535ca 100644
--- 
a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
+++ 
b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
@@ -62,13 +62,13 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll 
with Logging {
 
   /**
* Run a CLI operation and expect all the queries and expected answers to be 
returned.
+   *
* @param timeout maximum time for the commands to complete
* @param extraArgs any extra arguments
* @param errorResponses a sequence of strings whose presence in the stdout 
of the forked process
*   is taken as an immediate error condition. That is: 
if a line containing
*   with one of these strings is found, fail the test 
immediately.
*   The default value is `Seq("Error:")`
-   *
* @param queriesAndExpectedAnswers one or more tuples of query + answer
*/
   def runCliWithin(
@@ -239,22 +239,37 @@ class CliSuite extends SparkFunSuite with 
BeforeAndAfterAll with Logging {
   "" -> "This is a test for Spark-11624")
   }
 
-  ignore("list jars") {
+  test("list jars") {
 val jarFile = 
Thread.currentThread().getContextClassLoader.getResource("TestUDTF.jar")
 runCliWithin(2.minute)(
-  s"ADD JAR $jarFile" -> "",
-  s"LIST JARS" -> "TestUDTF.jar",
-  s"List JAR $jarFile" -> "TestUDTF.jar"
+  s"ADD JAR $jarFile;" -> "",
+  s"LIST JARS;" -> "TestUDTF.jar"
+)
+  }
+
+  test("list jar ") {
+val jarFile = 
Thread.currentThread().getContextClassLoader.getResource("TestUDTF.jar")
+runCliWithin(2.minute)(
+  s"ADD JAR $jarFile;" -> "",
+  s"List JAR $jarFile;" -> "TestUDTF.jar"
+)
+  }
+
+  test("list files") {
+val dataFilePath = Thread.currentThread().
+  getContextClassLoader.getResource("data/files/small_kv.txt")
+runCliWithin(2.minute)(
+  s"ADD FILE $dataFilePath;" -> "",
+  s"LIST FILES;" -> "small_kv.txt"
 )
   }
 
-  ignore("list files") {
-val dataFilePath = Thread.currentThread().getContextClassLoader
-  .getResource("data/files/small_kv.txt")
+  test("list file ") {
+val dataFilePath = Thread.currentThread().
+  getContextClassLoader.getResource("data/files/small_kv.txt")
 runCliWithin(2.minute)(
-  s"ADD FILE $dataFilePath" -> "",
-  s"LIST FILES" -> "small_kv.txt",
-  s"LIST FILE $dataFilePath" -> "small_kv.txt"
+  s"ADD FILE $dataFilePath;" -> "",
+  s"LIST FILE $dataFilePath;" -> "small_kv.txt"
 )
   }
 }





spark git commit: [SPARK-15431][SQL][BRANCH-2.0-TEST] rework the clisuite test cases

2016-05-27 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 dcf498e8a -> 9c137b2e3


[SPARK-15431][SQL][BRANCH-2.0-TEST] rework the clisuite test cases

## What changes were proposed in this pull request?
This PR reworks the CliSuite test cases for the `LIST FILES/JARS` commands.

CC yhuai Thanks!

Author: Xin Wu 

Closes #13361 from xwu0226/SPARK-15431-clisuite-new.

(cherry picked from commit 019afd9c78a9f40e1d07f0a74868010206e90ed5)
Signed-off-by: Yin Huai 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9c137b2e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9c137b2e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9c137b2e

Branch: refs/heads/branch-2.0
Commit: 9c137b2e361ad80845dbf086c173bc430c53d2a2
Parents: dcf498e
Author: Xin Wu 
Authored: Fri May 27 14:07:12 2016 -0700
Committer: Yin Huai 
Committed: Fri May 27 14:07:27 2016 -0700

--
 .../spark/sql/hive/thriftserver/CliSuite.scala  | 37 ++--
 1 file changed, 26 insertions(+), 11 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9c137b2e/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
--
diff --git 
a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
 
b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
index 656fe97..75535ca 100644
--- 
a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
+++ 
b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
@@ -62,13 +62,13 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll 
with Logging {
 
   /**
* Run a CLI operation and expect all the queries and expected answers to be 
returned.
+   *
* @param timeout maximum time for the commands to complete
* @param extraArgs any extra arguments
* @param errorResponses a sequence of strings whose presence in the stdout 
of the forked process
*   is taken as an immediate error condition. That is: 
if a line containing
*   with one of these strings is found, fail the test 
immediately.
*   The default value is `Seq("Error:")`
-   *
* @param queriesAndExpectedAnswers one or more tuples of query + answer
*/
   def runCliWithin(
@@ -239,22 +239,37 @@ class CliSuite extends SparkFunSuite with 
BeforeAndAfterAll with Logging {
   "" -> "This is a test for Spark-11624")
   }
 
-  ignore("list jars") {
+  test("list jars") {
 val jarFile = 
Thread.currentThread().getContextClassLoader.getResource("TestUDTF.jar")
 runCliWithin(2.minute)(
-  s"ADD JAR $jarFile" -> "",
-  s"LIST JARS" -> "TestUDTF.jar",
-  s"List JAR $jarFile" -> "TestUDTF.jar"
+  s"ADD JAR $jarFile;" -> "",
+  s"LIST JARS;" -> "TestUDTF.jar"
+)
+  }
+
+  test("list jar ") {
+val jarFile = 
Thread.currentThread().getContextClassLoader.getResource("TestUDTF.jar")
+runCliWithin(2.minute)(
+  s"ADD JAR $jarFile;" -> "",
+  s"List JAR $jarFile;" -> "TestUDTF.jar"
+)
+  }
+
+  test("list files") {
+val dataFilePath = Thread.currentThread().
+  getContextClassLoader.getResource("data/files/small_kv.txt")
+runCliWithin(2.minute)(
+  s"ADD FILE $dataFilePath;" -> "",
+  s"LIST FILES;" -> "small_kv.txt"
 )
   }
 
-  ignore("list files") {
-val dataFilePath = Thread.currentThread().getContextClassLoader
-  .getResource("data/files/small_kv.txt")
+  test("list file ") {
+val dataFilePath = Thread.currentThread().
+  getContextClassLoader.getResource("data/files/small_kv.txt")
 runCliWithin(2.minute)(
-  s"ADD FILE $dataFilePath" -> "",
-  s"LIST FILES" -> "small_kv.txt",
-  s"LIST FILE $dataFilePath" -> "small_kv.txt"
+  s"ADD FILE $dataFilePath;" -> "",
+  s"LIST FILE $dataFilePath;" -> "small_kv.txt"
 )
   }
 }





spark git commit: [SPARK-15413][ML][MLLIB] Change `toBreeze` to `asBreeze` in Vector and Matrix

2016-05-27 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 a778d3c90 -> dcf498e8a


[SPARK-15413][ML][MLLIB] Change `toBreeze` to `asBreeze` in Vector and Matrix

## What changes were proposed in this pull request?

We're now using `asML` to convert the mllib vector/matrix to the ml vector/matrix. The `as` prefix is the more accurate choice because the conversion shares the same underlying data structure rather than copying it. For consistency, this PR renames `toBreeze` to `asBreeze`. Since this is a private API, the change does not affect any user application.
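As a rough illustration of the naming convention (a sketch of the idea, not code from this patch), an `as`-style conversion returns a view over the existing storage instead of copying it:

```scala
import breeze.linalg.{DenseVector => BDV}

// Minimal sketch of the "as" vs "to" distinction; MyDenseVector is a made-up class.
class MyDenseVector(val values: Array[Double]) {
  // "as": no copy; the Breeze vector aliases the same backing array
  def asBreeze: BDV[Double] = new BDV(values)
  // "to": an explicit copy is what would justify a "to" prefix
  def toBreezeCopy: BDV[Double] = new BDV(values.clone())
}

val v = new MyDenseVector(Array(1.0, 2.0, 3.0))
val bv = v.asBreeze
bv(0) = 10.0                   // mutates the shared array
assert(v.values(0) == 10.0)    // the change is visible through the original vector
```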

## How was this patch tested?

unit tests

Author: DB Tsai 

Closes #13198 from dbtsai/minor.

(cherry picked from commit 21b2605dc4900894ea7a911e039781ecc2a18c14)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dcf498e8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dcf498e8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dcf498e8

Branch: refs/heads/branch-2.0
Commit: dcf498e8aafd2b53c5680cf7f3ada31829686b62
Parents: a778d3c
Author: DB Tsai 
Authored: Fri May 27 14:02:39 2016 -0700
Committer: Joseph K. Bradley 
Committed: Fri May 27 14:02:51 2016 -0700

--
 .../org/apache/spark/ml/linalg/Matrices.scala   | 16 ++--
 .../org/apache/spark/ml/linalg/Vectors.scala|  8 +++---
 .../distribution/MultivariateGaussian.scala |  8 +++---
 .../ml/linalg/BreezeMatrixConversionSuite.scala |  4 +--
 .../ml/linalg/BreezeVectorConversionSuite.scala |  4 +--
 .../apache/spark/ml/linalg/MatricesSuite.scala  | 14 +--
 .../apache/spark/ml/linalg/VectorsSuite.scala   |  2 +-
 .../scala/org/apache/spark/ml/ann/Layer.scala   |  8 +++---
 .../ml/classification/LogisticRegression.scala  |  2 +-
 .../spark/ml/clustering/GaussianMixture.scala   |  2 +-
 .../apache/spark/ml/feature/MaxAbsScaler.scala  |  2 +-
 .../apache/spark/ml/feature/MinMaxScaler.scala  |  2 +-
 .../ml/regression/AFTSurvivalRegression.scala   |  2 +-
 .../spark/ml/regression/LinearRegression.scala  |  2 +-
 .../apache/spark/mllib/classification/SVM.scala |  2 +-
 .../mllib/clustering/GaussianMixture.scala  |  2 +-
 .../mllib/clustering/GaussianMixtureModel.scala |  4 +--
 .../spark/mllib/clustering/LDAModel.scala   | 26 ++--
 .../spark/mllib/clustering/LDAOptimizer.scala   |  6 ++---
 .../mllib/clustering/StreamingKMeans.scala  |  4 +--
 .../apache/spark/mllib/linalg/Matrices.scala| 16 ++--
 .../org/apache/spark/mllib/linalg/Vectors.scala |  8 +++---
 .../mllib/linalg/distributed/BlockMatrix.scala  |  8 +++---
 .../mllib/linalg/distributed/RowMatrix.scala| 16 ++--
 .../mllib/optimization/GradientDescent.scala|  4 +--
 .../apache/spark/mllib/optimization/LBFGS.scala |  4 +--
 .../spark/mllib/optimization/Updater.scala  | 14 +--
 .../apache/spark/mllib/regression/Lasso.scala   |  2 +-
 .../mllib/regression/LinearRegression.scala |  2 +-
 .../mllib/regression/RidgeRegression.scala  |  2 +-
 .../stat/correlation/PearsonCorrelation.scala   |  2 +-
 .../distribution/MultivariateGaussian.scala |  8 +++---
 .../spark/mllib/stat/test/ChiSqTest.scala   |  2 +-
 .../ml/classification/NaiveBayesSuite.scala |  6 ++---
 .../LogisticRegressionSuite.scala   |  4 +--
 .../mllib/classification/NaiveBayesSuite.scala  |  4 +--
 .../spark/mllib/clustering/LDASuite.scala   |  4 +--
 .../mllib/clustering/StreamingKMeansSuite.scala |  2 +-
 .../spark/mllib/feature/NormalizerSuite.scala   | 16 ++--
 .../linalg/BreezeMatrixConversionSuite.scala|  4 +--
 .../linalg/BreezeVectorConversionSuite.scala|  4 +--
 .../spark/mllib/linalg/MatricesSuite.scala  | 14 +--
 .../spark/mllib/linalg/VectorsSuite.scala   |  2 +-
 .../linalg/distributed/BlockMatrixSuite.scala   |  2 +-
 .../distributed/IndexedRowMatrixSuite.scala | 10 
 .../linalg/distributed/RowMatrixSuite.scala | 14 +--
 .../spark/mllib/stat/CorrelationSuite.scala |  6 ++---
 .../apache/spark/mllib/util/MLUtilsSuite.scala  |  6 ++---
 project/MimaExcludes.scala  |  3 +++
 49 files changed, 156 insertions(+), 153 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/dcf498e8/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
--
diff --git 
a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala 
b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
index a47526d..0ea687b 100644
--- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
+++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
@@ 

spark git commit: [SPARK-15413][ML][MLLIB] Change `toBreeze` to `asBreeze` in Vector and Matrix

2016-05-27 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 130b8d07b -> 21b2605dc


[SPARK-15413][ML][MLLIB] Change `toBreeze` to `asBreeze` in Vector and Matrix

## What changes were proposed in this pull request?

We're now using `asML` to convert the mllib vector/matrix to the ml vector/matrix. The `as` prefix is the more accurate choice because the conversion shares the same underlying data structure rather than copying it. For consistency, this PR renames `toBreeze` to `asBreeze`. Since this is a private API, the change does not affect any user application.

## How was this patch tested?

unit tests

Author: DB Tsai 

Closes #13198 from dbtsai/minor.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/21b2605d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/21b2605d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/21b2605d

Branch: refs/heads/master
Commit: 21b2605dc4900894ea7a911e039781ecc2a18c14
Parents: 130b8d0
Author: DB Tsai 
Authored: Fri May 27 14:02:39 2016 -0700
Committer: Joseph K. Bradley 
Committed: Fri May 27 14:02:39 2016 -0700

--
 .../org/apache/spark/ml/linalg/Matrices.scala   | 16 ++--
 .../org/apache/spark/ml/linalg/Vectors.scala|  8 +++---
 .../distribution/MultivariateGaussian.scala |  8 +++---
 .../ml/linalg/BreezeMatrixConversionSuite.scala |  4 +--
 .../ml/linalg/BreezeVectorConversionSuite.scala |  4 +--
 .../apache/spark/ml/linalg/MatricesSuite.scala  | 14 +--
 .../apache/spark/ml/linalg/VectorsSuite.scala   |  2 +-
 .../scala/org/apache/spark/ml/ann/Layer.scala   |  8 +++---
 .../ml/classification/LogisticRegression.scala  |  2 +-
 .../spark/ml/clustering/GaussianMixture.scala   |  2 +-
 .../apache/spark/ml/feature/MaxAbsScaler.scala  |  2 +-
 .../apache/spark/ml/feature/MinMaxScaler.scala  |  2 +-
 .../ml/regression/AFTSurvivalRegression.scala   |  2 +-
 .../spark/ml/regression/LinearRegression.scala  |  2 +-
 .../apache/spark/mllib/classification/SVM.scala |  2 +-
 .../mllib/clustering/GaussianMixture.scala  |  2 +-
 .../mllib/clustering/GaussianMixtureModel.scala |  4 +--
 .../spark/mllib/clustering/LDAModel.scala   | 26 ++--
 .../spark/mllib/clustering/LDAOptimizer.scala   |  6 ++---
 .../mllib/clustering/StreamingKMeans.scala  |  4 +--
 .../apache/spark/mllib/linalg/Matrices.scala| 16 ++--
 .../org/apache/spark/mllib/linalg/Vectors.scala |  8 +++---
 .../mllib/linalg/distributed/BlockMatrix.scala  |  8 +++---
 .../mllib/linalg/distributed/RowMatrix.scala| 16 ++--
 .../mllib/optimization/GradientDescent.scala|  4 +--
 .../apache/spark/mllib/optimization/LBFGS.scala |  4 +--
 .../spark/mllib/optimization/Updater.scala  | 14 +--
 .../apache/spark/mllib/regression/Lasso.scala   |  2 +-
 .../mllib/regression/LinearRegression.scala |  2 +-
 .../mllib/regression/RidgeRegression.scala  |  2 +-
 .../stat/correlation/PearsonCorrelation.scala   |  2 +-
 .../distribution/MultivariateGaussian.scala |  8 +++---
 .../spark/mllib/stat/test/ChiSqTest.scala   |  2 +-
 .../ml/classification/NaiveBayesSuite.scala |  6 ++---
 .../LogisticRegressionSuite.scala   |  4 +--
 .../mllib/classification/NaiveBayesSuite.scala  |  4 +--
 .../spark/mllib/clustering/LDASuite.scala   |  4 +--
 .../mllib/clustering/StreamingKMeansSuite.scala |  2 +-
 .../spark/mllib/feature/NormalizerSuite.scala   | 16 ++--
 .../linalg/BreezeMatrixConversionSuite.scala|  4 +--
 .../linalg/BreezeVectorConversionSuite.scala|  4 +--
 .../spark/mllib/linalg/MatricesSuite.scala  | 14 +--
 .../spark/mllib/linalg/VectorsSuite.scala   |  2 +-
 .../linalg/distributed/BlockMatrixSuite.scala   |  2 +-
 .../distributed/IndexedRowMatrixSuite.scala | 10 
 .../linalg/distributed/RowMatrixSuite.scala | 14 +--
 .../spark/mllib/stat/CorrelationSuite.scala |  6 ++---
 .../apache/spark/mllib/util/MLUtilsSuite.scala  |  6 ++---
 project/MimaExcludes.scala  |  3 +++
 49 files changed, 156 insertions(+), 153 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/21b2605d/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
--
diff --git 
a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala 
b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
index a47526d..0ea687b 100644
--- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
+++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
@@ -69,7 +69,7 @@ sealed trait Matrix extends Serializable {
   def rowIter: Iterator[Vector] = this.transpose.colIter
 
   /** Converts to 

spark git commit: [SPARK-15008][ML][PYSPARK] Add integration test for OneVsRest

2016-05-27 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 e6e2f293d -> a778d3c90


[SPARK-15008][ML][PYSPARK] Add integration test for OneVsRest

## What changes were proposed in this pull request?

1. Add `_transfer_param_map_to/from_java` for OneVsRest;

2. Add `_compare_params` in ml/tests.py to help compare params.

3. Add `test_onevsrest` as the integration test for OneVsRest.

## How was this patch tested?

Python unit test.
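For reference, a minimal sketch of the analogous persistence round-trip on the Scala side (illustrative only; the patch itself adds the Python test shown in the diff below, and the SparkSession, DataFrame, and path here are assumptions):

```scala
import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest, OneVsRestModel}

// Assumes a SparkSession `spark` and a DataFrame `df` with "label" and "features" columns.
val lr = new LogisticRegression().setMaxIter(5).setRegParam(0.01)
val ovr = new OneVsRest().setClassifier(lr)
val model = ovr.fit(df)

// Save, reload, and compare the reloaded model against the original.
val path = "/tmp/ovr-model"   // hypothetical path
model.write.overwrite().save(path)
val loaded = OneVsRestModel.load(path)
assert(loaded.uid == model.uid)
assert(loaded.models.length == model.models.length)
```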

Author: yinxusen 

Closes #12875 from yinxusen/SPARK-15008.

(cherry picked from commit 130b8d07b8eb08f2ad522081a95032b90247094d)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a778d3c9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a778d3c9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a778d3c9

Branch: refs/heads/branch-2.0
Commit: a778d3c90599eb76e6bca87b7aa3c0f9910f24c5
Parents: e6e2f29
Author: yinxusen 
Authored: Fri May 27 13:18:29 2016 -0700
Committer: Joseph K. Bradley 
Committed: Fri May 27 13:18:36 2016 -0700

--
 python/pyspark/ml/tests.py | 69 +++--
 1 file changed, 46 insertions(+), 23 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a778d3c9/python/pyspark/ml/tests.py
--
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index a7c93ac..4358175 100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -747,12 +747,32 @@ class PersistenceTest(SparkSessionTestCase):
 except OSError:
 pass
 
+def _compare_params(self, m1, m2, param):
+"""
+Compare 2 ML Params instances for the given param, and assert both 
have the same param value
+and parent. The param must be a parameter of m1.
+"""
+# Prevent key not found error in case of some param in neither 
paramMap nor defaultParamMap.
+if m1.isDefined(param):
+paramValue1 = m1.getOrDefault(param)
+paramValue2 = m2.getOrDefault(m2.getParam(param.name))
+if isinstance(paramValue1, Params):
+self._compare_pipelines(paramValue1, paramValue2)
+else:
+self.assertEqual(paramValue1, paramValue2)  # for general 
types param
+# Assert parents are equal
+self.assertEqual(param.parent, m2.getParam(param.name).parent)
+else:
+# If m1 is not defined param, then m2 should not, too. See 
SPARK-14931.
+self.assertFalse(m2.isDefined(m2.getParam(param.name)))
+
 def _compare_pipelines(self, m1, m2):
 """
 Compare 2 ML types, asserting that they are equivalent.
 This currently supports:
  - basic types
  - Pipeline, PipelineModel
+ - OneVsRest, OneVsRestModel
 This checks:
  - uid
  - type
@@ -763,8 +783,7 @@ class PersistenceTest(SparkSessionTestCase):
 if isinstance(m1, JavaParams):
 self.assertEqual(len(m1.params), len(m2.params))
 for p in m1.params:
-self.assertEqual(m1.getOrDefault(p), m2.getOrDefault(p))
-self.assertEqual(p.parent, m2.getParam(p.name).parent)
+self._compare_params(m1, m2, p)
 elif isinstance(m1, Pipeline):
 self.assertEqual(len(m1.getStages()), len(m2.getStages()))
 for s1, s2 in zip(m1.getStages(), m2.getStages()):
@@ -773,6 +792,13 @@ class PersistenceTest(SparkSessionTestCase):
 self.assertEqual(len(m1.stages), len(m2.stages))
 for s1, s2 in zip(m1.stages, m2.stages):
 self._compare_pipelines(s1, s2)
+elif isinstance(m1, OneVsRest) or isinstance(m1, OneVsRestModel):
+for p in m1.params:
+self._compare_params(m1, m2, p)
+if isinstance(m1, OneVsRestModel):
+self.assertEqual(len(m1.models), len(m2.models))
+for x, y in zip(m1.models, m2.models):
+self._compare_pipelines(x, y)
 else:
 raise RuntimeError("_compare_pipelines does not yet support type: 
%s" % type(m1))
 
@@ -833,6 +859,24 @@ class PersistenceTest(SparkSessionTestCase):
 except OSError:
 pass
 
+def test_onevsrest(self):
+temp_path = tempfile.mkdtemp()
+df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
+ (1.0, Vectors.sparse(2, [], [])),
+ (2.0, Vectors.dense(0.5, 0.5))] * 10,
+["label", "features"])
+lr = 

spark git commit: [SPARK-15008][ML][PYSPARK] Add integration test for OneVsRest

2016-05-27 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master a3550e374 -> 130b8d07b


[SPARK-15008][ML][PYSPARK] Add integration test for OneVsRest

## What changes were proposed in this pull request?

1. Add `_transfer_param_map_to/from_java` for OneVsRest;

2. Add `_compare_params` in ml/tests.py to help compare params.

3. Add `test_onevsrest` as the integration test for OneVsRest.

## How was this patch tested?

Python unit test.

Author: yinxusen 

Closes #12875 from yinxusen/SPARK-15008.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/130b8d07
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/130b8d07
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/130b8d07

Branch: refs/heads/master
Commit: 130b8d07b8eb08f2ad522081a95032b90247094d
Parents: a3550e3
Author: yinxusen 
Authored: Fri May 27 13:18:29 2016 -0700
Committer: Joseph K. Bradley 
Committed: Fri May 27 13:18:29 2016 -0700

--
 python/pyspark/ml/tests.py | 69 +++--
 1 file changed, 46 insertions(+), 23 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/130b8d07/python/pyspark/ml/tests.py
--
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index a7c93ac..4358175 100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -747,12 +747,32 @@ class PersistenceTest(SparkSessionTestCase):
 except OSError:
 pass
 
+def _compare_params(self, m1, m2, param):
+"""
+Compare 2 ML Params instances for the given param, and assert both 
have the same param value
+and parent. The param must be a parameter of m1.
+"""
+# Prevent key not found error in case of some param in neither 
paramMap nor defaultParamMap.
+if m1.isDefined(param):
+paramValue1 = m1.getOrDefault(param)
+paramValue2 = m2.getOrDefault(m2.getParam(param.name))
+if isinstance(paramValue1, Params):
+self._compare_pipelines(paramValue1, paramValue2)
+else:
+self.assertEqual(paramValue1, paramValue2)  # for general 
types param
+# Assert parents are equal
+self.assertEqual(param.parent, m2.getParam(param.name).parent)
+else:
+# If m1 is not defined param, then m2 should not, too. See 
SPARK-14931.
+self.assertFalse(m2.isDefined(m2.getParam(param.name)))
+
 def _compare_pipelines(self, m1, m2):
 """
 Compare 2 ML types, asserting that they are equivalent.
 This currently supports:
  - basic types
  - Pipeline, PipelineModel
+ - OneVsRest, OneVsRestModel
 This checks:
  - uid
  - type
@@ -763,8 +783,7 @@ class PersistenceTest(SparkSessionTestCase):
 if isinstance(m1, JavaParams):
 self.assertEqual(len(m1.params), len(m2.params))
 for p in m1.params:
-self.assertEqual(m1.getOrDefault(p), m2.getOrDefault(p))
-self.assertEqual(p.parent, m2.getParam(p.name).parent)
+self._compare_params(m1, m2, p)
 elif isinstance(m1, Pipeline):
 self.assertEqual(len(m1.getStages()), len(m2.getStages()))
 for s1, s2 in zip(m1.getStages(), m2.getStages()):
@@ -773,6 +792,13 @@ class PersistenceTest(SparkSessionTestCase):
 self.assertEqual(len(m1.stages), len(m2.stages))
 for s1, s2 in zip(m1.stages, m2.stages):
 self._compare_pipelines(s1, s2)
+elif isinstance(m1, OneVsRest) or isinstance(m1, OneVsRestModel):
+for p in m1.params:
+self._compare_params(m1, m2, p)
+if isinstance(m1, OneVsRestModel):
+self.assertEqual(len(m1.models), len(m2.models))
+for x, y in zip(m1.models, m2.models):
+self._compare_pipelines(x, y)
 else:
 raise RuntimeError("_compare_pipelines does not yet support type: 
%s" % type(m1))
 
@@ -833,6 +859,24 @@ class PersistenceTest(SparkSessionTestCase):
 except OSError:
 pass
 
+def test_onevsrest(self):
+temp_path = tempfile.mkdtemp()
+df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
+ (1.0, Vectors.sparse(2, [], [])),
+ (2.0, Vectors.dense(0.5, 0.5))] * 10,
+["label", "features"])
+lr = LogisticRegression(maxIter=5, regParam=0.01)
+ovr = OneVsRest(classifier=lr)
+model = ovr.fit(df)
+ovrPath = temp_path + "/ovr"
+  

spark git commit: [SPARK-11959][SPARK-15484][DOC][ML] Document WLS and IRLS

2016-05-27 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 5dd1423f4 -> e6e2f293d


[SPARK-11959][SPARK-15484][DOC][ML] Document WLS and IRLS

## What changes were proposed in this pull request?
* Document ```WeightedLeastSquares```(normal equation) and 
```IterativelyReweightedLeastSquares```.
* Copy ```L-BFGS``` documents from ```spark.mllib``` to ```spark.ml```.

Because the ```Optimization of linear methods``` section is aimed at developers, I think we should provide a brief introduction to each optimization method, the necessary references, and how it is implemented in Spark. It is not necessary to paste every mathematical formula and derivation here; developers/users who want to learn more can follow the references.
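For readers skimming this digest, the normal-equation solver that the new section documents reduces to the standard weighted least squares identity (textbook material, not text copied from the patch), using the same weighted-observation notation $(w_i, a_i, b_i)$ as the added `ml-advanced.md` section:

$$
\min_{\beta} \sum_{i=1}^{n} w_i \left(a_i^T \beta - b_i\right)^2
\quad\Longrightarrow\quad
\left(A^T W A\right) \beta = A^T W b,
\qquad W = \mathrm{diag}(w_1, \dots, w_n)
$$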

## How was this patch tested?
Document update, no tests.

Author: Yanbo Liang 

Closes #13262 from yanboliang/spark-15484.

(cherry picked from commit a3550e3747e21c79a5110132dc127ee83879062a)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e6e2f293
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e6e2f293
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e6e2f293

Branch: refs/heads/branch-2.0
Commit: e6e2f293d6830ce118050e789773a09b3888fd30
Parents: 5dd1423
Author: Yanbo Liang 
Authored: Fri May 27 13:16:22 2016 -0700
Committer: Joseph K. Bradley 
Committed: Fri May 27 13:16:37 2016 -0700

--
 docs/ml-advanced.md | 85 ++--
 .../IterativelyReweightedLeastSquares.scala |  2 +-
 2 files changed, 81 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e6e2f293/docs/ml-advanced.md
--
diff --git a/docs/ml-advanced.md b/docs/ml-advanced.md
index 91731d7..1c5f844 100644
--- a/docs/ml-advanced.md
+++ b/docs/ml-advanced.md
@@ -4,10 +4,85 @@ title: Advanced topics - spark.ml
 displayTitle: Advanced topics - spark.ml
 ---
 
-# Optimization of linear methods
+* Table of contents
+{:toc}
+
+`\[
+\newcommand{\R}{\mathbb{R}}
+\newcommand{\E}{\mathbb{E}} 
+\newcommand{\x}{\mathbf{x}}
+\newcommand{\y}{\mathbf{y}}
+\newcommand{\wv}{\mathbf{w}}
+\newcommand{\av}{\mathbf{\alpha}}
+\newcommand{\bv}{\mathbf{b}}
+\newcommand{\N}{\mathbb{N}}
+\newcommand{\id}{\mathbf{I}} 
+\newcommand{\ind}{\mathbf{1}} 
+\newcommand{\0}{\mathbf{0}} 
+\newcommand{\unit}{\mathbf{e}} 
+\newcommand{\one}{\mathbf{1}} 
+\newcommand{\zero}{\mathbf{0}}
+\]`
+
+# Optimization of linear methods (developer)
+
+## Limited-memory BFGS (L-BFGS)
+[L-BFGS](http://en.wikipedia.org/wiki/Limited-memory_BFGS) is an optimization 
+algorithm in the family of quasi-Newton methods to solve the optimization 
problems of the form 
+`$\min_{\wv \in\R^d} \; f(\wv)$`. The L-BFGS method approximates the objective 
function locally as a 
+quadratic without evaluating the second partial derivatives of the objective 
function to construct the 
+Hessian matrix. The Hessian matrix is approximated by previous gradient 
evaluations, so there is no 
+vertical scalability issue (the number of training features) unlike computing 
the Hessian matrix 
+explicitly in Newton's method. As a result, L-BFGS often achieves faster 
convergence compared with 
+other first-order optimizations.
 
-The optimization algorithm underlying the implementation is called
 [Orthant-Wise Limited-memory
-QuasiNewton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf)
-(OWL-QN). It is an extension of L-BFGS that can effectively handle L1
-regularization and elastic net.
+Quasi-Newton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf)
+(OWL-QN) is an extension of L-BFGS that can effectively handle L1 and elastic 
net regularization.
+
+L-BFGS is used as a solver for 
[LinearRegression](api/scala/index.html#org.apache.spark.ml.regression.LinearRegression),
+[LogisticRegression](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegression),
+[AFTSurvivalRegression](api/scala/index.html#org.apache.spark.ml.regression.AFTSurvivalRegression)
+and 
[MultilayerPerceptronClassifier](api/scala/index.html#org.apache.spark.ml.classification.MultilayerPerceptronClassifier).
+
+MLlib L-BFGS solver calls the corresponding implementation in 
[breeze](https://github.com/scalanlp/breeze/blob/master/math/src/main/scala/breeze/optimize/LBFGS.scala).
+
+## Normal equation solver for weighted least squares
+
+MLlib implements normal equation solver for [weighted least 
squares](https://en.wikipedia.org/wiki/Least_squares#Weighted_least_squares) by 

spark git commit: [SPARK-11959][SPARK-15484][DOC][ML] Document WLS and IRLS

2016-05-27 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master c96244f5a -> a3550e374


[SPARK-11959][SPARK-15484][DOC][ML] Document WLS and IRLS

## What changes were proposed in this pull request?
* Document ```WeightedLeastSquares```(normal equation) and 
```IterativelyReweightedLeastSquares```.
* Copy ```L-BFGS``` documents from ```spark.mllib``` to ```spark.ml```.

Because the ```Optimization of linear methods``` section is aimed at developers, I think we should provide a brief introduction to each optimization method, the necessary references, and how it is implemented in Spark. It is not necessary to paste every mathematical formula and derivation here; developers/users who want to learn more can follow the references.

## How was this patch tested?
Document update, no tests.

Author: Yanbo Liang 

Closes #13262 from yanboliang/spark-15484.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a3550e37
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a3550e37
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a3550e37

Branch: refs/heads/master
Commit: a3550e3747e21c79a5110132dc127ee83879062a
Parents: c96244f
Author: Yanbo Liang 
Authored: Fri May 27 13:16:22 2016 -0700
Committer: Joseph K. Bradley 
Committed: Fri May 27 13:16:22 2016 -0700

--
 docs/ml-advanced.md | 85 ++--
 .../IterativelyReweightedLeastSquares.scala |  2 +-
 2 files changed, 81 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a3550e37/docs/ml-advanced.md
--
diff --git a/docs/ml-advanced.md b/docs/ml-advanced.md
index 91731d7..1c5f844 100644
--- a/docs/ml-advanced.md
+++ b/docs/ml-advanced.md
@@ -4,10 +4,85 @@ title: Advanced topics - spark.ml
 displayTitle: Advanced topics - spark.ml
 ---
 
-# Optimization of linear methods
+* Table of contents
+{:toc}
+
+`\[
+\newcommand{\R}{\mathbb{R}}
+\newcommand{\E}{\mathbb{E}} 
+\newcommand{\x}{\mathbf{x}}
+\newcommand{\y}{\mathbf{y}}
+\newcommand{\wv}{\mathbf{w}}
+\newcommand{\av}{\mathbf{\alpha}}
+\newcommand{\bv}{\mathbf{b}}
+\newcommand{\N}{\mathbb{N}}
+\newcommand{\id}{\mathbf{I}} 
+\newcommand{\ind}{\mathbf{1}} 
+\newcommand{\0}{\mathbf{0}} 
+\newcommand{\unit}{\mathbf{e}} 
+\newcommand{\one}{\mathbf{1}} 
+\newcommand{\zero}{\mathbf{0}}
+\]`
+
+# Optimization of linear methods (developer)
+
+## Limited-memory BFGS (L-BFGS)
+[L-BFGS](http://en.wikipedia.org/wiki/Limited-memory_BFGS) is an optimization 
+algorithm in the family of quasi-Newton methods to solve the optimization 
problems of the form 
+`$\min_{\wv \in\R^d} \; f(\wv)$`. The L-BFGS method approximates the objective 
function locally as a 
+quadratic without evaluating the second partial derivatives of the objective 
function to construct the 
+Hessian matrix. The Hessian matrix is approximated by previous gradient 
evaluations, so there is no 
+vertical scalability issue (the number of training features) unlike computing 
the Hessian matrix 
+explicitly in Newton's method. As a result, L-BFGS often achieves faster 
convergence compared with 
+other first-order optimizations.
 
-The optimization algorithm underlying the implementation is called
 [Orthant-Wise Limited-memory
-QuasiNewton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf)
-(OWL-QN). It is an extension of L-BFGS that can effectively handle L1
-regularization and elastic net.
+Quasi-Newton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf)
+(OWL-QN) is an extension of L-BFGS that can effectively handle L1 and elastic 
net regularization.
+
+L-BFGS is used as a solver for 
[LinearRegression](api/scala/index.html#org.apache.spark.ml.regression.LinearRegression),
+[LogisticRegression](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegression),
+[AFTSurvivalRegression](api/scala/index.html#org.apache.spark.ml.regression.AFTSurvivalRegression)
+and 
[MultilayerPerceptronClassifier](api/scala/index.html#org.apache.spark.ml.classification.MultilayerPerceptronClassifier).
+
+MLlib L-BFGS solver calls the corresponding implementation in 
[breeze](https://github.com/scalanlp/breeze/blob/master/math/src/main/scala/breeze/optimize/LBFGS.scala).
+
+## Normal equation solver for weighted least squares
+
+MLlib implements normal equation solver for [weighted least 
squares](https://en.wikipedia.org/wiki/Least_squares#Weighted_least_squares) by 
[WeightedLeastSquares](https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala).
+
+Given $n$ weighted observations $(w_i, a_i, b_i)$:
+
+* $w_i$ the weight of i-th observation
+* $a_i$ 

spark git commit: [SPARK-15186][ML][DOCS] Add user guide for generalized linear regression

2016-05-27 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 d76e066d3 -> 5dd1423f4


[SPARK-15186][ML][DOCS] Add user guide for generalized linear regression

## What changes were proposed in this pull request?

This patch adds a user guide section for generalized linear regression and 
includes the examples from [#12754](https://github.com/apache/spark/pull/12754).

## How was this patch tested?

Documentation only, no tests required.

## Approach

In general, it is a bit unclear what level of detail ought to be included in 
the user guide since there is a lot of variability within the current user 
guide. I tried to give a fairly brief mathematical introduction to GLMs, and 
cover what types of problems they could be used for. Additionally, I included a 
brief blurb on the IRLS solver. The input/output columns are given in a table 
as is found elsewhere in the docs (though, again, these appear rather 
intermittently in the current docs), as well as a table providing the supported 
families and their link functions.
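As a concrete instance of the canonical link described in the added section (standard GLM material, not text from the patch): for Poisson regression, $Y_i \sim \mathrm{Poisson}(\mu_i)$ has natural parameter $\theta_i = \log \mu_i$, so the canonical link is the log link,

$$
g(\mu_i) = \log \mu_i = \eta_i = \vec{x_i}^T \cdot \vec{\beta},
\qquad \mu_i = e^{\eta_i}.
$$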

Author: sethah 

Closes #13139 from sethah/SPARK-15186.

(cherry picked from commit c96244f5acd8b335e34694c171bab32d92e6e0fb)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5dd1423f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5dd1423f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5dd1423f

Branch: refs/heads/branch-2.0
Commit: 5dd1423f462f03b7ae625a93cdaf9d882969afb6
Parents: d76e066
Author: sethah 
Authored: Fri May 27 12:55:48 2016 -0700
Committer: Joseph K. Bradley 
Committed: Fri May 27 12:56:00 2016 -0700

--
 docs/ml-classification-regression.md | 132 ++
 1 file changed, 132 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5dd1423f/docs/ml-classification-regression.md
--
diff --git a/docs/ml-classification-regression.md 
b/docs/ml-classification-regression.md
index f1a21f4..ff8dec6 100644
--- a/docs/ml-classification-regression.md
+++ b/docs/ml-classification-regression.md
@@ -374,6 +374,138 @@ regression model and extracting model summary statistics.
 
 
 
+## Generalized linear regression
+
+Contrasted with linear regression where the output is assumed to follow a 
Gaussian
+distribution, [generalized linear 
models](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLMs) are 
specifications of linear models where the response variable $Y_i$ follows some
+distribution from the [exponential family of 
distributions](https://en.wikipedia.org/wiki/Exponential_family).
+Spark's `GeneralizedLinearRegression` interface
+allows for flexible specification of GLMs which can be used for various types 
of
+prediction problems including linear regression, Poisson regression, logistic 
regression, and others.
+Currently in `spark.ml`, only a subset of the exponential family distributions 
are supported and they are listed
+[below](#available-families).
+
+**NOTE**: Spark currently only supports up to 4096 features through its 
`GeneralizedLinearRegression`
+interface, and will throw an exception if this constraint is exceeded. See the 
[advanced section](ml-advanced) for more details.
+ Still, for linear and logistic regression, models with an increased number of 
features can be trained 
+ using the `LinearRegression` and `LogisticRegression` estimators.
+
+GLMs require exponential family distributions that can be written in their 
"canonical" or "natural" form, aka
+[natural exponential family 
distributions](https://en.wikipedia.org/wiki/Natural_exponential_family). The 
form of a natural exponential family distribution is given as:
+
+$$
+f_Y(y|\theta, \tau) = h(y, \tau)\exp{\left( \frac{\theta \cdot y - 
A(\theta)}{d(\tau)} \right)}
+$$
+
+where $\theta$ is the parameter of interest and $\tau$ is a dispersion 
parameter. In a GLM the response variable $Y_i$ is assumed to be drawn from a 
natural exponential family distribution:
+
+$$
+Y_i \sim f\left(\cdot|\theta_i, \tau \right)
+$$
+
+where the parameter of interest $\theta_i$ is related to the expected value of 
the response variable $\mu_i$ by
+
+$$
+\mu_i = A'(\theta_i)
+$$
+
+Here, $A'(\theta_i)$ is defined by the form of the distribution selected. GLMs 
also allow specification
+of a link function, which defines the relationship between the expected value 
of the response variable $\mu_i$
+and the so called _linear predictor_ $\eta_i$:
+
+$$
+g(\mu_i) = \eta_i = \vec{x_i}^T \cdot \vec{\beta}
+$$
+
+Often, the link function is chosen such that $A' = g^{-1}$, which yields a 
simplified relationship
+between the parameter of interest $\theta$ and the 

spark git commit: [SPARK-15186][ML][DOCS] Add user guide for generalized linear regression

2016-05-27 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master a96e4151a -> c96244f5a


[SPARK-15186][ML][DOCS] Add user guide for generalized linear regression

## What changes were proposed in this pull request?

This patch adds a user guide section for generalized linear regression and 
includes the examples from [#12754](https://github.com/apache/spark/pull/12754).

## How was this patch tested?

Documentation only, no tests required.

## Approach

In general, it is a bit unclear what level of detail ought to be included in 
the user guide since there is a lot of variability within the current user 
guide. I tried to give a fairly brief mathematical introduction to GLMs, and 
cover what types of problems they could be used for. Additionally, I included a 
brief blurb on the IRLS solver. The input/output columns are given in a table 
as is found elsewhere in the docs (though, again, these appear rather 
intermittently in the current docs), as well as a table providing the supported 
families and their link functions.

Author: sethah 

Closes #13139 from sethah/SPARK-15186.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c96244f5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c96244f5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c96244f5

Branch: refs/heads/master
Commit: c96244f5acd8b335e34694c171bab32d92e6e0fb
Parents: a96e415
Author: sethah 
Authored: Fri May 27 12:55:48 2016 -0700
Committer: Joseph K. Bradley 
Committed: Fri May 27 12:55:48 2016 -0700

--
 docs/ml-classification-regression.md | 132 ++
 1 file changed, 132 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c96244f5/docs/ml-classification-regression.md
--
diff --git a/docs/ml-classification-regression.md 
b/docs/ml-classification-regression.md
index f1a21f4..ff8dec6 100644
--- a/docs/ml-classification-regression.md
+++ b/docs/ml-classification-regression.md
@@ -374,6 +374,138 @@ regression model and extracting model summary statistics.
 
 
 
+## Generalized linear regression
+
+Contrasted with linear regression where the output is assumed to follow a 
Gaussian
+distribution, [generalized linear 
models](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLMs) are 
specifications of linear models where the response variable $Y_i$ follows some
+distribution from the [exponential family of 
distributions](https://en.wikipedia.org/wiki/Exponential_family).
+Spark's `GeneralizedLinearRegression` interface
+allows for flexible specification of GLMs which can be used for various types 
of
+prediction problems including linear regression, Poisson regression, logistic 
regression, and others.
+Currently in `spark.ml`, only a subset of the exponential family distributions 
are supported and they are listed
+[below](#available-families).
+
+**NOTE**: Spark currently only supports up to 4096 features through its 
`GeneralizedLinearRegression`
+interface, and will throw an exception if this constraint is exceeded. See the 
[advanced section](ml-advanced) for more details.
+ Still, for linear and logistic regression, models with an increased number of 
features can be trained 
+ using the `LinearRegression` and `LogisticRegression` estimators.
+
+GLMs require exponential family distributions that can be written in their 
"canonical" or "natural" form, aka
+[natural exponential family 
distributions](https://en.wikipedia.org/wiki/Natural_exponential_family). The 
form of a natural exponential family distribution is given as:
+
+$$
+f_Y(y|\theta, \tau) = h(y, \tau)\exp{\left( \frac{\theta \cdot y - 
A(\theta)}{d(\tau)} \right)}
+$$
+
+where $\theta$ is the parameter of interest and $\tau$ is a dispersion 
parameter. In a GLM the response variable $Y_i$ is assumed to be drawn from a 
natural exponential family distribution:
+
+$$
+Y_i \sim f\left(\cdot|\theta_i, \tau \right)
+$$
+
+where the parameter of interest $\theta_i$ is related to the expected value of 
the response variable $\mu_i$ by
+
+$$
+\mu_i = A'(\theta_i)
+$$
+
+Here, $A'(\theta_i)$ is defined by the form of the distribution selected. GLMs 
also allow specification
+of a link function, which defines the relationship between the expected value 
of the response variable $\mu_i$
+and the so called _linear predictor_ $\eta_i$:
+
+$$
+g(\mu_i) = \eta_i = \vec{x_i}^T \cdot \vec{\beta}
+$$
+
+Often, the link function is chosen such that $A' = g^{-1}$, which yields a 
simplified relationship
+between the parameter of interest $\theta$ and the linear predictor $\eta$. In 
this case, the link
+function $g(\mu)$ is said to be the "canonical" link function.
+
+$$
+\theta_i = 

spark git commit: [SPARK-14400][SQL] ScriptTransformation does not fail the job for bad user command

2016-05-27 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/master b376a4eab -> a96e4151a


[SPARK-14400][SQL] ScriptTransformation does not fail the job for bad user 
command

## What changes were proposed in this pull request?

- Refer to the JIRA for the problem: https://issues.apache.org/jira/browse/SPARK-14400
- The fix is to check in `hasNext()` whether the process has exited with a non-zero exit code. This check, together with the check for writer-thread exceptions, has been moved into a separate method (a minimal sketch of the underlying liveness check appears after the test notes below).

## How was this patch tested?

- Ran a job with an incorrect transform script command and verified that the job fails
- Existing unit tests for `ScriptTransformationSuite`, plus a new unit test
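For readers unfamiliar with the Java 7 limitation the patch works around, here is a minimal standalone sketch (not the patch code itself; `proc` and `stderr` are hypothetical values) of detecting whether a child process is still running without `Process#isAlive()`:

```scala
// Java 7-compatible liveness check: Process#exitValue() throws
// IllegalThreadStateException while the process is still running.
def isAlive(proc: Process): Boolean =
  try {
    proc.exitValue()   // only returns once the process has terminated
    false
  } catch {
    case _: IllegalThreadStateException => true
  }

// Usage sketch: fail fast when the user-supplied transform command died with a
// non-zero status.
def checkExited(proc: Process, stderr: => String): Unit =
  if (!isAlive(proc) && proc.exitValue() != 0) {
    throw new RuntimeException(s"Subprocess exited with status ${proc.exitValue()}. Error: $stderr")
  }
```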

Author: Tejas Patil 

Closes #12194 from tejasapatil/script_transform.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a96e4151
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a96e4151
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a96e4151

Branch: refs/heads/master
Commit: a96e4151a9d429cfaf457c07b4ce174890a3b39b
Parents: b376a4e
Author: Tejas Patil 
Authored: Fri May 27 12:05:11 2016 -0700
Committer: Reynold Xin 
Committed: Fri May 27 12:05:11 2016 -0700

--
 .../spark/sql/execution/SparkPlanTest.scala |  7 +-
 .../hive/execution/ScriptTransformation.scala   | 90 +---
 .../execution/ScriptTransformationSuite.scala   | 18 +++-
 3 files changed, 81 insertions(+), 34 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a96e4151/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala
--
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala
index 9fe0e96..b29e822 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala
@@ -231,7 +231,12 @@ object SparkPlanTest {
 }
   }
 
-  private def executePlan(outputPlan: SparkPlan, spark: SQLContext): Seq[Row] 
= {
+  /**
+   * Runs the plan
+   * @param outputPlan SparkPlan to be executed
+   * @param spark SqlContext used for execution of the plan
+   */
+  def executePlan(outputPlan: SparkPlan, spark: SQLContext): Seq[Row] = {
 val execution = new QueryExecution(spark.sparkSession, null) {
   override lazy val sparkPlan: SparkPlan = outputPlan transform {
 case plan: SparkPlan =>

http://git-wip-us.apache.org/repos/asf/spark/blob/a96e4151/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
--
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
index f6e6a75..9e25e1d 100644
--- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
+++ 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
@@ -32,7 +32,7 @@ import org.apache.hadoop.hive.serde2.AbstractSerDe
 import org.apache.hadoop.hive.serde2.objectinspector._
 import org.apache.hadoop.io.Writable
 
-import org.apache.spark.TaskContext
+import org.apache.spark.{SparkException, TaskContext}
 import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
@@ -127,45 +127,71 @@ case class ScriptTransformation(
 }
 val mutableRow = new SpecificMutableRow(output.map(_.dataType))
 
+private def checkFailureAndPropagate(cause: Throwable = null): Unit = {
+  if (writerThread.exception.isDefined) {
+throw writerThread.exception.get
+  }
+
+  // Checks if the proc is still alive (in case the command that was run was bad).
+  // The ideal way to do this is to use Java 8's Process#isAlive(),
+  // but it cannot be used because Spark still supports Java 7.
+  // The following is a workaround to check whether a process is alive in Java 7.
+  // TODO: Once builds are switched to Java 8, this can be changed.
+  try {
+val exitCode = proc.exitValue()
+if (exitCode != 0) {
+  logError(stderrBuffer.toString) // log the stderr circular buffer
+  throw new SparkException(s"Subprocess exited with status 
$exitCode. " +
+s"Error: ${stderrBuffer.toString}", cause)
+}
+  } catch {
+case _: IllegalThreadStateException =>
+// This means that the process is still 

spark git commit: [SPARK-14400][SQL] ScriptTransformation does not fail the job for bad user command

2016-05-27 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 5ea58898c -> d76e066d3


[SPARK-14400][SQL] ScriptTransformation does not fail the job for bad user 
command

## What changes were proposed in this pull request?

- Refer to the JIRA ticket for the problem: https://issues.apache.org/jira/browse/SPARK-14400
- The fix is to check whether the process has exited with a non-zero exit code in 
`hasNext()`. This check and the check for a writer thread exception have been 
moved into a separate method.

## How was this patch tested?

- Ran a job with an incorrect transform script command and verified that the job fails.
- Existing unit tests in `ScriptTransformationSuite`, plus a new unit test.

Author: Tejas Patil 

Closes #12194 from tejasapatil/script_transform.

(cherry picked from commit a96e4151a9d429cfaf457c07b4ce174890a3b39b)
Signed-off-by: Reynold Xin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d76e066d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d76e066d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d76e066d

Branch: refs/heads/branch-2.0
Commit: d76e066d3355a9942af3ae4f23d81a948c236e5e
Parents: 5ea5889
Author: Tejas Patil 
Authored: Fri May 27 12:05:11 2016 -0700
Committer: Reynold Xin 
Committed: Fri May 27 12:05:17 2016 -0700

--
 .../spark/sql/execution/SparkPlanTest.scala |  7 +-
 .../hive/execution/ScriptTransformation.scala   | 90 +---
 .../execution/ScriptTransformationSuite.scala   | 18 +++-
 3 files changed, 81 insertions(+), 34 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d76e066d/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala
--
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala
index 9fe0e96..b29e822 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala
@@ -231,7 +231,12 @@ object SparkPlanTest {
 }
   }
 
-  private def executePlan(outputPlan: SparkPlan, spark: SQLContext): Seq[Row] 
= {
+  /**
+   * Runs the plan.
+   * @param outputPlan SparkPlan to be executed
+   * @param spark SQLContext used for execution of the plan
+   */
+  def executePlan(outputPlan: SparkPlan, spark: SQLContext): Seq[Row] = {
 val execution = new QueryExecution(spark.sparkSession, null) {
   override lazy val sparkPlan: SparkPlan = outputPlan transform {
 case plan: SparkPlan =>

http://git-wip-us.apache.org/repos/asf/spark/blob/d76e066d/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
--
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
index f6e6a75..9e25e1d 100644
--- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
+++ 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
@@ -32,7 +32,7 @@ import org.apache.hadoop.hive.serde2.AbstractSerDe
 import org.apache.hadoop.hive.serde2.objectinspector._
 import org.apache.hadoop.io.Writable
 
-import org.apache.spark.TaskContext
+import org.apache.spark.{SparkException, TaskContext}
 import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
@@ -127,45 +127,71 @@ case class ScriptTransformation(
 }
 val mutableRow = new SpecificMutableRow(output.map(_.dataType))
 
+private def checkFailureAndPropagate(cause: Throwable = null): Unit = {
+  if (writerThread.exception.isDefined) {
+throw writerThread.exception.get
+  }
+
+  // Checks if the proc is still alive (in case the command that was run was bad).
+  // The ideal way to do this is to use Java 8's Process#isAlive(),
+  // but it cannot be used because Spark still supports Java 7.
+  // The following is a workaround to check whether a process is alive in Java 7.
+  // TODO: Once builds are switched to Java 8, this can be changed.
+  try {
+val exitCode = proc.exitValue()
+if (exitCode != 0) {
+  logError(stderrBuffer.toString) // log the stderr circular buffer
+  throw new SparkException(s"Subprocess exited with status 
$exitCode. " +
+s"Error: ${stderrBuffer.toString}", cause)
+}

spark git commit: [HOTFIX] Scala 2.10 compile GaussianMixtureModel

2016-05-27 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 17f43cc87 -> 5ea58898c


[HOTFIX] Scala 2.10 compile GaussianMixtureModel


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5ea58898
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5ea58898
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5ea58898

Branch: refs/heads/branch-2.0
Commit: 5ea58898cc9413fd0b04b60db230c8894d8bb9ef
Parents: 17f43cc
Author: Andrew Or 
Authored: Fri May 27 11:43:01 2016 -0700
Committer: Andrew Or 
Committed: Fri May 27 11:43:50 2016 -0700

--
 .../org/apache/spark/mllib/clustering/GaussianMixtureModel.scala   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5ea58898/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index 4b06816..f470b0f 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -170,7 +170,7 @@ object GaussianMixtureModel extends 
Loader[GaussianMixtureModel] {
   (weight, new MultivariateGaussian(mu, sigma))
   }.unzip
 
-  new GaussianMixtureModel(weights, gaussians)
+  new GaussianMixtureModel(weights.toArray, gaussians.toArray)
 }
   }
 





spark git commit: [HOTFIX] Scala 2.10 compile GaussianMixtureModel

2016-05-27 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/master 1b98fa2e4 -> b376a4eab


[HOTFIX] Scala 2.10 compile GaussianMixtureModel


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b376a4ea
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b376a4ea
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b376a4ea

Branch: refs/heads/master
Commit: b376a4eabc82d622ea26290345c01465af7a628d
Parents: 1b98fa2
Author: Andrew Or 
Authored: Fri May 27 11:43:01 2016 -0700
Committer: Andrew Or 
Committed: Fri May 27 11:43:01 2016 -0700

--
 .../org/apache/spark/mllib/clustering/GaussianMixtureModel.scala   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b376a4ea/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index 4b06816..f470b0f 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -170,7 +170,7 @@ object GaussianMixtureModel extends 
Loader[GaussianMixtureModel] {
   (weight, new MultivariateGaussian(mu, sigma))
   }.unzip
 
-  new GaussianMixtureModel(weights, gaussians)
+  new GaussianMixtureModel(weights.toArray, gaussians.toArray)
 }
   }
 





spark git commit: [YARN][DOC][MINOR] Remove several obsolete env variables and update the doc

2016-05-27 Thread vanzin
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 074989af9 -> 17f43cc87


[YARN][DOC][MINOR] Remove several obsolete env variables and update the doc

## What changes were proposed in this pull request?

Remove several obsolete environment variables that are no longer supported for 
Spark on YARN, and update the docs to reflect several changes in 2.0.
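
As a hedged illustration of the replacements for the removed environment variables, here is a small Scala sketch using SparkConf properties. The property names (`spark.yarn.queue`, `spark.yarn.dist.files`, `spark.yarn.dist.archives`, `spark.yarn.archive`) are the documented YARN equivalents; the file paths are hypothetical, and the sketch assumes spark-core is on the classpath.

```scala
import org.apache.spark.SparkConf

// A minimal sketch (not from the patch) showing configuration properties that
// replace the removed SPARK_YARN_* environment variables. Paths are hypothetical.
object YarnConfSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("my-app")                                    // was SPARK_YARN_APP_NAME
      .set("spark.yarn.queue", "default")                      // was SPARK_YARN_QUEUE
      .set("spark.yarn.dist.files", "hdfs:///tmp/extra.conf")  // was SPARK_YARN_DIST_FILES
      .set("spark.yarn.dist.archives", "hdfs:///tmp/deps.zip") // was SPARK_YARN_DIST_ARCHIVES
      // Makes the Spark runtime jars visible on the YARN side, as the new doc text describes.
      .set("spark.yarn.archive", "hdfs:///spark/spark-libs.zip")
    conf.getAll.foreach { case (k, v) => println(s"$k=$v") }
  }
}
```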

## How was this patch tested?

N/A

CC vanzin tgravescs

Author: jerryshao 

Closes #13296 from jerryshao/yarn-doc.

(cherry picked from commit 1b98fa2e4382d3d8385cf1ac25d7fd3ae5650475)
Signed-off-by: Marcelo Vanzin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/17f43cc8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/17f43cc8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/17f43cc8

Branch: refs/heads/branch-2.0
Commit: 17f43cc87ed8b3c77b7c34163340da8e2da48eb1
Parents: 074989a
Author: jerryshao 
Authored: Fri May 27 11:31:25 2016 -0700
Committer: Marcelo Vanzin 
Committed: Fri May 27 11:31:37 2016 -0700

--
 conf/spark-env.sh.template | 4 
 docs/running-on-yarn.md| 4 
 2 files changed, 4 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/17f43cc8/conf/spark-env.sh.template
--
diff --git a/conf/spark-env.sh.template b/conf/spark-env.sh.template
index a031cd6..9cffdc3 100755
--- a/conf/spark-env.sh.template
+++ b/conf/spark-env.sh.template
@@ -40,10 +40,6 @@
 # - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1).
 # - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G)
 # - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G)
-# - SPARK_YARN_APP_NAME, The name of your application (Default: Spark)
-# - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests 
(Default: 'default')
-# - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed 
with the job.
-# - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be 
distributed with the job.
 
 # Options for the daemons used in the standalone deploy mode
 # - SPARK_MASTER_IP, to bind the master to a different IP address or hostname

http://git-wip-us.apache.org/repos/asf/spark/blob/17f43cc8/docs/running-on-yarn.md
--
diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md
index f2fbe3c..9833806 100644
--- a/docs/running-on-yarn.md
+++ b/docs/running-on-yarn.md
@@ -60,6 +60,8 @@ Running Spark on YARN requires a binary distribution of Spark 
which is built wit
 Binary distributions can be downloaded from the [downloads 
page](http://spark.apache.org/downloads.html) of the project website.
 To build Spark yourself, refer to [Building Spark](building-spark.html).
 
+To make Spark runtime jars accessible from the YARN side, you can specify `spark.yarn.archive` or `spark.yarn.jars`. For details, please refer to [Spark Properties](running-on-yarn.html#spark-properties). If neither `spark.yarn.archive` nor `spark.yarn.jars` is specified, Spark will create a zip file with all jars under `$SPARK_HOME/jars` and upload it to the distributed cache.
+
 # Configuration
 
 Most of the configs are the same for Spark on YARN as for other deployment 
modes. See the [configuration page](configuration.html) for more information on 
those.  These are configs that are specific to Spark on YARN.
@@ -99,6 +101,8 @@ to the same log file).
 
 If you need a reference to the proper location to put log files in the YARN so 
that YARN can properly display and aggregate them, use 
`spark.yarn.app.container.log.dir` in your `log4j.properties`. For example, 
`log4j.appender.file_appender.File=${spark.yarn.app.container.log.dir}/spark.log`.
 For streaming applications, configuring `RollingFileAppender` and setting file 
location to YARN's log directory will avoid disk overflow caused by large log 
files, and logs can be accessed using YARN's log utility.
 
+To use a custom metrics.properties for the application master and executors, 
update the `$SPARK_CONF_DIR/metrics.properties` file. It will automatically be 
uploaded with other configurations, so you don't need to specify it manually 
with `--files`.
+
  Spark Properties
 
 





spark git commit: [YARN][DOC][MINOR] Remove several obsolete env variables and update the doc

2016-05-27 Thread vanzin
Repository: spark
Updated Branches:
  refs/heads/master 623aae590 -> 1b98fa2e4


[YARN][DOC][MINOR] Remove several obsolete env variables and update the doc

## What changes were proposed in this pull request?

Remove several obsolete environment variables that are no longer supported for 
Spark on YARN, and update the docs to reflect several changes in 2.0.

## How was this patch tested?

N/A

CC vanzin tgravescs

Author: jerryshao 

Closes #13296 from jerryshao/yarn-doc.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1b98fa2e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1b98fa2e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1b98fa2e

Branch: refs/heads/master
Commit: 1b98fa2e4382d3d8385cf1ac25d7fd3ae5650475
Parents: 623aae5
Author: jerryshao 
Authored: Fri May 27 11:31:25 2016 -0700
Committer: Marcelo Vanzin 
Committed: Fri May 27 11:31:25 2016 -0700

--
 conf/spark-env.sh.template | 4 
 docs/running-on-yarn.md| 4 
 2 files changed, 4 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/1b98fa2e/conf/spark-env.sh.template
--
diff --git a/conf/spark-env.sh.template b/conf/spark-env.sh.template
index a031cd6..9cffdc3 100755
--- a/conf/spark-env.sh.template
+++ b/conf/spark-env.sh.template
@@ -40,10 +40,6 @@
 # - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1).
 # - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G)
 # - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G)
-# - SPARK_YARN_APP_NAME, The name of your application (Default: Spark)
-# - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests 
(Default: 'default')
-# - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed 
with the job.
-# - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be 
distributed with the job.
 
 # Options for the daemons used in the standalone deploy mode
 # - SPARK_MASTER_IP, to bind the master to a different IP address or hostname

http://git-wip-us.apache.org/repos/asf/spark/blob/1b98fa2e/docs/running-on-yarn.md
--
diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md
index f2fbe3c..9833806 100644
--- a/docs/running-on-yarn.md
+++ b/docs/running-on-yarn.md
@@ -60,6 +60,8 @@ Running Spark on YARN requires a binary distribution of Spark 
which is built wit
 Binary distributions can be downloaded from the [downloads 
page](http://spark.apache.org/downloads.html) of the project website.
 To build Spark yourself, refer to [Building Spark](building-spark.html).
 
+To make Spark runtime jars accessible from the YARN side, you can specify `spark.yarn.archive` or `spark.yarn.jars`. For details, please refer to [Spark Properties](running-on-yarn.html#spark-properties). If neither `spark.yarn.archive` nor `spark.yarn.jars` is specified, Spark will create a zip file with all jars under `$SPARK_HOME/jars` and upload it to the distributed cache.
+
 # Configuration
 
 Most of the configs are the same for Spark on YARN as for other deployment 
modes. See the [configuration page](configuration.html) for more information on 
those.  These are configs that are specific to Spark on YARN.
@@ -99,6 +101,8 @@ to the same log file).
 
 If you need a reference to the proper location to put log files in the YARN so 
that YARN can properly display and aggregate them, use 
`spark.yarn.app.container.log.dir` in your `log4j.properties`. For example, 
`log4j.appender.file_appender.File=${spark.yarn.app.container.log.dir}/spark.log`.
 For streaming applications, configuring `RollingFileAppender` and setting file 
location to YARN's log directory will avoid disk overflow caused by large log 
files, and logs can be accessed using YARN's log utility.
 
+To use a custom metrics.properties for the application master and executors, 
update the `$SPARK_CONF_DIR/metrics.properties` file. It will automatically be 
uploaded with other configurations, so you don't need to specify it manually 
with `--files`.
+
  Spark Properties
 
 





spark git commit: [SPARK-15531][DEPLOY] spark-class tries to use too much memory when running Launcher

2016-05-27 Thread vanzin
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 30e87b55b -> 074989af9


[SPARK-15531][DEPLOY] spark-class tries to use too much memory when running 
Launcher

## What changes were proposed in this pull request?

Explicitly limit the launcher JVM memory to a modest 128m.
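
A hedged Scala illustration of the idea (capping a short-lived helper JVM with an explicit `-Xmx` instead of letting it inherit the platform default heap), not the actual `spark-class` shell logic; `launcher.jar` is a placeholder path.

```scala
// Prints the command a launcher wrapper might run; it does not start a JVM itself.
object LauncherCommandSketch {
  def main(args: Array[String]): Unit = {
    val runner = sys.env.get("JAVA_HOME").map(_ + "/bin/java").getOrElse("java")
    // Without an explicit -Xmx, a helper JVM usually defaults to 1/4 of physical RAM,
    // far more than the launcher needs to build the final command line.
    val cmd = Seq(runner, "-Xmx128m", "-cp", "launcher.jar",
      "org.apache.spark.launcher.Main") ++ args
    println(cmd.mkString(" "))
  }
}
```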

## How was this patch tested?

Jenkins tests.

Author: Sean Owen 

Closes #13360 from srowen/SPARK-15531.

(cherry picked from commit 623aae5907f4ba8b7079c21328e0c0b5fef7bb00)
Signed-off-by: Marcelo Vanzin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/074989af
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/074989af
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/074989af

Branch: refs/heads/branch-2.0
Commit: 074989af945b8ebb2779f94b8714752b67f3e82f
Parents: 30e87b5
Author: Sean Owen 
Authored: Fri May 27 11:28:28 2016 -0700
Committer: Marcelo Vanzin 
Committed: Fri May 27 11:28:39 2016 -0700

--
 bin/spark-class  | 2 +-
 bin/spark-class2.cmd | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/074989af/bin/spark-class
--
diff --git a/bin/spark-class b/bin/spark-class
index 23a60c6..658e076 100755
--- a/bin/spark-class
+++ b/bin/spark-class
@@ -68,7 +68,7 @@ fi
 # The exit code of the launcher is appended to the output, so the parent shell 
removes it from the
 # command array and checks the value to see if the launcher succeeded.
 build_command() {
-  "$RUNNER" -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main "$@"
+  "$RUNNER" -Xmx128m -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main 
"$@"
   printf "%d\0" $?
 }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/074989af/bin/spark-class2.cmd
--
diff --git a/bin/spark-class2.cmd b/bin/spark-class2.cmd
index db68021..869c0b2 100644
--- a/bin/spark-class2.cmd
+++ b/bin/spark-class2.cmd
@@ -55,7 +55,7 @@ if not "x%JAVA_HOME%"=="x" set RUNNER=%JAVA_HOME%\bin\java
 rem The launcher library prints the command to be executed in a single line 
suitable for being
 rem executed by the batch interpreter. So read all the output of the launcher 
into a variable.
 set LAUNCHER_OUTPUT=%temp%\spark-class-launcher-output-%RANDOM%.txt
-"%RUNNER%" -cp "%LAUNCH_CLASSPATH%" org.apache.spark.launcher.Main %* > 
%LAUNCHER_OUTPUT%
+"%RUNNER%" -Xmx128m -cp "%LAUNCH_CLASSPATH%" org.apache.spark.launcher.Main %* 
> %LAUNCHER_OUTPUT%
 for /f "tokens=*" %%i in (%LAUNCHER_OUTPUT%) do (
   set SPARK_CMD=%%i
 )





spark git commit: [SPARK-15531][DEPLOY] spark-class tries to use too much memory when running Launcher

2016-05-27 Thread vanzin
Repository: spark
Updated Branches:
  refs/heads/master ce756daa4 -> 623aae590


[SPARK-15531][DEPLOY] spark-class tries to use too much memory when running 
Launcher

## What changes were proposed in this pull request?

Explicitly limit the launcher JVM memory to a modest 128m.

## How was this patch tested?

Jenkins tests.

Author: Sean Owen 

Closes #13360 from srowen/SPARK-15531.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/623aae59
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/623aae59
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/623aae59

Branch: refs/heads/master
Commit: 623aae5907f4ba8b7079c21328e0c0b5fef7bb00
Parents: ce756da
Author: Sean Owen 
Authored: Fri May 27 11:28:28 2016 -0700
Committer: Marcelo Vanzin 
Committed: Fri May 27 11:28:28 2016 -0700

--
 bin/spark-class  | 2 +-
 bin/spark-class2.cmd | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/623aae59/bin/spark-class
--
diff --git a/bin/spark-class b/bin/spark-class
index 23a60c6..658e076 100755
--- a/bin/spark-class
+++ b/bin/spark-class
@@ -68,7 +68,7 @@ fi
 # The exit code of the launcher is appended to the output, so the parent shell 
removes it from the
 # command array and checks the value to see if the launcher succeeded.
 build_command() {
-  "$RUNNER" -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main "$@"
+  "$RUNNER" -Xmx128m -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main 
"$@"
   printf "%d\0" $?
 }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/623aae59/bin/spark-class2.cmd
--
diff --git a/bin/spark-class2.cmd b/bin/spark-class2.cmd
index db68021..869c0b2 100644
--- a/bin/spark-class2.cmd
+++ b/bin/spark-class2.cmd
@@ -55,7 +55,7 @@ if not "x%JAVA_HOME%"=="x" set RUNNER=%JAVA_HOME%\bin\java
 rem The launcher library prints the command to be executed in a single line 
suitable for being
 rem executed by the batch interpreter. So read all the output of the launcher 
into a variable.
 set LAUNCHER_OUTPUT=%temp%\spark-class-launcher-output-%RANDOM%.txt
-"%RUNNER%" -cp "%LAUNCH_CLASSPATH%" org.apache.spark.launcher.Main %* > 
%LAUNCHER_OUTPUT%
+"%RUNNER%" -Xmx128m -cp "%LAUNCH_CLASSPATH%" org.apache.spark.launcher.Main %* 
> %LAUNCHER_OUTPUT%
 for /f "tokens=*" %%i in (%LAUNCHER_OUTPUT%) do (
   set SPARK_CMD=%%i
 )





spark git commit: [SPARK-15569] Reduce frequency of updateBytesWritten function in Disk…

2016-05-27 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/master 5bdbedf22 -> ce756daa4


[SPARK-15569] Reduce frequency of updateBytesWritten function in Disk…

## What changes were proposed in this pull request?

While profiling a Spark job that spills a large amount of intermediate data, we found 
that a significant portion of time is spent in the DiskObjectWriter.updateBytesWritten 
function. Looking at the code, we see that the function is called too frequently to 
update the number of bytes written to disk. We should reduce the call frequency to 
avoid this overhead.
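
To make the pattern concrete, here is a small, self-contained Scala sketch (not the Spark code itself) of the batching idea: an expensive metrics update is triggered only every N records, with N raised from 32 to 16384 in this patch. `FakeMetrics`, `run`, and the per-record byte count are hypothetical stand-ins.

```scala
// Hypothetical sketch of batching an expensive per-record update.
object BatchedMetricsSketch {
  final class FakeMetrics {
    var bytesWritten: Long = 0L
    var updates: Int = 0
  }

  val UpdateInterval = 16384 // previously 32; a larger interval means fewer expensive calls

  def run(records: Int): FakeMetrics = {
    val metrics = new FakeMetrics
    var numRecordsWritten = 0
    var pendingBytes = 0L
    def updateBytesWritten(): Unit = { // stands in for the expensive metrics call
      metrics.bytesWritten += pendingBytes
      pendingBytes = 0L
      metrics.updates += 1
    }
    for (_ <- 0 until records) {
      pendingBytes += 8L // pretend each record writes 8 bytes
      numRecordsWritten += 1
      if (numRecordsWritten % UpdateInterval == 0) {
        updateBytesWritten()
      }
    }
    updateBytesWritten() // flush on close, as the real writer does on commit
    metrics
  }

  def main(args: Array[String]): Unit = {
    val m = run(100000)
    println(s"bytesWritten=${m.bytesWritten}, expensive updates=${m.updates}")
  }
}
```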

## How was this patch tested?

Tested by running the job on a cluster and observed a 20% CPU gain from this change.

Author: Sital Kedia 

Closes #13332 from sitalkedia/DiskObjectWriter.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ce756daa
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ce756daa
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ce756daa

Branch: refs/heads/master
Commit: ce756daa4f012ebdc5a41bf5a89ff11b6dfdab8c
Parents: 5bdbedf
Author: Sital Kedia 
Authored: Fri May 27 11:22:39 2016 -0700
Committer: Andrew Or 
Committed: Fri May 27 11:22:39 2016 -0700

--
 .../apache/spark/storage/DiskBlockObjectWriter.scala|  3 +--
 .../spark/storage/DiskBlockObjectWriterSuite.scala  | 12 ++--
 2 files changed, 7 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ce756daa/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala
--
diff --git 
a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala 
b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala
index ab97d2e..5b493f4 100644
--- a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala
+++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala
@@ -203,8 +203,7 @@ private[spark] class DiskBlockObjectWriter(
 numRecordsWritten += 1
 writeMetrics.incRecordsWritten(1)
 
-// TODO: call updateBytesWritten() less frequently.
-if (numRecordsWritten % 32 == 0) {
+if (numRecordsWritten % 16384 == 0) {
   updateBytesWritten()
 }
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/ce756daa/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala
--
diff --git 
a/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala 
b/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala
index 8eff3c2..ec4ef4b 100644
--- 
a/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala
+++ 
b/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala
@@ -53,13 +53,13 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with 
BeforeAndAfterEach {
 assert(writeMetrics.recordsWritten === 1)
 // Metrics don't update on every write
 assert(writeMetrics.bytesWritten == 0)
-// After 32 writes, metrics should update
-for (i <- 0 until 32) {
+// After 16384 writes, metrics should update
+for (i <- 0 until 16384) {
   writer.flush()
   writer.write(Long.box(i), Long.box(i))
 }
 assert(writeMetrics.bytesWritten > 0)
-assert(writeMetrics.recordsWritten === 33)
+assert(writeMetrics.recordsWritten === 16385)
 writer.commitAndClose()
 assert(file.length() == writeMetrics.bytesWritten)
   }
@@ -75,13 +75,13 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with 
BeforeAndAfterEach {
 assert(writeMetrics.recordsWritten === 1)
 // Metrics don't update on every write
 assert(writeMetrics.bytesWritten == 0)
-// After 32 writes, metrics should update
-for (i <- 0 until 32) {
+// After 16384 writes, metrics should update
+for (i <- 0 until 16384) {
   writer.flush()
   writer.write(Long.box(i), Long.box(i))
 }
 assert(writeMetrics.bytesWritten > 0)
-assert(writeMetrics.recordsWritten === 33)
+assert(writeMetrics.recordsWritten === 16385)
 writer.revertPartialWritesAndClose()
 assert(writeMetrics.bytesWritten == 0)
 assert(writeMetrics.recordsWritten == 0)





spark git commit: [SPARK-15569] Reduce frequency of updateBytesWritten function in Disk…

2016-05-27 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 89fdb6972 -> 30e87b55b


[SPARK-15569] Reduce frequency of updateBytesWritten function in Disk…

## What changes were proposed in this pull request?

While profiling a Spark job that spills a large amount of intermediate data, we found 
that a significant portion of time is spent in the DiskObjectWriter.updateBytesWritten 
function. Looking at the code, we see that the function is called too frequently to 
update the number of bytes written to disk. We should reduce the call frequency to 
avoid this overhead.

## How was this patch tested?

Tested by running the job on a cluster and observed a 20% CPU gain from this change.

Author: Sital Kedia 

Closes #13332 from sitalkedia/DiskObjectWriter.

(cherry picked from commit ce756daa4f012ebdc5a41bf5a89ff11b6dfdab8c)
Signed-off-by: Andrew Or 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/30e87b55
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/30e87b55
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/30e87b55

Branch: refs/heads/branch-2.0
Commit: 30e87b55b6f59ca029778087710effc768fafc35
Parents: 89fdb69
Author: Sital Kedia 
Authored: Fri May 27 11:22:39 2016 -0700
Committer: Andrew Or 
Committed: Fri May 27 11:22:48 2016 -0700

--
 .../apache/spark/storage/DiskBlockObjectWriter.scala|  3 +--
 .../spark/storage/DiskBlockObjectWriterSuite.scala  | 12 ++--
 2 files changed, 7 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/30e87b55/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala
--
diff --git 
a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala 
b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala
index ab97d2e..5b493f4 100644
--- a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala
+++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala
@@ -203,8 +203,7 @@ private[spark] class DiskBlockObjectWriter(
 numRecordsWritten += 1
 writeMetrics.incRecordsWritten(1)
 
-// TODO: call updateBytesWritten() less frequently.
-if (numRecordsWritten % 32 == 0) {
+if (numRecordsWritten % 16384 == 0) {
   updateBytesWritten()
 }
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/30e87b55/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala
--
diff --git 
a/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala 
b/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala
index 8eff3c2..ec4ef4b 100644
--- 
a/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala
+++ 
b/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala
@@ -53,13 +53,13 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with 
BeforeAndAfterEach {
 assert(writeMetrics.recordsWritten === 1)
 // Metrics don't update on every write
 assert(writeMetrics.bytesWritten == 0)
-// After 32 writes, metrics should update
-for (i <- 0 until 32) {
+// After 16384 writes, metrics should update
+for (i <- 0 until 16384) {
   writer.flush()
   writer.write(Long.box(i), Long.box(i))
 }
 assert(writeMetrics.bytesWritten > 0)
-assert(writeMetrics.recordsWritten === 33)
+assert(writeMetrics.recordsWritten === 16385)
 writer.commitAndClose()
 assert(file.length() == writeMetrics.bytesWritten)
   }
@@ -75,13 +75,13 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with 
BeforeAndAfterEach {
 assert(writeMetrics.recordsWritten === 1)
 // Metrics don't update on every write
 assert(writeMetrics.bytesWritten == 0)
-// After 32 writes, metrics should update
-for (i <- 0 until 32) {
+// After 16384 writes, metrics should update
+for (i <- 0 until 16384) {
   writer.flush()
   writer.write(Long.box(i), Long.box(i))
 }
 assert(writeMetrics.bytesWritten > 0)
-assert(writeMetrics.recordsWritten === 33)
+assert(writeMetrics.recordsWritten === 16385)
 writer.revertPartialWritesAndClose()
 assert(writeMetrics.bytesWritten == 0)
 assert(writeMetrics.recordsWritten == 0)





spark git commit: [MINOR][DOCS] Typo fixes in Dataset scaladoc

2016-05-27 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 f52a95248 -> 89fdb6972


[MINOR][DOCS] Typo fixes in Dataset scaladoc

## What changes were proposed in this pull request?

Minor typo fixes in Dataset scaladoc
* Corrected the context type to SparkSession, not SQLContext.
liancheng rxin andrewor14

## How was this patch tested?

Compiled locally

Author: Xinh Huynh 

Closes #13330 from xinhhuynh/fix-dataset-typos.

(cherry picked from commit 5bdbedf2201efa6c34392aa9eff709761f027e1d)
Signed-off-by: Andrew Or 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/89fdb697
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/89fdb697
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/89fdb697

Branch: refs/heads/branch-2.0
Commit: 89fdb6972d5410f250bc56f8a834c939ee6653d2
Parents: f52a952
Author: Xinh Huynh 
Authored: Fri May 27 11:13:53 2016 -0700
Committer: Andrew Or 
Committed: Fri May 27 11:14:01 2016 -0700

--
 sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/89fdb697/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 85f0cf8..abd16f2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -121,7 +121,7 @@ private[sql] object Dataset {
  *
  * A more concrete example in Scala:
  * {{{
- *   // To create Dataset[Row] using SQLContext
+ *   // To create Dataset[Row] using SparkSession
  *   val people = spark.read.parquet("...")
  *   val department = spark.read.parquet("...")
  *
@@ -133,7 +133,7 @@ private[sql] object Dataset {
  *
  * and in Java:
  * {{{
- *   // To create Dataset using SQLContext
+ *   // To create Dataset using SparkSession
 *   Dataset<Row> people = spark.read().parquet("...");
 *   Dataset<Row> department = spark.read().parquet("...");
  *





spark git commit: [MINOR][DOCS] Typo fixes in Dataset scaladoc

2016-05-27 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/master a52e68133 -> 5bdbedf22


[MINOR][DOCS] Typo fixes in Dataset scaladoc

## What changes were proposed in this pull request?

Minor typo fixes in Dataset scaladoc
* Corrected the context type to SparkSession, not SQLContext.
liancheng rxin andrewor14

## How was this patch tested?

Compiled locally

Author: Xinh Huynh 

Closes #13330 from xinhhuynh/fix-dataset-typos.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5bdbedf2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5bdbedf2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5bdbedf2

Branch: refs/heads/master
Commit: 5bdbedf2201efa6c34392aa9eff709761f027e1d
Parents: a52e681
Author: Xinh Huynh 
Authored: Fri May 27 11:13:53 2016 -0700
Committer: Andrew Or 
Committed: Fri May 27 11:13:53 2016 -0700

--
 sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5bdbedf2/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 85f0cf8..abd16f2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -121,7 +121,7 @@ private[sql] object Dataset {
  *
  * A more concrete example in Scala:
  * {{{
- *   // To create Dataset[Row] using SQLContext
+ *   // To create Dataset[Row] using SparkSession
  *   val people = spark.read.parquet("...")
  *   val department = spark.read.parquet("...")
  *
@@ -133,7 +133,7 @@ private[sql] object Dataset {
  *
  * and in Java:
  * {{{
- *   // To create Dataset using SQLContext
+ *   // To create Dataset using SparkSession
 *   Dataset<Row> people = spark.read().parquet("...");
 *   Dataset<Row> department = spark.read().parquet("...");
  *





spark git commit: [SPARK-15597][SQL] Add SparkSession.emptyDataset

2016-05-27 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 e69639f43 -> f52a95248


[SPARK-15597][SQL] Add SparkSession.emptyDataset

## What changes were proposed in this pull request?
This patch adds a new function emptyDataset to SparkSession, for creating an 
empty dataset.
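
A short usage sketch, grounded in the new test case shown in the diff below; the local `SparkSession` setup is illustrative and not part of the patch.

```scala
import org.apache.spark.sql.SparkSession

object EmptyDatasetSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("empty-ds").getOrCreate()
    import spark.implicits._

    // The new API: a Dataset[T] with zero elements.
    val ds = spark.emptyDataset[Int]
    assert(ds.count() == 0L)
    assert(ds.collect().isEmpty)

    spark.stop()
  }
}
```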

## How was this patch tested?
Added a test case.

Author: Reynold Xin 

Closes #13344 from rxin/SPARK-15597.

(cherry picked from commit a52e6813392ba4bdb1b818694b7ced8f6caa6a2b)
Signed-off-by: Andrew Or 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f52a9524
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f52a9524
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f52a9524

Branch: refs/heads/branch-2.0
Commit: f52a9524865b8c56058a65b29a1aaacffb709f69
Parents: e69639f
Author: Reynold Xin 
Authored: Fri May 27 11:13:09 2016 -0700
Committer: Andrew Or 
Committed: Fri May 27 11:13:17 2016 -0700

--
 .../main/scala/org/apache/spark/sql/SparkSession.scala  | 12 
 .../test/scala/org/apache/spark/sql/DatasetSuite.scala  |  6 ++
 2 files changed, 18 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f52a9524/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
index aa60048..c9276cf 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
@@ -224,6 +224,18 @@ class SparkSession private(
 
   /**
* :: Experimental ::
+   * Creates a new [[Dataset]] of type T containing zero elements.
+   *
+   * @since 2.0.0
+   */
+  @Experimental
+  def emptyDataset[T: Encoder]: Dataset[T] = {
+val encoder = implicitly[Encoder[T]]
+new Dataset(self, LocalRelation(encoder.schema.toAttributes), encoder)
+  }
+
+  /**
+   * :: Experimental ::
* Creates a [[DataFrame]] from an RDD of Product (e.g. case classes, 
tuples).
*
* @group dataframes

http://git-wip-us.apache.org/repos/asf/spark/blob/f52a9524/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index 2a65916..e395007 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -46,6 +46,12 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
   1, 1, 1)
   }
 
+  test("emptyDataset") {
+val ds = spark.emptyDataset[Int]
+assert(ds.count() == 0L)
+assert(ds.collect() sameElements Array.empty[Int])
+  }
+
   test("range") {
 assert(spark.range(10).map(_ + 1).reduce(_ + _) == 55)
 assert(spark.range(10).map{ case i: java.lang.Long => i + 1 }.reduce(_ + 
_) == 55)





spark git commit: [SPARK-15597][SQL] Add SparkSession.emptyDataset

2016-05-27 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/master 635fb30f8 -> a52e68133


[SPARK-15597][SQL] Add SparkSession.emptyDataset

## What changes were proposed in this pull request?
This patch adds a new function emptyDataset to SparkSession, for creating an 
empty dataset.

## How was this patch tested?
Added a test case.

Author: Reynold Xin 

Closes #13344 from rxin/SPARK-15597.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a52e6813
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a52e6813
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a52e6813

Branch: refs/heads/master
Commit: a52e6813392ba4bdb1b818694b7ced8f6caa6a2b
Parents: 635fb30
Author: Reynold Xin 
Authored: Fri May 27 11:13:09 2016 -0700
Committer: Andrew Or 
Committed: Fri May 27 11:13:09 2016 -0700

--
 .../main/scala/org/apache/spark/sql/SparkSession.scala  | 12 
 .../test/scala/org/apache/spark/sql/DatasetSuite.scala  |  6 ++
 2 files changed, 18 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a52e6813/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
index aa60048..c9276cf 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
@@ -224,6 +224,18 @@ class SparkSession private(
 
   /**
* :: Experimental ::
+   * Creates a new [[Dataset]] of type T containing zero elements.
+   *
+   * @since 2.0.0
+   */
+  @Experimental
+  def emptyDataset[T: Encoder]: Dataset[T] = {
+val encoder = implicitly[Encoder[T]]
+new Dataset(self, LocalRelation(encoder.schema.toAttributes), encoder)
+  }
+
+  /**
+   * :: Experimental ::
* Creates a [[DataFrame]] from an RDD of Product (e.g. case classes, 
tuples).
*
* @group dataframes

http://git-wip-us.apache.org/repos/asf/spark/blob/a52e6813/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index 2a65916..e395007 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -46,6 +46,12 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
   1, 1, 1)
   }
 
+  test("emptyDataset") {
+val ds = spark.emptyDataset[Int]
+assert(ds.count() == 0L)
+assert(ds.collect() sameElements Array.empty[Int])
+  }
+
   test("range") {
 assert(spark.range(10).map(_ + 1).reduce(_ + _) == 55)
 assert(spark.range(10).map{ case i: java.lang.Long => i + 1 }.reduce(_ + 
_) == 55)





spark git commit: [SPARK-15599][SQL][DOCS] API docs for `createDataset` functions in SparkSession

2016-05-27 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 a14c88acc -> e69639f43


[SPARK-15599][SQL][DOCS] API docs for `createDataset` functions in SparkSession

## What changes were proposed in this pull request?

Adds API docs and usage examples for the three `createDataset` overloads in 
`SparkSession`.
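
For reference, a compact Scala sketch exercising the three overloads documented by this patch; the session setup, the `Person` case class, and the sample data are illustrative assumptions, not part of the change.

```scala
import java.util.Arrays

import org.apache.spark.sql.{Encoders, SparkSession}

// Sample type for the Seq-based overload; illustrative only.
case class Person(name: String, age: Long)

object CreateDatasetSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("create-ds").getOrCreate()
    import spark.implicits._

    // 1. From a local Seq, with the encoder derived through spark.implicits.
    val fromSeq = spark.createDataset(Seq(Person("Michael", 29), Person("Andy", 30)))

    // 2. From an RDD.
    val fromRdd = spark.createDataset(spark.sparkContext.parallelize(Seq(1, 2, 3)))

    // 3. From a java.util.List, passing the encoder explicitly.
    val fromList = spark.createDataset(Arrays.asList("hello", "world"))(Encoders.STRING)

    fromSeq.show()
    println(s"ints: ${fromRdd.count()}, strings: ${fromList.count()}")
    spark.stop()
  }
}
```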

## How was this patch tested?

N/A

Author: Sameer Agarwal 

Closes #13345 from sameeragarwal/dataset-doc.

(cherry picked from commit 635fb30f83a66cc56f5fecfed5bff77873bf49a6)
Signed-off-by: Andrew Or 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e69639f4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e69639f4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e69639f4

Branch: refs/heads/branch-2.0
Commit: e69639f4334aae3ace5e50452603dd667467ea9a
Parents: a14c88a
Author: Sameer Agarwal 
Authored: Fri May 27 11:11:31 2016 -0700
Committer: Andrew Or 
Committed: Fri May 27 11:11:40 2016 -0700

--
 .../org/apache/spark/sql/SparkSession.scala | 63 
 1 file changed, 63 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e69639f4/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
index 5dabe0e..aa60048 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
@@ -376,6 +376,40 @@ class SparkSession private(
 Dataset.ofRows(self, LogicalRelation(baseRelation))
   }
 
+  /* --- *
+   |  Methods for creating DataSets  |
+   * --- */
+
+  /**
+   * :: Experimental ::
+   * Creates a [[Dataset]] from a local Seq of data of a given type. This 
method requires an
+   * encoder (to convert a JVM object of type `T` to and from the internal 
Spark SQL representation)
+   * that is generally created automatically through implicits from a 
`SparkSession`, or can be
+   * created explicitly by calling static methods on [[Encoders]].
+   *
+   * == Example ==
+   *
+   * {{{
+   *
+   *   import spark.implicits._
+   *   case class Person(name: String, age: Long)
+   *   val data = Seq(Person("Michael", 29), Person("Andy", 30), 
Person("Justin", 19))
+   *   val ds = spark.createDataset(data)
+   *
+   *   ds.show()
+   *   // +---+---+
+   *   // |   name|age|
+   *   // +---+---+
+   *   // |Michael| 29|
+   *   // |   Andy| 30|
+   *   // | Justin| 19|
+   *   // +---+---+
+   * }}}
+   *
+   * @since 2.0.0
+   * @group dataset
+   */
+  @Experimental
   def createDataset[T : Encoder](data: Seq[T]): Dataset[T] = {
 val enc = encoderFor[T]
 val attributes = enc.schema.toAttributes
@@ -384,6 +418,17 @@ class SparkSession private(
 Dataset[T](self, plan)
   }
 
+  /**
+   * :: Experimental ::
+   * Creates a [[Dataset]] from an RDD of a given type. This method requires an
+   * encoder (to convert a JVM object of type `T` to and from the internal 
Spark SQL representation)
+   * that is generally created automatically through implicits from a 
`SparkSession`, or can be
+   * created explicitly by calling static methods on [[Encoders]].
+   *
+   * @since 2.0.0
+   * @group dataset
+   */
+  @Experimental
   def createDataset[T : Encoder](data: RDD[T]): Dataset[T] = {
 val enc = encoderFor[T]
 val attributes = enc.schema.toAttributes
@@ -392,6 +437,24 @@ class SparkSession private(
 Dataset[T](self, plan)
   }
 
+  /**
+   * :: Experimental ::
+   * Creates a [[Dataset]] from a [[java.util.List]] of a given type. This 
method requires an
+   * encoder (to convert a JVM object of type `T` to and from the internal 
Spark SQL representation)
+   * that is generally created automatically through implicits from a 
`SparkSession`, or can be
+   * created explicitly by calling static methods on [[Encoders]].
+   *
+   * == Java Example ==
+   *
+   * {{{
+   * List<String> data = Arrays.asList("hello", "world");
+   * Dataset<String> ds = spark.createDataset(data, Encoders.STRING());
+   * }}}
+   *
+   * @since 2.0.0
+   * @group dataset
+   */
+  @Experimental
   def createDataset[T : Encoder](data: java.util.List[T]): Dataset[T] = {
 createDataset(data.asScala)
   }





spark git commit: [SPARK-15599][SQL][DOCS] API docs for `createDataset` functions in SparkSession

2016-05-27 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/master 4538443e2 -> 635fb30f8


[SPARK-15599][SQL][DOCS] API docs for `createDataset` functions in SparkSession

## What changes were proposed in this pull request?

Adds API docs and usage examples for the three `createDataset` overloads in 
`SparkSession`.

## How was this patch tested?

N/A

Author: Sameer Agarwal 

Closes #13345 from sameeragarwal/dataset-doc.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/635fb30f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/635fb30f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/635fb30f

Branch: refs/heads/master
Commit: 635fb30f83a66cc56f5fecfed5bff77873bf49a6
Parents: 4538443
Author: Sameer Agarwal 
Authored: Fri May 27 11:11:31 2016 -0700
Committer: Andrew Or 
Committed: Fri May 27 11:11:31 2016 -0700

--
 .../org/apache/spark/sql/SparkSession.scala | 63 
 1 file changed, 63 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/635fb30f/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
index 5dabe0e..aa60048 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
@@ -376,6 +376,40 @@ class SparkSession private(
 Dataset.ofRows(self, LogicalRelation(baseRelation))
   }
 
+  /* --- *
+   |  Methods for creating DataSets  |
+   * --- */
+
+  /**
+   * :: Experimental ::
+   * Creates a [[Dataset]] from a local Seq of data of a given type. This 
method requires an
+   * encoder (to convert a JVM object of type `T` to and from the internal 
Spark SQL representation)
+   * that is generally created automatically through implicits from a 
`SparkSession`, or can be
+   * created explicitly by calling static methods on [[Encoders]].
+   *
+   * == Example ==
+   *
+   * {{{
+   *
+   *   import spark.implicits._
+   *   case class Person(name: String, age: Long)
+   *   val data = Seq(Person("Michael", 29), Person("Andy", 30), 
Person("Justin", 19))
+   *   val ds = spark.createDataset(data)
+   *
+   *   ds.show()
+   *   // +---+---+
+   *   // |   name|age|
+   *   // +---+---+
+   *   // |Michael| 29|
+   *   // |   Andy| 30|
+   *   // | Justin| 19|
+   *   // +---+---+
+   * }}}
+   *
+   * @since 2.0.0
+   * @group dataset
+   */
+  @Experimental
   def createDataset[T : Encoder](data: Seq[T]): Dataset[T] = {
 val enc = encoderFor[T]
 val attributes = enc.schema.toAttributes
@@ -384,6 +418,17 @@ class SparkSession private(
 Dataset[T](self, plan)
   }
 
+  /**
+   * :: Experimental ::
+   * Creates a [[Dataset]] from an RDD of a given type. This method requires an
+   * encoder (to convert a JVM object of type `T` to and from the internal 
Spark SQL representation)
+   * that is generally created automatically through implicits from a 
`SparkSession`, or can be
+   * created explicitly by calling static methods on [[Encoders]].
+   *
+   * @since 2.0.0
+   * @group dataset
+   */
+  @Experimental
   def createDataset[T : Encoder](data: RDD[T]): Dataset[T] = {
 val enc = encoderFor[T]
 val attributes = enc.schema.toAttributes
@@ -392,6 +437,24 @@ class SparkSession private(
 Dataset[T](self, plan)
   }
 
+  /**
+   * :: Experimental ::
+   * Creates a [[Dataset]] from a [[java.util.List]] of a given type. This 
method requires an
+   * encoder (to convert a JVM object of type `T` to and from the internal 
Spark SQL representation)
+   * that is generally created automatically through implicits from a 
`SparkSession`, or can be
+   * created explicitly by calling static methods on [[Encoders]].
+   *
+   * == Java Example ==
+   *
+   * {{{
+   * List<String> data = Arrays.asList("hello", "world");
+   * Dataset<String> ds = spark.createDataset(data, Encoders.STRING());
+   * }}}
+   *
+   * @since 2.0.0
+   * @group dataset
+   */
+  @Experimental
   def createDataset[T : Encoder](data: java.util.List[T]): Dataset[T] = {
 createDataset(data.asScala)
   }





spark git commit: [SPARK-15584][SQL] Abstract duplicate code: `spark.sql.sources.` properties

2016-05-27 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/master d24e25157 -> 4538443e2


[SPARK-15584][SQL] Abstract duplicate code: `spark.sql.sources.` properties

## What changes were proposed in this pull request?

This PR replaces `spark.sql.sources.` strings with 
`CreateDataSourceTableUtils.*` constant variables.
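
As a hedged illustration of the refactoring pattern (a constants object instead of scattered string literals), loosely modeled on the `DATASOURCE_*` values visible in the diff below; the object and call site here are simplified stand-ins, not the actual Spark classes.

```scala
// Simplified stand-in for CreateDataSourceTableUtils: property keys defined once.
object DataSourcePropertyKeys {
  val Prefix = "spark.sql.sources."
  val Provider = Prefix + "provider"
  val WriteJobUuid = Prefix + "writeJobUUID"
  val OutputPath = Prefix + "output.path"
}

object ConstantsSketch {
  def main(args: Array[String]): Unit = {
    val props = Map(
      DataSourcePropertyKeys.Provider -> "parquet",
      DataSourcePropertyKeys.WriteJobUuid -> "1234-abcd")

    // Before the refactoring, call sites repeated the raw string:
    //   props.get("spark.sql.sources.provider")
    // After, they reference the shared constant, so a typo fails at compile time:
    println(props.get(DataSourcePropertyKeys.Provider))
  }
}
```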

## How was this patch tested?

Pass the existing Jenkins tests.

Author: Dongjoon Hyun 

Closes #13349 from dongjoon-hyun/SPARK-15584.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4538443e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4538443e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4538443e

Branch: refs/heads/master
Commit: 4538443e276597530a27c6922e48503677b13956
Parents: d24e251
Author: Dongjoon Hyun 
Authored: Fri May 27 11:10:31 2016 -0700
Committer: Andrew Or 
Committed: Fri May 27 11:10:31 2016 -0700

--
 .../spark/ml/source/libsvm/LibSVMRelation.scala |  3 +-
 .../command/createDataSourceTables.scala| 28 +-
 .../spark/sql/execution/command/ddl.scala   | 19 +++
 .../spark/sql/execution/command/tables.scala|  4 +-
 .../datasources/DataSourceStrategy.scala|  2 +-
 .../execution/datasources/WriterContainer.scala | 10 ++--
 .../execution/datasources/csv/CSVRelation.scala |  3 +-
 .../datasources/json/JsonFileFormat.scala   |  5 +-
 .../datasources/parquet/ParquetFileFormat.scala |  4 +-
 .../datasources/text/TextFileFormat.scala   |  3 +-
 .../spark/sql/execution/command/DDLSuite.scala  | 10 ++--
 .../spark/sql/hive/HiveMetastoreCatalog.scala   | 18 +++---
 .../spark/sql/hive/orc/OrcFileFormat.scala  |  3 +-
 .../sql/hive/MetastoreDataSourcesSuite.scala| 58 ++--
 .../sql/hive/execution/HiveCommandSuite.scala   | 16 +++---
 .../spark/sql/sources/SimpleTextRelation.scala  |  3 +-
 16 files changed, 95 insertions(+), 94 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4538443e/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala 
b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
index 64ebf0c..7629369 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
@@ -34,6 +34,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.catalyst.expressions.AttributeReference
 import 
org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
+import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils
 import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
@@ -51,7 +52,7 @@ private[libsvm] class LibSVMOutputWriter(
 new TextOutputFormat[NullWritable, Text]() {
   override def getDefaultWorkFile(context: TaskAttemptContext, extension: 
String): Path = {
 val configuration = context.getConfiguration
-val uniqueWriteJobId = 
configuration.get("spark.sql.sources.writeJobUUID")
+val uniqueWriteJobId = 
configuration.get(CreateDataSourceTableUtils.DATASOURCE_WRITEJOBUUID)
 val taskAttemptId = context.getTaskAttemptID
 val split = taskAttemptId.getTaskID.getId
 new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$extension")

http://git-wip-us.apache.org/repos/asf/spark/blob/4538443e/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
index deedb68..4b9aab6 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
@@ -256,15 +256,15 @@ case class CreateDataSourceTableAsSelectCommand(
 
 object CreateDataSourceTableUtils extends Logging {
 
-  // TODO: Actually replace usages with these variables (SPARK-15584)
-
   val DATASOURCE_PREFIX = "spark.sql.sources."
   val DATASOURCE_PROVIDER = DATASOURCE_PREFIX + "provider"
   val DATASOURCE_WRITEJOBUUID = DATASOURCE_PREFIX + "writeJobUUID"
   val DATASOURCE_OUTPUTPATH = DATASOURCE_PREFIX + "output.path"
-  val DATASOURCE_SCHEMA_PREFIX = DATASOURCE_PREFIX + "schema."
+  val 

spark git commit: [SPARK-15584][SQL] Abstract duplicate code: `spark.sql.sources.` properties

2016-05-27 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 a355edeef -> a14c88acc


[SPARK-15584][SQL] Abstract duplicate code: `spark.sql.sources.` properties

## What changes were proposed in this pull request?

This PR replaces `spark.sql.sources.` strings with 
`CreateDataSourceTableUtils.*` constant variables.

## How was this patch tested?

Pass the existing Jenkins tests.

Author: Dongjoon Hyun 

Closes #13349 from dongjoon-hyun/SPARK-15584.

(cherry picked from commit 4538443e276597530a27c6922e48503677b13956)
Signed-off-by: Andrew Or 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a14c88ac
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a14c88ac
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a14c88ac

Branch: refs/heads/branch-2.0
Commit: a14c88acce0733f3db8b0508ae8b0417822e08d8
Parents: a355ede
Author: Dongjoon Hyun 
Authored: Fri May 27 11:10:31 2016 -0700
Committer: Andrew Or 
Committed: Fri May 27 11:10:39 2016 -0700

--
 .../spark/ml/source/libsvm/LibSVMRelation.scala |  3 +-
 .../command/createDataSourceTables.scala| 28 +-
 .../spark/sql/execution/command/ddl.scala   | 19 +++
 .../spark/sql/execution/command/tables.scala|  4 +-
 .../datasources/DataSourceStrategy.scala|  2 +-
 .../execution/datasources/WriterContainer.scala | 10 ++--
 .../execution/datasources/csv/CSVRelation.scala |  3 +-
 .../datasources/json/JsonFileFormat.scala   |  5 +-
 .../datasources/parquet/ParquetFileFormat.scala |  4 +-
 .../datasources/text/TextFileFormat.scala   |  3 +-
 .../spark/sql/execution/command/DDLSuite.scala  | 10 ++--
 .../spark/sql/hive/HiveMetastoreCatalog.scala   | 18 +++---
 .../spark/sql/hive/orc/OrcFileFormat.scala  |  3 +-
 .../sql/hive/MetastoreDataSourcesSuite.scala| 58 ++--
 .../sql/hive/execution/HiveCommandSuite.scala   | 16 +++---
 .../spark/sql/sources/SimpleTextRelation.scala  |  3 +-
 16 files changed, 95 insertions(+), 94 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a14c88ac/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala 
b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
index 64ebf0c..7629369 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
@@ -34,6 +34,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.catalyst.expressions.AttributeReference
 import 
org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
+import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils
 import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
@@ -51,7 +52,7 @@ private[libsvm] class LibSVMOutputWriter(
 new TextOutputFormat[NullWritable, Text]() {
   override def getDefaultWorkFile(context: TaskAttemptContext, extension: 
String): Path = {
 val configuration = context.getConfiguration
-val uniqueWriteJobId = 
configuration.get("spark.sql.sources.writeJobUUID")
+val uniqueWriteJobId = 
configuration.get(CreateDataSourceTableUtils.DATASOURCE_WRITEJOBUUID)
 val taskAttemptId = context.getTaskAttemptID
 val split = taskAttemptId.getTaskID.getId
 new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$extension")

http://git-wip-us.apache.org/repos/asf/spark/blob/a14c88ac/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
index deedb68..4b9aab6 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
@@ -256,15 +256,15 @@ case class CreateDataSourceTableAsSelectCommand(
 
 object CreateDataSourceTableUtils extends Logging {
 
-  // TODO: Actually replace usages with these variables (SPARK-15584)
-
   val DATASOURCE_PREFIX = "spark.sql.sources."
   val DATASOURCE_PROVIDER = DATASOURCE_PREFIX + "provider"
   val DATASOURCE_WRITEJOBUUID = DATASOURCE_PREFIX + "writeJobUUID"
   val 

spark git commit: [SPARK-15603][MLLIB] Replace SQLContext with SparkSession in ML/MLLib

2016-05-27 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 2cb84dd23 -> a355edeef


[SPARK-15603][MLLIB] Replace SQLContext with SparkSession in ML/MLLib

## What changes were proposed in this pull request?

This PR replaces all deprecated `SQLContext` occurrences with `SparkSession` in 
the `ML/MLLib` modules, except for the following two classes, which use 
`SQLContext` in their function signatures (a sketch of the replacement pattern 
follows the list below).
- ReadWrite.scala
- TreeModels.scala
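
A minimal sketch of the replacement pattern, assuming a Spark master is 
available so `getOrCreate()` succeeds; the `gaussiansToDF` helper and its tuple 
payload are illustrative, while the session lookup mirrors the 
`GaussianMixtureModel` hunk below:

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}

object SessionMigrationSketch {
  def gaussiansToDF(modelGaussians: Seq[(Double, Double)]): DataFrame = {
    // Before: SQLContext.getOrCreate(SparkContext.getOrCreate())
    //           .createDataFrame(modelGaussians).toDF("mean", "cov")
    SparkSession.builder().getOrCreate()
      .createDataFrame(modelGaussians).toDF("mean", "cov")
  }
}
```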

## How was this patch tested?

Pass the existing Jenkins tests.

Author: Dongjoon Hyun 

Closes #13352 from dongjoon-hyun/SPARK-15603.

(cherry picked from commit d24e251572d39a453293cabfe14e4aed25a55208)
Signed-off-by: Andrew Or 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a355edee
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a355edee
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a355edee

Branch: refs/heads/branch-2.0
Commit: a355edeefa16988da8b05d2539a91277e75e823c
Parents: 2cb84dd
Author: Dongjoon Hyun 
Authored: Fri May 27 11:09:15 2016 -0700
Committer: Andrew Or 
Committed: Fri May 27 11:09:33 2016 -0700

--
 .../spark/ml/clustering/GaussianMixture.scala   |  7 ++--
 .../spark/ml/feature/SQLTransformer.scala   | 14 
 .../org/apache/spark/ml/feature/Word2Vec.scala  | 16 -
 .../spark/mllib/api/python/PythonMLLibAPI.scala | 23 +++--
 .../classification/LogisticRegression.scala | 19 +--
 .../spark/mllib/classification/NaiveBayes.scala | 24 ++---
 .../impl/GLMClassificationModel.scala   | 18 +-
 .../mllib/clustering/BisectingKMeansModel.scala | 12 +++
 .../mllib/clustering/GaussianMixtureModel.scala | 16 -
 .../spark/mllib/clustering/KMeansModel.scala| 13 ---
 .../spark/mllib/clustering/LDAModel.scala   | 36 +---
 .../clustering/PowerIterationClustering.scala   | 12 +++
 .../spark/mllib/feature/ChiSqSelector.scala | 13 ---
 .../apache/spark/mllib/feature/Word2Vec.scala   | 13 +++
 .../org/apache/spark/mllib/fpm/FPGrowth.scala   | 10 +++---
 .../org/apache/spark/mllib/fpm/PrefixSpan.scala | 10 +++---
 .../MatrixFactorizationModel.scala  | 12 +++
 .../mllib/regression/IsotonicRegression.scala   | 12 +++
 .../regression/impl/GLMRegressionModel.scala| 18 +-
 .../mllib/tree/model/DecisionTreeModel.scala| 20 +--
 .../mllib/tree/model/treeEnsembleModels.scala   | 17 +
 .../ml/feature/QuantileDiscretizerSuite.scala   | 14 
 .../mllib/util/MLlibTestSparkContext.scala  |  6 ++--
 23 files changed, 160 insertions(+), 195 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a355edee/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 88b6b27..773e50e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -20,7 +20,6 @@ package org.apache.spark.ml.clustering
 import breeze.linalg.{DenseVector => BDV}
 import org.apache.hadoop.fs.Path
 
-import org.apache.spark.SparkContext
 import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.impl.Utils.EPSILON
@@ -33,7 +32,7 @@ import org.apache.spark.mllib.clustering.{GaussianMixture => 
MLlibGM}
 import org.apache.spark.mllib.linalg.{Matrices => OldMatrices, Matrix => 
OldMatrix,
   Vector => OldVector, Vectors => OldVectors, VectorUDT => OldVectorUDT}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext}
+import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
 import org.apache.spark.sql.functions.{col, udf}
 import org.apache.spark.sql.types.{IntegerType, StructType}
 
@@ -134,9 +133,7 @@ class GaussianMixtureModel private[ml] (
 val modelGaussians = gaussians.map { gaussian =>
   (OldVectors.fromML(gaussian.mean), OldMatrices.fromML(gaussian.cov))
 }
-val sc = SparkContext.getOrCreate()
-val sqlContext = SQLContext.getOrCreate(sc)
-sqlContext.createDataFrame(modelGaussians).toDF("mean", "cov")
+
SparkSession.builder().getOrCreate().createDataFrame(modelGaussians).toDF("mean",
 "cov")
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/spark/blob/a355edee/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
--

spark git commit: [SPARK-15603][MLLIB] Replace SQLContext with SparkSession in ML/MLLib

2016-05-27 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/master c17272902 -> d24e25157


[SPARK-15603][MLLIB] Replace SQLContext with SparkSession in ML/MLLib

## What changes were proposed in this pull request?

This PR replaces all deprecated `SQLContext` occurrences with `SparkSession` in 
the `ML/MLLib` modules, except for the following two classes, which use 
`SQLContext` in their function signatures.
- ReadWrite.scala
- TreeModels.scala

## How was this patch tested?

Pass the existing Jenkins tests.

Author: Dongjoon Hyun 

Closes #13352 from dongjoon-hyun/SPARK-15603.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d24e2515
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d24e2515
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d24e2515

Branch: refs/heads/master
Commit: d24e251572d39a453293cabfe14e4aed25a55208
Parents: c172729
Author: Dongjoon Hyun 
Authored: Fri May 27 11:09:15 2016 -0700
Committer: Andrew Or 
Committed: Fri May 27 11:09:15 2016 -0700

--
 .../spark/ml/clustering/GaussianMixture.scala   |  7 ++--
 .../spark/ml/feature/SQLTransformer.scala   | 14 
 .../org/apache/spark/ml/feature/Word2Vec.scala  | 16 -
 .../spark/mllib/api/python/PythonMLLibAPI.scala | 23 +++--
 .../classification/LogisticRegression.scala | 19 +--
 .../spark/mllib/classification/NaiveBayes.scala | 24 ++---
 .../impl/GLMClassificationModel.scala   | 18 +-
 .../mllib/clustering/BisectingKMeansModel.scala | 12 +++
 .../mllib/clustering/GaussianMixtureModel.scala | 16 -
 .../spark/mllib/clustering/KMeansModel.scala| 13 ---
 .../spark/mllib/clustering/LDAModel.scala   | 36 +---
 .../clustering/PowerIterationClustering.scala   | 12 +++
 .../spark/mllib/feature/ChiSqSelector.scala | 13 ---
 .../apache/spark/mllib/feature/Word2Vec.scala   | 13 +++
 .../org/apache/spark/mllib/fpm/FPGrowth.scala   | 10 +++---
 .../org/apache/spark/mllib/fpm/PrefixSpan.scala | 10 +++---
 .../MatrixFactorizationModel.scala  | 12 +++
 .../mllib/regression/IsotonicRegression.scala   | 12 +++
 .../regression/impl/GLMRegressionModel.scala| 18 +-
 .../mllib/tree/model/DecisionTreeModel.scala| 20 +--
 .../mllib/tree/model/treeEnsembleModels.scala   | 17 +
 .../ml/feature/QuantileDiscretizerSuite.scala   | 14 
 .../mllib/util/MLlibTestSparkContext.scala  |  6 ++--
 23 files changed, 160 insertions(+), 195 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d24e2515/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 88b6b27..773e50e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -20,7 +20,6 @@ package org.apache.spark.ml.clustering
 import breeze.linalg.{DenseVector => BDV}
 import org.apache.hadoop.fs.Path
 
-import org.apache.spark.SparkContext
 import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.impl.Utils.EPSILON
@@ -33,7 +32,7 @@ import org.apache.spark.mllib.clustering.{GaussianMixture => 
MLlibGM}
 import org.apache.spark.mllib.linalg.{Matrices => OldMatrices, Matrix => 
OldMatrix,
   Vector => OldVector, Vectors => OldVectors, VectorUDT => OldVectorUDT}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext}
+import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
 import org.apache.spark.sql.functions.{col, udf}
 import org.apache.spark.sql.types.{IntegerType, StructType}
 
@@ -134,9 +133,7 @@ class GaussianMixtureModel private[ml] (
 val modelGaussians = gaussians.map { gaussian =>
   (OldVectors.fromML(gaussian.mean), OldMatrices.fromML(gaussian.cov))
 }
-val sc = SparkContext.getOrCreate()
-val sqlContext = SQLContext.getOrCreate(sc)
-sqlContext.createDataFrame(modelGaussians).toDF("mean", "cov")
+
SparkSession.builder().getOrCreate().createDataFrame(modelGaussians).toDF("mean",
 "cov")
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/spark/blob/d24e2515/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala 

spark git commit: [SPARK-15565][SQL] Add the File Scheme to the Default Value of WAREHOUSE_PATH

2016-05-27 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 b430aa98c -> 2cb84dd23


[SPARK-15565][SQL] Add the File Scheme to the Default Value of WAREHOUSE_PATH

## What changes were proposed in this pull request?
The default value of `spark.sql.warehouse.dir` is 
`System.getProperty("user.dir")/spark-warehouse`. Since 
`System.getProperty("user.dir")` is a local directory, we should explicitly set 
the scheme to the local filesystem.
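
A short sketch, assuming a local master; the object name and the `/tmp` path 
are illustrative, and the explicit `.config(...)` override is optional now that 
the default itself carries the `file:` scheme:

```scala
import org.apache.spark.sql.SparkSession

object WarehousePathDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("warehouse-path-demo")
      // Illustrative override; leaving it unset yields a default that already
      // starts with "file:" after this patch.
      .config("spark.sql.warehouse.dir", "file:/tmp/spark-warehouse")
      .getOrCreate()

    // Prints the effective warehouse location, scheme included.
    println(spark.conf.get("spark.sql.warehouse.dir"))
    spark.stop()
  }
}
```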

cc yhuai

## How was this patch tested?
Added two test cases

Author: gatorsmile 

Closes #13348 from gatorsmile/addSchemeToDefaultWarehousePath.

(cherry picked from commit c17272902c95290beca274ee6316a8a98fd7a725)
Signed-off-by: Yin Huai 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2cb84dd2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2cb84dd2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2cb84dd2

Branch: refs/heads/branch-2.0
Commit: 2cb84dd2356e782b9e606cd126057726fcf6f228
Parents: b430aa9
Author: gatorsmile 
Authored: Fri May 27 09:54:31 2016 -0700
Committer: Yin Huai 
Committed: Fri May 27 09:54:43 2016 -0700

--
 .../org/apache/spark/sql/internal/SQLConf.scala |  2 +-
 .../spark/sql/execution/command/DDLSuite.scala  | 25 
 .../spark/sql/internal/SQLConfSuite.scala   | 12 ++
 3 files changed, 38 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2cb84dd2/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 4efefda..d1db0dd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -55,7 +55,7 @@ object SQLConf {
   val WAREHOUSE_PATH = SQLConfigBuilder("spark.sql.warehouse.dir")
 .doc("The default location for managed databases and tables.")
 .stringConf
-.createWithDefault("${system:user.dir}/spark-warehouse")
+.createWithDefault("file:${system:user.dir}/spark-warehouse")
 
   val OPTIMIZER_MAX_ITERATIONS = 
SQLConfigBuilder("spark.sql.optimizer.maxIterations")
 .internal()

http://git-wip-us.apache.org/repos/asf/spark/blob/2cb84dd2/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
--
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
index e32521a..e975756 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
@@ -171,6 +171,31 @@ class DDLSuite extends QueryTest with SharedSQLContext 
with BeforeAndAfterEach {
 }
   }
 
+  test("Create Database using Default Warehouse Path") {
+withSQLConf(SQLConf.WAREHOUSE_PATH.key -> "") {
+  // Will use the default location if and only if we unset the conf
+  spark.conf.unset(SQLConf.WAREHOUSE_PATH.key)
+  val catalog = spark.sessionState.catalog
+  val dbName = "db1"
+  try {
+sql(s"CREATE DATABASE $dbName")
+val db1 = catalog.getDatabaseMetadata(dbName)
+val expectedLocation =
+  "file:" + appendTrailingSlash(System.getProperty("user.dir")) +
+s"spark-warehouse/$dbName.db"
+assert(db1 == CatalogDatabase(
+  dbName,
+  "",
+  expectedLocation,
+  Map.empty))
+sql(s"DROP DATABASE $dbName CASCADE")
+assert(!catalog.databaseExists(dbName))
+  } finally {
+catalog.reset()
+  }
+}
+  }
+
   test("Create/Drop Database - location") {
 val catalog = spark.sessionState.catalog
 val databaseNames = Seq("db1", "`database`")

http://git-wip-us.apache.org/repos/asf/spark/blob/2cb84dd2/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
--
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
index ad5365a..3d4fc75 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
@@ -207,4 +207,16 @@ class SQLConfSuite extends QueryTest with SharedSQLContext 
{
 }
   }
 
+  test("default value of WAREHOUSE_PATH") {
+val 

spark git commit: [SPARK-15565][SQL] Add the File Scheme to the Default Value of WAREHOUSE_PATH

2016-05-27 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/master 6f95c6c03 -> c17272902


[SPARK-15565][SQL] Add the File Scheme to the Default Value of WAREHOUSE_PATH

## What changes were proposed in this pull request?
The default value of `spark.sql.warehouse.dir` is 
`System.getProperty("user.dir")/spark-warehouse`. Since 
`System.getProperty("user.dir")` is a local directory, we should explicitly set 
the scheme to the local filesystem.
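
A hedged sketch of how the new default surfaces, assuming a local master and no 
explicit warehouse setting; `demo_db` and the object name are illustrative:

```scala
import org.apache.spark.sql.SparkSession

object DefaultWarehouseLocationDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("default-warehouse-location")
      .getOrCreate()

    // With spark.sql.warehouse.dir left at its default, the managed database is
    // created under the warehouse path, which now carries the "file:" scheme.
    spark.sql("CREATE DATABASE IF NOT EXISTS demo_db")
    spark.catalog.listDatabases()
      .filter(_.name == "demo_db")
      .collect()
      .foreach(db => println(db.locationUri))

    spark.sql("DROP DATABASE demo_db CASCADE")
    spark.stop()
  }
}
```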

cc yhuai

## How was this patch tested?
Added two test cases

Author: gatorsmile 

Closes #13348 from gatorsmile/addSchemeToDefaultWarehousePath.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c1727290
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c1727290
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c1727290

Branch: refs/heads/master
Commit: c17272902c95290beca274ee6316a8a98fd7a725
Parents: 6f95c6c
Author: gatorsmile 
Authored: Fri May 27 09:54:31 2016 -0700
Committer: Yin Huai 
Committed: Fri May 27 09:54:31 2016 -0700

--
 .../org/apache/spark/sql/internal/SQLConf.scala |  2 +-
 .../spark/sql/execution/command/DDLSuite.scala  | 25 
 .../spark/sql/internal/SQLConfSuite.scala   | 12 ++
 3 files changed, 38 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c1727290/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 4efefda..d1db0dd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -55,7 +55,7 @@ object SQLConf {
   val WAREHOUSE_PATH = SQLConfigBuilder("spark.sql.warehouse.dir")
 .doc("The default location for managed databases and tables.")
 .stringConf
-.createWithDefault("${system:user.dir}/spark-warehouse")
+.createWithDefault("file:${system:user.dir}/spark-warehouse")
 
   val OPTIMIZER_MAX_ITERATIONS = 
SQLConfigBuilder("spark.sql.optimizer.maxIterations")
 .internal()

http://git-wip-us.apache.org/repos/asf/spark/blob/c1727290/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
--
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
index e32521a..e975756 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
@@ -171,6 +171,31 @@ class DDLSuite extends QueryTest with SharedSQLContext 
with BeforeAndAfterEach {
 }
   }
 
+  test("Create Database using Default Warehouse Path") {
+withSQLConf(SQLConf.WAREHOUSE_PATH.key -> "") {
+  // Will use the default location if and only if we unset the conf
+  spark.conf.unset(SQLConf.WAREHOUSE_PATH.key)
+  val catalog = spark.sessionState.catalog
+  val dbName = "db1"
+  try {
+sql(s"CREATE DATABASE $dbName")
+val db1 = catalog.getDatabaseMetadata(dbName)
+val expectedLocation =
+  "file:" + appendTrailingSlash(System.getProperty("user.dir")) +
+s"spark-warehouse/$dbName.db"
+assert(db1 == CatalogDatabase(
+  dbName,
+  "",
+  expectedLocation,
+  Map.empty))
+sql(s"DROP DATABASE $dbName CASCADE")
+assert(!catalog.databaseExists(dbName))
+  } finally {
+catalog.reset()
+  }
+}
+  }
+
   test("Create/Drop Database - location") {
 val catalog = spark.sessionState.catalog
 val databaseNames = Seq("db1", "`database`")

http://git-wip-us.apache.org/repos/asf/spark/blob/c1727290/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
--
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
index ad5365a..3d4fc75 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
@@ -207,4 +207,16 @@ class SQLConfSuite extends QueryTest with SharedSQLContext 
{
 }
   }
 
+  test("default value of WAREHOUSE_PATH") {
+val original = spark.conf.get(SQLConf.WAREHOUSE_PATH)
+try {
+  // to get the default value, always unset it
+  

spark git commit: [SPARK-15431][SQL][HOTFIX] ignore 'list' command testcase from CliSuite for now

2016-05-27 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 b3845fede -> b430aa98c


[SPARK-15431][SQL][HOTFIX] ignore 'list' command testcase from CliSuite for now

## What changes were proposed in this pull request?
The test cases for the `list` command added to `CliSuite` by PR #13212 cannot 
run in some Jenkins jobs after being merged.
However, the following Jenkins jobs do pass:
https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.6/
https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.4/
https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.2/
https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7/
https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.3/

Other jobs fail on this test case, and the failures occur at slightly different 
checkpoints from job to job, so it seems that `CliSuite`'s output capture is 
flaky when checking the expected output of `list` commands. Test cases in 
`HiveQuerySuite` and `SparkContextSuite` already cover these code paths, so I am 
ignoring the two test cases added by PR #13212.
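
A minimal sketch of the mechanism, with an illustrative suite and test name: 
switching ScalaTest's `test` to `ignore` keeps the body compiling but skips it 
at runtime, so the flaky checks can be re-enabled later by renaming them back.

```scala
import org.scalatest.FunSuite

class FlakyCheckSuite extends FunSuite {
  // Skipped at runtime; rename `ignore` back to `test` to re-enable.
  ignore("list jars prints the added jar") {
    assert(Seq("TestUDTF.jar").exists(_.endsWith(".jar")))
  }
}
```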

Author: Xin Wu 

Closes #13276 from xwu0226/SPARK-15431-clisuite.

(cherry picked from commit 6f95c6c030db0057de213733c2bd3453463bc6f2)
Signed-off-by: Yin Huai 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b430aa98
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b430aa98
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b430aa98

Branch: refs/heads/branch-2.0
Commit: b430aa98caa16978cd53dd354423cac45410c284
Parents: b3845fe
Author: Xin Wu 
Authored: Fri May 27 08:54:14 2016 -0700
Committer: Yin Huai 
Committed: Fri May 27 08:54:54 2016 -0700

--
 .../scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b430aa98/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
--
diff --git 
a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
 
b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
index 2bf0221..656fe97 100644
--- 
a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
+++ 
b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
@@ -239,7 +239,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll 
with Logging {
   "" -> "This is a test for Spark-11624")
   }
 
-  test("list jars") {
+  ignore("list jars") {
 val jarFile = 
Thread.currentThread().getContextClassLoader.getResource("TestUDTF.jar")
 runCliWithin(2.minute)(
   s"ADD JAR $jarFile" -> "",
@@ -248,7 +248,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll 
with Logging {
 )
   }
 
-  test("list files") {
+  ignore("list files") {
 val dataFilePath = Thread.currentThread().getContextClassLoader
   .getResource("data/files/small_kv.txt")
 runCliWithin(2.minute)(





spark git commit: [SPARK-15431][SQL][HOTFIX] ignore 'list' command testcase from CliSuite for now

2016-05-27 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/master d5911d117 -> 6f95c6c03


[SPARK-15431][SQL][HOTFIX] ignore 'list' command testcase from CliSuite for now

## What changes were proposed in this pull request?
The test cases for the `list` command added to `CliSuite` by PR #13212 cannot 
run in some Jenkins jobs after being merged.
However, the following Jenkins jobs do pass:
https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.6/
https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.4/
https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.2/
https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7/
https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.3/

Other jobs fail on this test case, and the failures occur at slightly different 
checkpoints from job to job, so it seems that `CliSuite`'s output capture is 
flaky when checking the expected output of `list` commands. Test cases in 
`HiveQuerySuite` and `SparkContextSuite` already cover these code paths, so I am 
ignoring the two test cases added by PR #13212.

Author: Xin Wu 

Closes #13276 from xwu0226/SPARK-15431-clisuite.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f95c6c0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f95c6c0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f95c6c0

Branch: refs/heads/master
Commit: 6f95c6c030db0057de213733c2bd3453463bc6f2
Parents: d5911d1
Author: Xin Wu 
Authored: Fri May 27 08:54:14 2016 -0700
Committer: Yin Huai 
Committed: Fri May 27 08:54:14 2016 -0700

--
 .../scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6f95c6c0/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
--
diff --git 
a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
 
b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
index 2bf0221..656fe97 100644
--- 
a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
+++ 
b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
@@ -239,7 +239,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll 
with Logging {
   "" -> "This is a test for Spark-11624")
   }
 
-  test("list jars") {
+  ignore("list jars") {
 val jarFile = 
Thread.currentThread().getContextClassLoader.getResource("TestUDTF.jar")
 runCliWithin(2.minute)(
   s"ADD JAR $jarFile" -> "",
@@ -248,7 +248,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll 
with Logging {
 )
   }
 
-  test("list files") {
+  ignore("list files") {
 val dataFilePath = Thread.currentThread().getContextClassLoader
   .getResource("data/files/small_kv.txt")
 runCliWithin(2.minute)(

